teddy: make u8x16 and u8x32 have vector call ABI

Before this commit, u8x16 and u8x32 were repr(Rust) unions. This
introduced unspecified behavior because the field offsets of repr(Rust)
unions are not guaranteed to be at offset 0, so that field access was
potentially UB. This commit fixes that.

The unions were also generating a lot of unnecessary memory operations.
This commit fixes that as well.

The issue is that unions have an Aggregate call ABI, which is the same
as the call ABI of arrays. That is, they are passed around by memory,
and not in Vector registers.

This is good, if most of the time one operates on them as arrays. This
was, however, not the case. Most of the operations on these unions are
using SIMD instructions. This means that the union needs to be copied
into a SIMD register, operated on, and then spilled back to the stack,
on every single operation. That's unnecessary, although apparently LLVM
was able to optimize all the unnecessary memory operations away and
leave these always in registers.

This commit fixes this issue as well, by making the u8x16 and u8x32
repr(transparent) newtypes over the architecture specific vector types,
giving them the Vector ABI.

The vectors are then copied to the stack only when necessary, and as
little as possible. This is done using mem::transmute, removing the
need for unions altogether (fixing #588 by not having to worry about
union layout at all).

To make it clear when the vectors are spilled into the stack, the
vector::replace(index, value) and vector::extract(index) APIs
have been removed, and instead, only a vector::bytes(self) and a
vector::from_bytes(&mut self, [u8; N]) APIs are provided instead. This
prevents spilling the vectors back and forth onto the stack every time
an index needs to be modified, by using vector::bytes to spill the
vector to the stack once, making all the random-access modifications in
memory, and then using vector::from_bytes only once to move the memory
back into a SIMD register.

Closes #588, Closes #589
This commit is contained in:
gnzlbg
2019-07-01 09:52:39 +02:00
committed by Andrew Gallant
parent 95ca8ec2ab
commit 2b501aaad7
4 changed files with 64 additions and 47 deletions
+16 -7
View File
@@ -286,6 +286,7 @@ impl Teddy {
res: u8x32,
mut bitfield: u32,
) -> Option<Match> {
let patterns = res.bytes();
while bitfield != 0 {
// The next offset, relative to pos, where some fingerprint
// matched.
@@ -297,7 +298,7 @@ impl Teddy {
// The bitfield telling us which patterns had fingerprints that
// match at this starting position.
let mut patterns = res.extract(byte_pos);
let mut patterns = patterns[byte_pos];
while patterns != 0 {
let bucket = patterns.trailing_zeros() as usize;
patterns &= !(1 << bucket);
@@ -462,12 +463,20 @@ impl Mask {
let byte_lo = (byte & 0xF) as usize;
let byte_hi = (byte >> 4) as usize;
let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
self.lo.replace(byte_lo, lo);
self.lo.replace(byte_lo + 16, lo);
{
let mut lo_bytes = self.lo.bytes();
let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8);
lo_bytes[byte_lo] = lo;
lo_bytes[byte_lo + 16] = lo;
self.lo.replace_bytes(lo_bytes);
}
let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
self.hi.replace(byte_hi, hi);
self.hi.replace(byte_hi + 16, hi);
{
let mut hi_bytes = self.hi.bytes();
let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8);
hi_bytes[byte_hi] = hi;
hi_bytes[byte_hi + 16] = hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
+14 -6
View File
@@ -596,6 +596,7 @@ impl Teddy {
res: u8x16,
mut bitfield: u32,
) -> Option<Match> {
let patterns = res.bytes();
while bitfield != 0 {
// The next offset, relative to pos, where some fingerprint
// matched.
@@ -607,7 +608,7 @@ impl Teddy {
// The bitfield telling us which patterns had fingerprints that
// match at this starting position.
let mut patterns = res.extract(byte_pos);
let mut patterns = patterns[byte_pos];
while patterns != 0 {
let bucket = patterns.trailing_zeros() as usize;
patterns &= !(1 << bucket);
@@ -772,10 +773,17 @@ impl Mask {
let byte_lo = (byte & 0xF) as usize;
let byte_hi = (byte >> 4) as usize;
let lo = self.lo.extract(byte_lo);
self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo);
let hi = self.hi.extract(byte_hi);
self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi);
{
let mut lo_bytes = self.lo.bytes();
let lo = lo_bytes[byte_lo];
lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo;
self.lo.replace_bytes(lo_bytes);
}
{
let mut hi_bytes = self.hi.bytes();
let hi = hi_bytes[byte_hi];
hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
+17 -17
View File
@@ -2,6 +2,7 @@
use std::arch::x86_64::*;
use std::fmt;
use std::mem;
#[derive(Clone, Copy, Debug)]
pub struct AVX2VectorBuilder(());
@@ -56,9 +57,9 @@ impl AVX2VectorBuilder {
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x32 {
vector: __m256i,
bytes: [u8; 32],
#[repr(transparent)]
pub struct u8x32 {
vector: __m256i
}
impl u8x32 {
@@ -92,18 +93,6 @@ impl u8x32 {
u8x32 { vector: _mm256_load_si256(p) }
}
#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}
#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
}
#[inline]
pub fn shuffle(self, indices: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
@@ -177,11 +166,22 @@ impl u8x32 {
u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
}
}
#[inline]
pub fn bytes(self) -> [u8; 32] {
// Safe because __m256i and [u8; 32] are layout compatible
unsafe { mem::transmute(self) }
}
#[inline]
pub fn replace_bytes(&mut self, value: [u8; 32]) {
// Safe because __m256i and [u8; 32] are layout compatible
self.vector = unsafe { mem::transmute(value) };
}
}
impl fmt::Debug for u8x32 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
+17 -17
View File
@@ -2,6 +2,7 @@
use std::arch::x86_64::*;
use std::fmt;
use std::mem;
/// A builder for SSSE3 empowered vectors.
///
@@ -77,9 +78,9 @@ impl SSSE3VectorBuilder {
/// inlined, otherwise you probably have a performance bug.
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x16 {
vector: __m128i,
bytes: [u8; 16],
#[repr(transparent)]
pub struct u8x16 {
vector: __m128i
}
impl u8x16 {
@@ -113,18 +114,6 @@ impl u8x16 {
u8x16 { vector: v }
}
#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}
#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
}
#[inline]
pub fn shuffle(self, indices: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
@@ -182,11 +171,22 @@ impl u8x16 {
u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
}
}
#[inline]
pub fn bytes(self) -> [u8; 16] {
// Safe because __m128i and [u8; 16] are layout compatible
unsafe { mem::transmute(self) }
}
#[inline]
pub fn replace_bytes(&mut self, value: [u8; 16]) {
// Safe because __m128i and [u8; 16] are layout compatible
self.vector = unsafe { mem::transmute(value) };
}
}
impl fmt::Debug for u8x16 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}