teddy: make u8x16 and u8x32 have vector call ABI

Before this commit, u8x16 and u8x32 were repr(Rust) unions. This introduced unspecified behavior because the field offsets of repr(Rust) unions are not guaranteed to be at offset 0, so that field access was potentially UB. This commit fixes that. The unions were also generating a lot of unnecessary memory operations. This commit fixes that as well. The issue is that unions have an Aggregate call ABI, which is the same as the call ABI of arrays. That is, they are passed around by memory, and not in Vector registers. This is good, if most of the time one operates on them as arrays. This was, however, not the case. Most of the operations on these unions are using SIMD instructions. This means that the union needs to be copied into a SIMD register, operated on, and then spilled back to the stack, on every single operation. That's unnecessary, although apparently LLVM was able to optimize all the unnecessary memory operations away and leave these always in registers. This commit fixes this issue as well, by making the u8x16 and u8x32 repr(transparent) newtypes over the architecture specific vector types, giving them the Vector ABI. The vectors are then copied to the stack only when necessary, and as little as possible. This is done using mem::transmute, removing the need for unions altogether (fixing #588 by not having to worry about union layout at all). To make it clear when the vectors are spilled into the stack, the vector::replace(index, value) and vector::extract(index) APIs have been removed, and instead, only a vector::bytes(self) and a vector::from_bytes(&mut self, [u8; N]) APIs are provided instead. This prevents spilling the vectors back and forth onto the stack every time an index needs to be modified, by using vector::bytes to spill the vector to the stack once, making all the random-access modifications in memory, and then using vector::from_bytes only once to move the memory back into a SIMD register. Closes #588, Closes #589
2026-07-01 21:33:59 -04:00 · 2019-07-01 09:52:39 +02:00
parent 95ca8ec2ab
commit 2b501aaad7
4 changed files with 64 additions and 47 deletions
@@ -286,6 +286,7 @@ impl Teddy {
        res: u8x32,
        mut bitfield: u32,
    ) -> Option<Match> {
+        let patterns = res.bytes();
        while bitfield != 0 {
            // The next offset, relative to pos, where some fingerprint
            // matched.
@@ -297,7 +298,7 @@ impl Teddy {

            // The bitfield telling us which patterns had fingerprints that
            // match at this starting position.
-            let mut patterns = res.extract(byte_pos);
+            let mut patterns = patterns[byte_pos];
            while patterns != 0 {
                let bucket = patterns.trailing_zeros() as usize;
                patterns &= !(1 << bucket);
@@ -462,12 +463,20 @@ impl Mask {
        let byte_lo = (byte & 0xF) as usize;
        let byte_hi = (byte >> 4) as usize;

-        let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
-        self.lo.replace(byte_lo, lo);
-        self.lo.replace(byte_lo + 16, lo);
+        {
+            let mut lo_bytes = self.lo.bytes();
+            let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8);
+            lo_bytes[byte_lo] = lo;
+            lo_bytes[byte_lo + 16] = lo;
+            self.lo.replace_bytes(lo_bytes);
+        }

-        let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
-        self.hi.replace(byte_hi, hi);
-        self.hi.replace(byte_hi + 16, hi);
+        {
+            let mut hi_bytes = self.hi.bytes();
+            let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8);
+            hi_bytes[byte_hi] = hi;
+            hi_bytes[byte_hi + 16] = hi;
+            self.hi.replace_bytes(hi_bytes);
+        }
    }
 }
@@ -596,6 +596,7 @@ impl Teddy {
        res: u8x16,
        mut bitfield: u32,
    ) -> Option<Match> {
+        let patterns = res.bytes();
        while bitfield != 0 {
            // The next offset, relative to pos, where some fingerprint
            // matched.
@@ -607,7 +608,7 @@ impl Teddy {

            // The bitfield telling us which patterns had fingerprints that
            // match at this starting position.
-            let mut patterns = res.extract(byte_pos);
+            let mut patterns = patterns[byte_pos];
            while patterns != 0 {
                let bucket = patterns.trailing_zeros() as usize;
                patterns &= !(1 << bucket);
@@ -772,10 +773,17 @@ impl Mask {
        let byte_lo = (byte & 0xF) as usize;
        let byte_hi = (byte >> 4) as usize;

-        let lo = self.lo.extract(byte_lo);
-        self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo);
-
-        let hi = self.hi.extract(byte_hi);
-        self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi);
+        {
+            let mut lo_bytes = self.lo.bytes();
+            let lo = lo_bytes[byte_lo];
+            lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo;
+            self.lo.replace_bytes(lo_bytes);
+        }
+        {
+            let mut hi_bytes = self.hi.bytes();
+            let hi = hi_bytes[byte_hi];
+            hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi;
+            self.hi.replace_bytes(hi_bytes);
+        }
    }
 }
@@ -2,6 +2,7 @@

 use std::arch::x86_64::*;
 use std::fmt;
+use std::mem;

 #[derive(Clone, Copy, Debug)]
 pub struct AVX2VectorBuilder(());
@@ -56,9 +57,9 @@ impl AVX2VectorBuilder {

 #[derive(Clone, Copy)]
 #[allow(non_camel_case_types)]
-pub union u8x32 {
-    vector: __m256i,
-    bytes: [u8; 32],
+#[repr(transparent)]
+pub struct u8x32 {
+    vector: __m256i
 }

 impl u8x32 {
@@ -92,18 +93,6 @@ impl u8x32 {
        u8x32 { vector: _mm256_load_si256(p) }
    }

-    #[inline]
-    pub fn extract(self, i: usize) -> u8 {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes[i] }
-    }
-
-    #[inline]
-    pub fn replace(&mut self, i: usize, byte: u8) {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes[i] = byte; }
-    }
-
    #[inline]
    pub fn shuffle(self, indices: u8x32) -> u8x32 {
        // Safe because we know AVX2 is enabled.
@@ -177,11 +166,22 @@ impl u8x32 {
            u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
        }
    }
+
+    #[inline]
+    pub fn bytes(self) -> [u8; 32] {
+        // Safe because __m256i and [u8; 32] are layout compatible
+        unsafe { mem::transmute(self) }
+    }
+
+    #[inline]
+    pub fn replace_bytes(&mut self, value: [u8; 32]) {
+        // Safe because __m256i and [u8; 32] are layout compatible
+        self.vector = unsafe { mem::transmute(value) };
+    }
 }

 impl fmt::Debug for u8x32 {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes.fmt(f) }
+        self.bytes().fmt(f)
    }
 }
@@ -2,6 +2,7 @@

 use std::arch::x86_64::*;
 use std::fmt;
+use std::mem;

 /// A builder for SSSE3 empowered vectors.
 ///
@@ -77,9 +78,9 @@ impl SSSE3VectorBuilder {
 /// inlined, otherwise you probably have a performance bug.
 #[derive(Clone, Copy)]
 #[allow(non_camel_case_types)]
-pub union u8x16 {
-    vector: __m128i,
-    bytes: [u8; 16],
+#[repr(transparent)]
+pub struct u8x16 {
+    vector: __m128i
 }

 impl u8x16 {
@@ -113,18 +114,6 @@ impl u8x16 {
        u8x16 { vector: v }
    }

-    #[inline]
-    pub fn extract(self, i: usize) -> u8 {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes[i] }
-    }
-
-    #[inline]
-    pub fn replace(&mut self, i: usize, byte: u8) {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes[i] = byte; }
-    }
-
    #[inline]
    pub fn shuffle(self, indices: u8x16) -> u8x16 {
        // Safe because we know SSSE3 is enabled.
@@ -182,11 +171,22 @@ impl u8x16 {
            u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
        }
    }
+
+    #[inline]
+    pub fn bytes(self) -> [u8; 16] {
+        // Safe because __m128i and [u8; 16] are layout compatible
+        unsafe { mem::transmute(self) }
+    }
+
+    #[inline]
+    pub fn replace_bytes(&mut self, value: [u8; 16]) {
+        // Safe because __m128i and [u8; 16] are layout compatible
+        self.vector = unsafe { mem::transmute(value) };
+    }
 }

 impl fmt::Debug for u8x16 {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        // Safe because `bytes` is always accessible.
-        unsafe { self.bytes.fmt(f) }
+        self.bytes().fmt(f)
    }
 }