Use little-endian mask during BLAKE2 loads

2025-02-11 15:55:19 +00:00 · 2020-06-28 02:34:52 -04:00 · 2020-06-28 02:34:52 -04:00 · 25cdab6d32
commit 25cdab6d32
parent dc2b336ace
3 changed files with 44 additions and 46 deletions
--- a/blake2b_simd.cpp
+++ b/blake2b_simd.cpp
@ -763,13 +763,13 @@ inline uint64x2_p VecLoad64(const void* p)
    return (uint64x2_p)VecLoad((const byte*)p);
 }

-inline uint64x2_p VecLoad64LE(const void* p)
+inline uint64x2_p VecLoad64LE(const void* p, const uint8x16_p le_mask)
 {
 #if defined(CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = VecLoad64(p);
-    return (uint64x2_p)VecPermute(v, v, m);
+    return (uint64x2_p)VecPermute(v, v, le_mask);
 #else
+    CRYPTOPP_UNUSED(le_mask);
    return (uint64x2_p)VecLoad64(p);
 #endif
 }
@ -779,12 +779,12 @@ inline void VecStore64(void* p, const uint64x2_p x)
    VecStore((uint8x16_p)x, (byte*)p);
 }

-inline void VecStore64LE(void* p, const uint64x2_p x)
+inline void VecStore64LE(void* p, const uint64x2_p x, const uint8x16_p le_mask)
 {
 #if defined(CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-    VecStore64(p, VecPermute(x, x, m));
+    VecStore64(p, VecPermute(x, x, le_mask));
 #else
+    CRYPTOPP_UNUSED(le_mask);
    VecStore64(p, x);
 #endif
 }
@ -1155,22 +1155,24 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
      BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

-    const uint64x2_p m0 = VecLoad64LE(input +  00);
-    const uint64x2_p m1 = VecLoad64LE(input +  16);
-    const uint64x2_p m2 = VecLoad64LE(input +  32);
-    const uint64x2_p m3 = VecLoad64LE(input +  48);
-    const uint64x2_p m4 = VecLoad64LE(input +  64);
-    const uint64x2_p m5 = VecLoad64LE(input +  80);
-    const uint64x2_p m6 = VecLoad64LE(input +  96);
-    const uint64x2_p m7 = VecLoad64LE(input + 112);
+    const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
+
+    const uint64x2_p m0 = VecLoad64LE(input +  00, le_mask);
+    const uint64x2_p m1 = VecLoad64LE(input +  16, le_mask);
+    const uint64x2_p m2 = VecLoad64LE(input +  32, le_mask);
+    const uint64x2_p m3 = VecLoad64LE(input +  48, le_mask);
+    const uint64x2_p m4 = VecLoad64LE(input +  64, le_mask);
+    const uint64x2_p m5 = VecLoad64LE(input +  80, le_mask);
+    const uint64x2_p m6 = VecLoad64LE(input +  96, le_mask);
+    const uint64x2_p m7 = VecLoad64LE(input + 112, le_mask);

    uint64x2_p row1l, row1h, row2l, row2h;
    uint64x2_p row3l, row3h, row4l, row4h;

-    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
-    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
-    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
-    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);
+    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0, le_mask);
+    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2, le_mask);
+    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4, le_mask);
+    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6, le_mask);

    row3l = VecLoad64(BLAKE2B_IV+0);
    row3h = VecLoad64(BLAKE2B_IV+2);
@ -1190,10 +1192,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

-    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
-    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
-    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
-    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
+    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)), le_mask);
+    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)), le_mask);
+    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)), le_mask);
+    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)), le_mask);
 }
 #endif  // CRYPTOPP_POWER8_AVAILABLE

--- a/blake2s_simd.cpp
+++ b/blake2s_simd.cpp
@ -706,13 +706,13 @@ inline uint32x4_p VecLoad32(const T* p)
 }

 template <class T>
-inline uint32x4_p VecLoad32LE(const T* p)
+inline uint32x4_p VecLoad32LE(const T* p, const uint8x16_p le_mask)
 {
-#if __BIG_ENDIAN__
-    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint32x4_p v = VecLoad(p);
-    return VecPermute(v, v, m);
+    return VecPermute(v, v, le_mask);
 #else
+    CRYPTOPP_UNUSED(le_mask);
    return VecLoad(p);
 #endif
 }
@ -724,12 +724,13 @@ inline void VecStore32(T* p, const uint32x4_p x)
 }

 template <class T>
-inline void VecStore32LE(T* p, const uint32x4_p x)
+inline void VecStore32LE(T* p, const uint32x4_p x, const uint8x16_p le_mask)
 {
-#if __BIG_ENDIAN__
-    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-    VecStore(VecPermute(x, x, m), p);
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint32x4_p v = VecPermute(x, x, le_mask);
+    VecStore(v, p);
 #else
+    CRYPTOPP_UNUSED(le_mask);
    VecStore(x, p);
 #endif
 }
@ -991,17 +992,19 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
      BLAKE2S_G2(row1,row2,row3,row4,buf4); \
      BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);

+    const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};  // byte-reverse each 32-bit word
+
+    const uint32x4_p  m0 = VecLoad32LE(input +  0, le_mask);
+    const uint32x4_p  m4 = VecLoad32LE(input + 16, le_mask);
+    const uint32x4_p  m8 = VecLoad32LE(input + 32, le_mask);
+    const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
+
    uint32x4_p row1, row2, row3, row4;
    uint32x4_p buf1, buf2, buf3, buf4;
    uint32x4_p  ff0,  ff1;

-    const uint32x4_p  m0 = VecLoad32LE(input +  0);
-    const uint32x4_p  m4 = VecLoad32LE(input + 16);
-    const uint32x4_p  m8 = VecLoad32LE(input + 32);
-    const uint32x4_p m12 = VecLoad32LE(input + 48);
-
-    row1 = ff0 = VecLoad32LE(state.h()+0);
-    row2 = ff1 = VecLoad32LE(state.h()+4);
+    row1 = ff0 = VecLoad32LE(state.h()+0, le_mask);
+    row2 = ff1 = VecLoad32LE(state.h()+4, le_mask);
    row3 = VecLoad32(BLAKE2S_IV+0);
    row4 = VecXor(VecLoad32(BLAKE2S_IV+4), VecLoad32(state.t()+0));

@ -1016,8 +1019,8 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
    BLAKE2S_ROUND(8);
    BLAKE2S_ROUND(9);

-    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
-    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
+    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)), le_mask);
+    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)), le_mask);
 }
 #endif  // CRYPTOPP_ALTIVEC_AVAILABLE

--- a/ppc_simd.h
+++ b/ppc_simd.h
@ -2754,11 +2754,4 @@ NAMESPACE_END
 # pragma GCC diagnostic pop
 #endif

-#undef CONST_V8_CAST
-#undef CONST_V32_CAST
-#undef CONST_V64_CAST
-#undef NCONST_V8_CAST
-#undef NCONST_V32_CAST
-#undef NCONST_V64_CAST
-
 #endif  // CRYPTOPP_PPC_CRYPTO_H