Fix disjoint t[] and f[] when using SIMD implementations

2025-02-09 22:52:36 +00:00 · 2018-11-03 10:49:22 -04:00 · 2018-11-03 10:49:22 -04:00 · bdeaae3ac9
commit bdeaae3ac9
parent 600e2a8be4
4 changed files with 37 additions and 37 deletions
--- a/blake2.cpp
+++ b/blake2.cpp
@ -455,12 +455,12 @@ void BLAKE2s::Restart(const BLAKE2s_ParameterBlock& block, const word32 counter[
    }

    State& state = *m_state.data();
-    state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;
+    state.tf[0] = state.tf[1] = 0, state.tf[2] = state.tf[3] = 0, state.length = 0;

    if (counter != NULLPTR)
    {
-        state.t[0] = counter[0];
-        state.t[1] = counter[1];
+        state.tf[0] = counter[0];
+        state.tf[1] = counter[1];
    }

    const word32* iv = BLAKE2S_IV;
@ -486,12 +486,12 @@ void BLAKE2b::Restart(const BLAKE2b_ParameterBlock& block, const word64 counter[
    }

    State& state = *m_state.data();
-    state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;
+    state.tf[0] = state.tf[1] = 0, state.tf[2] = state.tf[3] = 0, state.length = 0;

    if (counter != NULLPTR)
    {
-        state.t[0] = counter[0];
-        state.t[1] = counter[1];
+        state.tf[0] = counter[0];
+        state.tf[1] = counter[1];
    }

    const word64* iv = BLAKE2B_IV;
@ -584,11 +584,11 @@ void BLAKE2s::TruncatedFinal(byte *hash, size_t size)

    // Set last block unconditionally
    State& state = *m_state.data();
-    state.f[0] = ~static_cast<word32>(0);
+    state.tf[2] = ~static_cast<word32>(0);

    // Set last node if tree mode
    if (m_treeMode)
-        state.f[1] = ~static_cast<word32>(0);
+        state.tf[3] = ~static_cast<word32>(0);

    // Increment counter for tail bytes only
    IncrementCounter(state.length);
@ -609,11 +609,11 @@ void BLAKE2b::TruncatedFinal(byte *hash, size_t size)

    // Set last block unconditionally
    State& state = *m_state.data();
-    state.f[0] = ~static_cast<word64>(0);
+    state.tf[2] = ~static_cast<word64>(0);

    // Set last node if tree mode
    if (m_treeMode)
-        state.f[1] = ~static_cast<word64>(0);
+        state.tf[3] = ~static_cast<word64>(0);

    // Increment counter for tail bytes only
    IncrementCounter(state.length);
@ -630,15 +630,15 @@ void BLAKE2b::TruncatedFinal(byte *hash, size_t size)
 void BLAKE2s::IncrementCounter(size_t count)
 {
    State& state = *m_state.data();
-    state.t[0] += static_cast<word32>(count);
-    state.t[1] += !!(state.t[0] < count);
+    state.tf[0] += static_cast<word32>(count);
+    state.tf[1] += !!(state.tf[0] < count);
 }

 void BLAKE2b::IncrementCounter(size_t count)
 {
    State& state = *m_state.data();
-    state.t[0] += static_cast<word64>(count);
-    state.t[1] += !!(state.t[0] < count);
+    state.tf[0] += static_cast<word64>(count);
+    state.tf[1] += !!(state.tf[0] < count);
 }

 void BLAKE2s::Compress(const byte *input)
@ -702,10 +702,10 @@ void BLAKE2_Compress64_CXX(const byte* input, BLAKE2b_State& state)
    v[ 9] = iv[1];
    v[10] = iv[2];
    v[11] = iv[3];
-    v[12] = state.t[0] ^ iv[4];
-    v[13] = state.t[1] ^ iv[5];
-    v[14] = state.f[0] ^ iv[6];
-    v[15] = state.f[1] ^ iv[7];
+    v[12] = state.tf[0] ^ iv[4];
+    v[13] = state.tf[1] ^ iv[5];
+    v[14] = state.tf[2] ^ iv[6];
+    v[15] = state.tf[3] ^ iv[7];

    BLAKE2B_ROUND<0>(m, v);
    BLAKE2B_ROUND<1>(m, v);
@ -739,10 +739,10 @@ void BLAKE2_Compress32_CXX(const byte* input, BLAKE2s_State& state)
    v[ 9] = iv[1];
    v[10] = iv[2];
    v[11] = iv[3];
-    v[12] = state.t[0] ^ iv[4];
-    v[13] = state.t[1] ^ iv[5];
-    v[14] = state.f[0] ^ iv[6];
-    v[15] = state.f[1] ^ iv[7];
+    v[12] = state.tf[0] ^ iv[4];
+    v[13] = state.tf[1] ^ iv[5];
+    v[14] = state.tf[2] ^ iv[6];
+    v[15] = state.tf[3] ^ iv[7];

    BLAKE2S_ROUND<0>(m, v);
    BLAKE2S_ROUND<1>(m, v);
--- a/blake2.h
+++ b/blake2.h
@ -134,12 +134,12 @@ struct CRYPTOPP_NO_VTABLE BLAKE2s_State
    {
        // Set all members except scratch buffer[]
        h[0]=h[1]=h[2]=h[3]=h[4]=h[5]=h[6]=h[7] = 0;
-        t[0]=t[1]=f[0]=f[1] = 0;
+        tf[0]=tf[1]=tf[2]=tf[3] = 0;
        length = 0;
    }

-    // SSE2, SSE4 and NEON depend upon t[] and f[] being side-by-side
-    word32 h[8], t[2], f[2];
+    // SSE4, Power7 and NEON depend upon t[] and f[] being side-by-side
+    word32 h[8], tf[4];  // t[2], f[2];
    byte  buffer[BLAKE2s_Info::BLOCKSIZE];
    size_t length;
 };
@ -152,12 +152,12 @@ struct CRYPTOPP_NO_VTABLE BLAKE2b_State
    {
        // Set all members except scratch buffer[]
        h[0]=h[1]=h[2]=h[3]=h[4]=h[5]=h[6]=h[7] = 0;
-        t[0]=t[1]=f[0]=f[1] = 0;
+        tf[0]=tf[1]=tf[2]=tf[3] = 0;
        length = 0;
    }

-    // SSE2, SSE4 and NEON depend upon t[] and f[] being side-by-side
-    word64 h[8], t[2], f[2];
+    // SSE4, Power8 and NEON depend upon t[] and f[] being side-by-side
+    word64 h[8], tf[4];  // t[2], f[2];
    byte  buffer[BLAKE2b_Info::BLOCKSIZE];
    size_t length;
 };
--- a/blake2b-simd.cpp
+++ b/blake2b-simd.cpp
@ -457,8 +457,8 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
    row2h = LOADU( &state.h[6] );
    row3l = LOADU( &BLAKE2B_IV[0] );
    row3h = LOADU( &BLAKE2B_IV[2] );
-    row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.t[0] ) );
-    row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.f[0] ) );
+    row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.tf[0] ) );
+    row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.tf[2] ) );

    BLAKE2B_ROUND( 0 );
    BLAKE2B_ROUND( 1 );
@ -717,8 +717,8 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)

    row3l = vld1q_u64(&BLAKE2B_IV[0]);
    row3h = vld1q_u64(&BLAKE2B_IV[2]);
-    row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.t[0]));
-    row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.f[0]));
+    row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.tf[0]));
+    row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.tf[2]));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
@ -1194,8 +1194,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

    row3l = VectorLoad64(&BLAKE2B_IV[0]);
    row3h = VectorLoad64(&BLAKE2B_IV[2]);
-    row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.t[0]));
-    row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.f[0]));
+    row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0]));
+    row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2]));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
--- a/blake2s-simd.cpp
+++ b/blake2s-simd.cpp
@ -335,7 +335,7 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
    row1 = ff0 = LOADU( &state.h[0] );
    row2 = ff1 = LOADU( &state.h[4] );
    row3 = LOADU( &BLAKE2S_IV[0] );
-    row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.t[0] ) );
+    row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.tf[0] ) );

    BLAKE2S_ROUND( 0 );
    BLAKE2S_ROUND( 1 );
@ -653,7 +653,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
    const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]);
    const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]);
    row3 = vld1q_u32(&BLAKE2S_IV[0]);
-    row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0]));
+    row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.tf[0]));

    BLAKE2S_ROUND(0);
    BLAKE2S_ROUND(1);
@ -1000,7 +1000,7 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
    row1 = ff0 = VectorLoad32LE( &state.h[0] );
    row2 = ff1 = VectorLoad32LE( &state.h[4] );
    row3 = VectorLoad32( &BLAKE2S_IV[0] );
-    row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.t[0] ) );
+    row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.tf[0] ) );

    BLAKE2S_ROUND( 0 );
    BLAKE2S_ROUND( 1 );