Fix partial specializations for FixedSizeAllocatorWithCleanup (PR #710)

Commit afbd3e60f6 effectively treated a symptom and not the underlying problem. The underlying problem is that linkers on 32-bit systems ignore the CRYPTOPP_ALIGN_DATA(16) request the compiler passes down and align stack variables to 8 bytes or less. We have to use Wei's original code in some places. That is not a bad thing, but the bit fiddling is something we would like to contain a little more by depending more on language or platform features.

This commit keeps the original changes which improve partial specializations, but fixes 32-bit linker behavior by effectively reverting afbd3e60f6 and e054d36dc8. We also add more comments so the next person understands why things are done the way they are.
Jeffrey Walton 2018-08-24 08:13:23 -04:00 committed by GitHub
parent 243673c32a
commit 1bbbfb6b75
8 changed files with 110 additions and 97 deletions
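The stack-alignment behavior described above is easy to probe. Below is a small, hypothetical test program (not library code) for GCC and Clang targets: it requests 16-byte alignment using the same attribute CRYPTOPP_ALIGN_DATA(16) expands to, then reports whether the local variable actually landed on a 16-byte boundary. On ABIs that only keep the stack 4- or 8-byte aligned, the answer can be "no".

    #include <cstdio>
    #include <cstdint>

    int main()
    {
        // Same attribute CRYPTOPP_ALIGN_DATA(16) expands to on GCC/Clang.
        __attribute__((aligned(16))) unsigned char buf[32];
        std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(buf);
        std::printf("%p is %s16-byte aligned\n",
                    static_cast<void*>(buf), (addr % 16) ? "NOT " : "");
        return 0;
    }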

config.h

@@ -346,13 +346,10 @@ NAMESPACE_END
// CRYPTOPP_ALIGN_DATA may not be reliable on AIX.
#ifndef CRYPTOPP_ALIGN_DATA
#if defined(_MSC_VER)
-#define CRYPTOPP_ALIGN_ATTRIBUTE 1
#define CRYPTOPP_ALIGN_DATA(x) __declspec(align(x))
#elif defined(__GNUC__) || (__SUNPRO_CC >= 0x5100)
-#define CRYPTOPP_ALIGN_ATTRIBUTE 1
#define CRYPTOPP_ALIGN_DATA(x) __attribute__((aligned(x)))
#elif defined(__xlc__) || defined(__xlC__)
-#define CRYPTOPP_ALIGN_ATTRIBUTE 1
#define CRYPTOPP_ALIGN_DATA(x) __attribute__((aligned(x)))
#else
#define CRYPTOPP_ALIGN_DATA(x)
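For orientation, the macro defined above is consumed like the sketch below (illustrative declarations, not lines from the library). The distinction that matters for this commit: compilers and linkers generally honor the attribute for static data, while automatic (stack) variables are at the mercy of the ABI's stack alignment, which is the 32-bit failure described in the commit message.

    // Static data: section alignment is honored, movdqa access is safe.
    CRYPTOPP_ALIGN_DATA(16) static unsigned char s_table[64];

    void F()
    {
        // Stack data: the problem case on 32-bit ABIs with 4/8-byte stacks.
        CRYPTOPP_ALIGN_DATA(16) unsigned char workspace[64];
        (void)workspace;
    }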

misc.h

@@ -2072,8 +2072,8 @@ void ByteReverse(T *out, const T *in, size_t byteCount)
{
// Alignment check due to Issue 690
CRYPTOPP_ASSERT(byteCount % sizeof(T) == 0);
-//CRYPTOPP_ASSERT(IsAligned<T*>(in));
-//CRYPTOPP_ASSERT(IsAligned<T*>(out));
+CRYPTOPP_ASSERT(IsAligned<T>(in));
+CRYPTOPP_ASSERT(IsAligned<T>(out));
size_t count = byteCount/sizeof(T);
for (size_t i=0; i<count; i++)
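The re-enabled asserts verify that callers pass pointers meeting T's natural alignment (the regression behind Issue 690). As a rough stand-in for the check, assuming IsAligned<T> tests the pointer against T's alignment requirement:

    #include <cstdint>

    // Rough stand-in (an assumption, not the library's definition):
    // true when p meets T's natural alignment requirement.
    template <class T>
    inline bool IsAlignedSketch(const void* p)
    {
        return reinterpret_cast<std::uintptr_t>(p) % alignof(T) == 0;
    }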

panama.cpp

@@ -93,10 +93,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS_PUSH_IF86( cx)
#endif
-AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
-AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
AS2( mov eax, dword ptr [AS_REG_2+4*16])
ASL(4)
@@ -184,8 +184,8 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
ASL(1)
AS2( test AS_REG_3, 15)
ASJ( jnz, 3, f)
-AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
-AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
+AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
+AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
AS2( add AS_REG_3, 32)
ASJ( jmp, 0, f)
ASL(3)
@@ -200,26 +200,24 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
AS2( and AS_REG_7, 31*32)
-AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
+AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
-AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
-AS2( pxor xmm0, xmm5)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
+AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
+AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
-AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
+AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
AS2( pxor xmm1, xmm4)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
-AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
-AS2( pxor xmm4, xmm5)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
+AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
+AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
// theta
-AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
-AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
#if CRYPTOPP_SSSE3_ASM_AVAILABLE
AS2( test AS_REG_6, 1)
@@ -273,16 +271,16 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
AS2( and AS_REG_7, 31*32)
-AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
-AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
+AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
+AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
-AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
-AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
+AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
+AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
@@ -296,10 +294,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
// save state
AS2( mov [AS_REG_2+4*16], eax)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+3*16], xmm3)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+2*16], xmm2)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+1*16], xmm1)
-AS2( movdqu XMMWORD_PTR [AS_REG_2+0*16], xmm0)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
+AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
#if CRYPTOPP_BOOL_X86
AS2( add esp, 4)
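Every movdqu-to-movdqa flip in this file (and in salsa.cpp and sosemanuk.cpp below) leans on the 16-byte guarantee restored in secblock.h: movdqa requires a 16-byte aligned address and faults on anything less, while movdqu accepts any address at some cost on older processors. In intrinsics terms, the pair looks like this sketch:

    #include <emmintrin.h>  // SSE2

    // movdqu: tolerates any address.
    __m128i LoadUnaligned(const void* p)
    {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }

    // movdqa: p must be 16-byte aligned or the CPU raises a fault.
    __m128i LoadAligned(const void* p)
    {
        return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
    }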

salsa.cpp

@@ -265,12 +265,12 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
#define SSE2_EXPAND_S(i, j) \
ASS( pshufd xmm4, xmm##i, j, j, j, j) \
-AS2( movdqu [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
+AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
-AS2( movdqu xmm0, [REG_state + 0*16])
-AS2( movdqu xmm1, [REG_state + 1*16])
-AS2( movdqu xmm2, [REG_state + 2*16])
-AS2( movdqu xmm3, [REG_state + 3*16])
+AS2( movdqa xmm0, [REG_state + 0*16])
+AS2( movdqa xmm1, [REG_state + 1*16])
+AS2( movdqa xmm2, [REG_state + 2*16])
+AS2( movdqa xmm3, [REG_state + 3*16])
SSE2_EXPAND_S(0, 0)
SSE2_EXPAND_S(0, 1)
SSE2_EXPAND_S(0, 2)
@@ -311,15 +311,15 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
AS2( pxor xmm##b, xmm4) \
AS2( pxor xmm##b, xmm5)
-#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
-#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
+#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
+#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
-#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + b*16], xmm##A)
+#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
@@ -327,7 +327,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
-#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + c*16], xmm##A)
+#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
@@ -335,14 +335,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
-#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + d*16], xmm##A)
+#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
-#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + a*16], xmm##A)
+#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
@@ -453,13 +453,13 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
ASJ( jnz, 6, b)
#define SSE2_OUTPUT_4(a, b, c, d) \
-AS2( movdqu xmm4, [SSE2_WORKSPACE + a*16 + 256])\
+AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
-AS2( movdqu xmm5, [SSE2_WORKSPACE + b*16 + 256])\
+AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
-AS2( movdqu xmm6, [SSE2_WORKSPACE + c*16 + 256])\
+AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
-AS2( movdqu xmm7, [SSE2_WORKSPACE + d*16 + 256])\
+AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
ASC( call, SSE2_Salsa_Output)
@@ -480,10 +480,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
ASL(5)
AS2( sub REG_iterationCount, 1)
ASJ( jl, 4, f)
-AS2( movdqu xmm0, [REG_state + 0*16])
-AS2( movdqu xmm1, [REG_state + 1*16])
-AS2( movdqu xmm2, [REG_state + 2*16])
-AS2( movdqu xmm3, [REG_state + 3*16])
+AS2( movdqa xmm0, [REG_state + 0*16])
+AS2( movdqa xmm1, [REG_state + 1*16])
+AS2( movdqa xmm2, [REG_state + 2*16])
+AS2( movdqa xmm3, [REG_state + 3*16])
AS2( mov REG_roundsLeft, REG_rounds)
ASL(0)
@@ -504,14 +504,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
AS2( sub REG_roundsLeft, 2)
ASJ( jnz, 0, b)
-AS2( movdqu xmm6, [REG_state + 0*16])
-AS2( paddd xmm0, xmm6)
-AS2( movdqu xmm7, [REG_state + 1*16])
-AS2( paddd xmm1, xmm7)
-AS2( movdqu xmm6, [REG_state + 2*16])
-AS2( paddd xmm2, xmm6)
-AS2( movdqu xmm7, [REG_state + 3*16])
-AS2( paddd xmm3, xmm7)
+AS2( paddd xmm0, [REG_state + 0*16])
+AS2( paddd xmm1, [REG_state + 1*16])
+AS2( paddd xmm2, [REG_state + 2*16])
+AS2( paddd xmm3, [REG_state + 3*16])
AS2( add dword ptr [REG_state + 8*4], 1)
AS2( adc dword ptr [REG_state + 5*4], 0)
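For readers decoding the L01..L32 macros above: each group of eight steps computes one Salsa20 quarter-round, vectorized four columns wide with SSE2. In scalar form, per Bernstein's Salsa20 specification (a sketch for exposition, not library code):

    #include <cstdint>

    static inline uint32_t rotl32(uint32_t v, unsigned s)
    {
        return (v << s) | (v >> (32 - s));
    }

    // One Salsa20 quarter-round; cf. the /* z1 */ .. /* z0 */ comments above.
    static void QuarterRound(uint32_t& y0, uint32_t& y1, uint32_t& y2, uint32_t& y3)
    {
        y1 ^= rotl32(y0 + y3, 7);   // z1
        y2 ^= rotl32(y1 + y0, 9);   // z2
        y3 ^= rotl32(y2 + y1, 13);  // z3
        y0 ^= rotl32(y3 + y2, 18);  // z0
    }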

secblock.h

@@ -468,24 +468,49 @@ public:
private:
-#if defined(CRYPTOPP_BOOL_ALIGN16) && defined(CRYPTOPP_ALIGN_ATTRIBUTE)
+#if defined(CRYPTOPP_BOOL_ALIGN16) && (defined(_M_X64) || defined(__x86_64__))
+// Before we can add additional platforms we need to check the
+// linker documentation for alignment behavior for stack variables.
+// CRYPTOPP_ALIGN_DATA(16) is known OK on Linux, OS X, Solaris.
+// Also see http://stackoverflow.com/a/1468656/608639.
T* GetAlignedArray() {
CRYPTOPP_ASSERT(IsAlignedOn(m_array, 16));
return m_array;
}
CRYPTOPP_ALIGN_DATA(16) T m_array[S];
#elif defined(CRYPTOPP_BOOL_ALIGN16)
-// There be demons here... Some platforms and small datatypes can
-// make things go sideways. We experienced it on AIX with XLC. If
-// we see any more problems we should probably avoid the stack and
-// move to aligned heap allocations.
+// There be demons here... We cannot use CRYPTOPP_ALIGN_DATA(16)
+// because linkers on 32-bit machines (and some 64-bit machines)
+// align the stack to 8-bytes or less by default, not 16-bytes as
+// requested. Additionally, the AIX linker seems to use 4-bytes
+// by default. However, all linkers tested appear to honor
+// CRYPTOPP_ALIGN_DATA(8). Given we can achieve 8-byte array
+// alignment, GetAlignedArray() needs to transform the returned
+// address up to a 16-byte alignment.
+// Also see http://stackoverflow.com/a/1468656/608639.
+//
+// The 16-byte alignment is achieved by padding the requested
+// size with extra elements so we have at least 8-bytes of slack
+// to work with. Then the pointer is moved up to the next
+// 16-byte boundary within the padded array.
+//
+// The additional 8-bytes introduces a small secondary issue:
+// for a large T, 8/sizeof(T) integer-divides to 0, leaving no
+// slack. The library's own types are OK, but user types may hit
+// it. So we guard for a large T, and that is what PAD achieves.
T* GetAlignedArray() {
T* p_array = (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);
CRYPTOPP_ASSERT(IsAlignedOn(p_array, 16));
return p_array;
}
-T m_array[S+8/sizeof(T)];
+// PAD is elements, not bytes, and rounded up to ensure no overflow.
+enum { Q = sizeof(T), PAD = (Q >= 8) ? 1 : (Q >= 4) ? 2 : (Q >= 2) ? 4 : 8 };
+CRYPTOPP_ALIGN_DATA(8) T m_array[S+PAD];
#else
T* GetAlignedArray() {return m_array;}
T m_array[S];
#endif
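The arithmetic in the fallback GetAlignedArray() deserves a concrete check. With the 8-byte base alignment that CRYPTOPP_ALIGN_DATA(8) guarantees, (0 - (size_t)m_array) % 16 is always 0 or 8, so the adjusted pointer never moves past the slack that PAD reserves (at least 8 bytes for every sizeof(T)). A small standalone verification of that claim:

    #include <cassert>
    #include <cstddef>

    int main()
    {
        // Walk all 8-byte aligned base addresses modulo 256.
        for (std::size_t base = 0; base < 256; base += 8)
        {
            std::size_t off = (0 - base) % 16;  // bytes up to the next 16-byte boundary
            assert(off == 0 || off == 8);       // stays within the 8 bytes of slack
            assert((base + off) % 16 == 0);     // lands on a 16-byte boundary
        }
        return 0;
    }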
@@ -576,10 +601,8 @@ public:
{
if (ptr == GetAlignedArray())
{
-// If the m_allocated assert fires then the bit twiddling for
-// GetAlignedArray() is probably incorrect for the platform.
-// Be sure to check CRYPTOPP_ALIGN_DATA(8). The platform may
-// not have a way to declaratively align data to 8.
+// If the m_allocated assert fires then
+// something overwrote the flag.
CRYPTOPP_ASSERT(size <= S);
CRYPTOPP_ASSERT(m_allocated);
m_allocated = false;

sha.cpp

@@ -1119,8 +1119,7 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
#define SSE2_CombineState(i) \
AS2( movdqu xmm0, [edi+i*16])\
-AS2( movdqu xmm1, [ecx+i*16])\
-AS2( paddq xmm0, xmm1)\
+AS2( paddq xmm0, [ecx+i*16])\
AS2( movdqu [ecx+i*16], xmm0)
SSE2_CombineState(0)
@@ -1148,7 +1147,7 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
// ANONYMOUS_NAMESPACE_END
-#endif // CRYPTOPP_SSE2_ASM_AVAILABLE
+#endif // CRYPTOPP_SSE2_ASM_AVAILABLE
ANONYMOUS_NAMESPACE_BEGIN

sosemanuk.cpp

@@ -412,10 +412,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
AS2( mov SSE2_wordsLeft, WORD_REG(si))
-AS2( movdqu xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
-AS2( movdqu [SSE2_stateCopy+0*16], xmm0)
-AS2( movdqu xmm0, [WORD_REG(ax)+1*16])
-AS2( movdqu [SSE2_stateCopy+1*16], xmm0)
+AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
+AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
+AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
+AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
AS2( psrlq xmm0, 32)
@@ -507,10 +507,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
ASL(1) // second inner loop, 16 words each, 5 iterations
-AS2( movdqu xmm0, [WORD_REG(di)+0*20*4])
-AS2( movdqu xmm2, [WORD_REG(di)+2*20*4])
-AS2( movdqu xmm3, [WORD_REG(di)+3*20*4])
-AS2( movdqu xmm1, [WORD_REG(di)+1*20*4])
+AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
+AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
+AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
+AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
// S2
AS2( movdqa xmm4, xmm0)
AS2( pand xmm0, xmm2)
@@ -596,10 +596,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
ASL(6) // save state
AS2( mov AS_REG_6, SSE2_state)
-AS2( movdqu xmm0, [SSE2_stateCopy+0*16])
-AS2( movdqu [AS_REG_6+0*16], xmm0)
-AS2( movdqu xmm0, [SSE2_stateCopy+1*16])
-AS2( movdqu [AS_REG_6+1*16], xmm0)
+AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
+AS2( movdqa [AS_REG_6+0*16], xmm0)
+AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
+AS2( movdqa [AS_REG_6+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
AS2( mov [AS_REG_6+10*4], ecx)

tiger.cpp

@@ -51,7 +51,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
Restart(); // reinit for next use
}
-void Tiger::Transform (word64 *digest, const word64 *X)
+void Tiger::Transform (word64 *state, const word64 *data)
{
#if CRYPTOPP_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
if (HasSSE2())
@@ -63,8 +63,8 @@ void Tiger::Transform (word64 *digest, const word64 *X)
AS_PUSH_IF86(bx)
#else
AS2( lea edx, [table])
-AS2( mov eax, digest)
-AS2( mov esi, X)
+AS2( mov eax, state)
+AS2( mov esi, data)
#endif
AS2( movq mm0, [eax])
AS2( movq mm1, [eax+1*8])
@@ -213,7 +213,7 @@ void Tiger::Transform (word64 *digest, const word64 *X)
AS_POP_IF86(bx)
ATT_PREFIX
:
-: "a" (digest), "S" (X), "d" (table)
+: "a" (state), "S" (data), "d" (table)
: "%ecx", "%edi", "memory", "cc"
);
#endif
@@ -221,9 +221,9 @@ void Tiger::Transform (word64 *digest, const word64 *X)
else
#endif
{
-word64 a = digest[0];
-word64 b = digest[1];
-word64 c = digest[2];
+word64 a = state[0];
+word64 b = state[1];
+word64 c = state[2];
word64 Y[8];
#define t1 (table)
@@ -267,15 +267,15 @@ void Tiger::Transform (word64 *digest, const word64 *X)
Y[6] += Y[5]; \
Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)
-pass(a,b,c,5,X);
-key_schedule(Y,X);
+pass(a,b,c,5,data);
+key_schedule(Y,data);
pass(c,a,b,7,Y);
key_schedule(Y,Y);
pass(b,c,a,9,Y);
-digest[0] = a ^ digest[0];
-digest[1] = b - digest[1];
-digest[2] = c + digest[2];
+state[0] = a ^ state[0];
+state[1] = b - state[1];
+state[2] = c + state[2];
}
}