mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00
Fix partial specializations for FixedSizeAllocatorWithCleanup (PR #710)
Commit afbd3e60f6 effectively treated a symptom and not the underlying problem. The problem was that linkers on 32-bit systems ignore the CRYPTOPP_ALIGN_DATA(16) passed down by the compiler and align to 8 bytes or less. We have to use Wei's original code in some places. It is not a bad thing, but the bit fiddling is something we would like to contain a little more by depending more on language or platform features. This commit keeps the original changes which improve partial specializations, but fixes 32-bit linker behavior by effectively reverting afbd3e60f6 and e054d36dc8. We also add more comments so the next person understands why things are done the way they are.
This commit is contained in:
parent
243673c32a
commit
1bbbfb6b75
3
config.h
3
config.h
@ -346,13 +346,10 @@ NAMESPACE_END
|
||||
// CRYPTOPP_ALIGN_DATA may not be reliable on AIX.
|
||||
#ifndef CRYPTOPP_ALIGN_DATA
|
||||
#if defined(_MSC_VER)
|
||||
#define CRYPTOPP_ALIGN_ATTRIBUTE 1
|
||||
#define CRYPTOPP_ALIGN_DATA(x) __declspec(align(x))
|
||||
#elif defined(__GNUC__) || (__SUNPRO_CC >= 0x5100)
|
||||
#define CRYPTOPP_ALIGN_ATTRIBUTE 1
|
||||
#define CRYPTOPP_ALIGN_DATA(x) __attribute__((aligned(x)))
|
||||
#elif defined(__xlc__) || defined(__xlC__)
|
||||
#define CRYPTOPP_ALIGN_ATTRIBUTE 1
|
||||
#define CRYPTOPP_ALIGN_DATA(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define CRYPTOPP_ALIGN_DATA(x)
|
||||
|
4
misc.h
4
misc.h
@ -2072,8 +2072,8 @@ void ByteReverse(T *out, const T *in, size_t byteCount)
|
||||
{
|
||||
// Alignment check due to Issues 690
|
||||
CRYPTOPP_ASSERT(byteCount % sizeof(T) == 0);
|
||||
//CRYPTOPP_ASSERT(IsAligned<T*>(in));
|
||||
//CRYPTOPP_ASSERT(IsAligned<T*>(out));
|
||||
CRYPTOPP_ASSERT(IsAligned<T>(in));
|
||||
CRYPTOPP_ASSERT(IsAligned<T>(out));
|
||||
|
||||
size_t count = byteCount/sizeof(T);
|
||||
for (size_t i=0; i<count; i++)
|
||||
|
54
panama.cpp
54
panama.cpp
@ -93,10 +93,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
|
||||
AS_PUSH_IF86( cx)
|
||||
#endif
|
||||
|
||||
AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
|
||||
AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
|
||||
AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
|
||||
AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
|
||||
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
|
||||
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
|
||||
AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
|
||||
AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
|
||||
AS2( mov eax, dword ptr [AS_REG_2+4*16])
|
||||
|
||||
ASL(4)
|
||||
@ -184,8 +184,8 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
|
||||
ASL(1)
|
||||
AS2( test AS_REG_3, 15)
|
||||
ASJ( jnz, 3, f)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
|
||||
AS2( add AS_REG_3, 32)
|
||||
ASJ( jmp, 0, f)
|
||||
ASL(3)
|
||||
@ -200,26 +200,24 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
|
||||
AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
|
||||
AS2( and AS_REG_7, 31*32)
|
||||
|
||||
AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
|
||||
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
|
||||
AS2( pxor xmm3, xmm0)
|
||||
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
|
||||
AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
|
||||
AS2( pxor xmm0, xmm5)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
|
||||
AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
|
||||
|
||||
AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
|
||||
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
|
||||
AS2( pxor xmm1, xmm4)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
|
||||
AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
|
||||
AS2( pxor xmm4, xmm5)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
|
||||
AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
|
||||
|
||||
// theta
|
||||
AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
|
||||
AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
|
||||
AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
|
||||
AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
|
||||
AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
|
||||
AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
|
||||
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
|
||||
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
|
||||
|
||||
#if CRYPTOPP_SSSE3_ASM_AVAILABLE
|
||||
AS2( test AS_REG_6, 1)
|
||||
@ -273,16 +271,16 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
|
||||
AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
|
||||
AS2( and AS_REG_7, 31*32)
|
||||
|
||||
AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
|
||||
AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
|
||||
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
|
||||
AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
AS2( pxor xmm3, xmm4)
|
||||
AS2( pxor xmm2, xmm6)
|
||||
|
||||
AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
|
||||
AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
|
||||
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
|
||||
AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
@ -296,10 +294,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
|
||||
|
||||
// save state
|
||||
AS2( mov [AS_REG_2+4*16], eax)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+3*16], xmm3)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+2*16], xmm2)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+1*16], xmm1)
|
||||
AS2( movdqu XMMWORD_PTR [AS_REG_2+0*16], xmm0)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
|
||||
AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
|
||||
|
||||
#if CRYPTOPP_BOOL_X86
|
||||
AS2( add esp, 4)
|
||||
|
50
salsa.cpp
50
salsa.cpp
@ -265,12 +265,12 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
|
||||
#define SSE2_EXPAND_S(i, j) \
|
||||
ASS( pshufd xmm4, xmm##i, j, j, j, j) \
|
||||
AS2( movdqu [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
|
||||
AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
|
||||
|
||||
AS2( movdqu xmm0, [REG_state + 0*16])
|
||||
AS2( movdqu xmm1, [REG_state + 1*16])
|
||||
AS2( movdqu xmm2, [REG_state + 2*16])
|
||||
AS2( movdqu xmm3, [REG_state + 3*16])
|
||||
AS2( movdqa xmm0, [REG_state + 0*16])
|
||||
AS2( movdqa xmm1, [REG_state + 1*16])
|
||||
AS2( movdqa xmm2, [REG_state + 2*16])
|
||||
AS2( movdqa xmm3, [REG_state + 3*16])
|
||||
SSE2_EXPAND_S(0, 0)
|
||||
SSE2_EXPAND_S(0, 1)
|
||||
SSE2_EXPAND_S(0, 2)
|
||||
@ -311,15 +311,15 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
AS2( pxor xmm##b, xmm4) \
|
||||
AS2( pxor xmm##b, xmm5)
|
||||
|
||||
#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
|
||||
#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
|
||||
#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
|
||||
#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
|
||||
#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
|
||||
#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
|
||||
#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
|
||||
#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
|
||||
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
|
||||
#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
|
||||
#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + b*16], xmm##A)
|
||||
#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
|
||||
#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
|
||||
#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
|
||||
#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
|
||||
@ -327,7 +327,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
|
||||
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
|
||||
#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
|
||||
#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + c*16], xmm##A)
|
||||
#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
|
||||
#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
|
||||
#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
|
||||
#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
|
||||
@ -335,14 +335,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
|
||||
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
|
||||
#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
|
||||
#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + d*16], xmm##A)
|
||||
#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
|
||||
#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
|
||||
#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
|
||||
#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
|
||||
#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
|
||||
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
|
||||
#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
|
||||
#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + a*16], xmm##A)
|
||||
#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
|
||||
|
||||
#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
|
||||
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
|
||||
@ -453,13 +453,13 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
ASJ( jnz, 6, b)
|
||||
|
||||
#define SSE2_OUTPUT_4(a, b, c, d) \
|
||||
AS2( movdqu xmm4, [SSE2_WORKSPACE + a*16 + 256])\
|
||||
AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
|
||||
AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
|
||||
AS2( movdqu xmm5, [SSE2_WORKSPACE + b*16 + 256])\
|
||||
AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
|
||||
AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
|
||||
AS2( movdqu xmm6, [SSE2_WORKSPACE + c*16 + 256])\
|
||||
AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
|
||||
AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
|
||||
AS2( movdqu xmm7, [SSE2_WORKSPACE + d*16 + 256])\
|
||||
AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
|
||||
AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
|
||||
ASC( call, SSE2_Salsa_Output)
|
||||
|
||||
@ -480,10 +480,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
ASL(5)
|
||||
AS2( sub REG_iterationCount, 1)
|
||||
ASJ( jl, 4, f)
|
||||
AS2( movdqu xmm0, [REG_state + 0*16])
|
||||
AS2( movdqu xmm1, [REG_state + 1*16])
|
||||
AS2( movdqu xmm2, [REG_state + 2*16])
|
||||
AS2( movdqu xmm3, [REG_state + 3*16])
|
||||
AS2( movdqa xmm0, [REG_state + 0*16])
|
||||
AS2( movdqa xmm1, [REG_state + 1*16])
|
||||
AS2( movdqa xmm2, [REG_state + 2*16])
|
||||
AS2( movdqa xmm3, [REG_state + 3*16])
|
||||
AS2( mov REG_roundsLeft, REG_rounds)
|
||||
|
||||
ASL(0)
|
||||
@ -504,14 +504,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||
AS2( sub REG_roundsLeft, 2)
|
||||
ASJ( jnz, 0, b)
|
||||
|
||||
AS2( movdqu xmm6, [REG_state + 0*16])
|
||||
AS2( paddd xmm0, xmm6)
|
||||
AS2( movdqu xmm7, [REG_state + 1*16])
|
||||
AS2( paddd xmm1, xmm7)
|
||||
AS2( movdqu xmm6, [REG_state + 2*16])
|
||||
AS2( paddd xmm2, xmm6)
|
||||
AS2( movdqu xmm7, [REG_state + 3*16])
|
||||
AS2( paddd xmm3, xmm7)
|
||||
AS2( paddd xmm0, [REG_state + 0*16])
|
||||
AS2( paddd xmm1, [REG_state + 1*16])
|
||||
AS2( paddd xmm2, [REG_state + 2*16])
|
||||
AS2( paddd xmm3, [REG_state + 3*16])
|
||||
|
||||
AS2( add dword ptr [REG_state + 8*4], 1)
|
||||
AS2( adc dword ptr [REG_state + 5*4], 0)
|
||||
|
43
secblock.h
43
secblock.h
@ -468,24 +468,49 @@ public:
|
||||
|
||||
private:
|
||||
|
||||
#if defined(CRYPTOPP_BOOL_ALIGN16) && defined(CRYPTOPP_ALIGN_ATTRIBUTE)
|
||||
#if defined(CRYPTOPP_BOOL_ALIGN16) && (defined(_M_X64) || defined(__x86_64__))
|
||||
// Before we can add additional platforms we need to check the
|
||||
// linker documentation for alignment behavior for stack variables.
|
||||
// CRYPTOPP_ALIGN_DATA(16) is known OK on Linux, OS X, Solaris.
|
||||
// Also see http://stackoverflow.com/a/1468656/608639.
|
||||
T* GetAlignedArray() {
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(m_array, 16));
|
||||
return m_array;
|
||||
}
|
||||
CRYPTOPP_ALIGN_DATA(16) T m_array[S];
|
||||
|
||||
#elif defined(CRYPTOPP_BOOL_ALIGN16)
|
||||
// There be demons here... Some platforms and small datatypes can
|
||||
// make things go sideways. We experienced it on AIX with XLC. If
|
||||
// we see anymore problems we should probably avoid the stack and
|
||||
// move to aligned heap allocations.
|
||||
|
||||
// There be demons here... We cannot use CRYPTOPP_ALIGN_DATA(16)
|
||||
// because linkers on 32-bit machines (and some 64-bit machines)
|
||||
// align the stack to 8-bytes or less by default, not 16-bytes as
|
||||
// requested. Additionally, the AIX linker seems to use 4-bytes
|
||||
// by default. However, all linkers tested appear to honor
|
||||
// CRYPTOPP_ALIGN_DATA(8). Given we can achieve 8-byte array
|
||||
// alignment, we need to transform the address returned from
|
||||
// GetAlignedArray() to a 16-byte alignment.
|
||||
// Also see http://stackoverflow.com/a/1468656/608639.
|
||||
//
|
||||
// The 16-byte alignment is achieved by padding the requested
|
||||
// size with extra elements so we have at least 8-bytes of slack
|
||||
// to work with. Then the pointer is moved down to achieve a
|
||||
// 16-byte alignment (stacks grow down).
|
||||
//
|
||||
// The additional 8-bytes introduces a small secondary issue.
|
||||
// The secondary issue is, a large T results in 0 = 8/sizeof(T).
|
||||
// The library is OK but users may hit it. So we need to guard
|
||||
// for a large T, and that is what PAD achieves.
|
||||
T* GetAlignedArray() {
|
||||
T* p_array = (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(p_array, 16));
|
||||
return p_array;
|
||||
}
|
||||
T m_array[S+8/sizeof(T)];
|
||||
// PAD is elements, not bytes, and rounded up to ensure no overflow.
|
||||
enum { Q = sizeof(T), PAD = (Q >= 8) ? 1 : (Q >= 4) ? 2 : (Q >= 2) ? 4 : 8 };
|
||||
CRYPTOPP_ALIGN_DATA(8) T m_array[S+PAD];
|
||||
|
||||
#else
|
||||
|
||||
T* GetAlignedArray() {return m_array;}
|
||||
T m_array[S];
|
||||
#endif
|
||||
@ -576,10 +601,8 @@ public:
|
||||
{
|
||||
if (ptr == GetAlignedArray())
|
||||
{
|
||||
// If the m_allocated assert fires then the bit twiddling for
|
||||
// GetAlignedArray() is probably incorrect for the platform.
|
||||
// Be sure to check CRYPTOPP_ALIGN_DATA(8). The platform may
|
||||
// not have a way to declaratively align data to 8.
|
||||
// If the m_allocated assert fires then
|
||||
// something overwrote the flag.
|
||||
CRYPTOPP_ASSERT(size <= S);
|
||||
CRYPTOPP_ASSERT(m_allocated);
|
||||
m_allocated = false;
|
||||
|
5
sha.cpp
5
sha.cpp
@ -1119,8 +1119,7 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
|
||||
|
||||
#define SSE2_CombineState(i) \
|
||||
AS2( movdqu xmm0, [edi+i*16])\
|
||||
AS2( movdqu xmm1, [ecx+i*16])\
|
||||
AS2( paddq xmm0, xmm1)\
|
||||
AS2( paddq xmm0, [ecx+i*16])\
|
||||
AS2( movdqu [ecx+i*16], xmm0)
|
||||
|
||||
SSE2_CombineState(0)
|
||||
@ -1148,7 +1147,7 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
|
||||
|
||||
// ANONYMOUS_NAMESPACE_END
|
||||
|
||||
#endif // CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||
#endif // CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -412,10 +412,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
||||
AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
|
||||
AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
|
||||
AS2( mov SSE2_wordsLeft, WORD_REG(si))
|
||||
AS2( movdqu xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
|
||||
AS2( movdqu [SSE2_stateCopy+0*16], xmm0)
|
||||
AS2( movdqu xmm0, [WORD_REG(ax)+1*16])
|
||||
AS2( movdqu [SSE2_stateCopy+1*16], xmm0)
|
||||
AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
|
||||
AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
|
||||
AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
|
||||
AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
|
||||
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
|
||||
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
|
||||
AS2( psrlq xmm0, 32)
|
||||
@ -507,10 +507,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
||||
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
|
||||
|
||||
ASL(1) // second inner loop, 16 words each, 5 iterations
|
||||
AS2( movdqu xmm0, [WORD_REG(di)+0*20*4])
|
||||
AS2( movdqu xmm2, [WORD_REG(di)+2*20*4])
|
||||
AS2( movdqu xmm3, [WORD_REG(di)+3*20*4])
|
||||
AS2( movdqu xmm1, [WORD_REG(di)+1*20*4])
|
||||
AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
|
||||
AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
|
||||
AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
|
||||
AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
|
||||
// S2
|
||||
AS2( movdqa xmm4, xmm0)
|
||||
AS2( pand xmm0, xmm2)
|
||||
@ -596,10 +596,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
||||
|
||||
ASL(6) // save state
|
||||
AS2( mov AS_REG_6, SSE2_state)
|
||||
AS2( movdqu xmm0, [SSE2_stateCopy+0*16])
|
||||
AS2( movdqu [AS_REG_6+0*16], xmm0)
|
||||
AS2( movdqu xmm0, [SSE2_stateCopy+1*16])
|
||||
AS2( movdqu [AS_REG_6+1*16], xmm0)
|
||||
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
|
||||
AS2( movdqa [AS_REG_6+0*16], xmm0)
|
||||
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
|
||||
AS2( movdqa [AS_REG_6+1*16], xmm0)
|
||||
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
|
||||
AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
|
||||
AS2( mov [AS_REG_6+10*4], ecx)
|
||||
|
24
tiger.cpp
24
tiger.cpp
@ -51,7 +51,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
|
||||
Restart(); // reinit for next use
|
||||
}
|
||||
|
||||
void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
void Tiger::Transform (word64 *state, const word64 *data)
|
||||
{
|
||||
#if CRYPTOPP_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
|
||||
if (HasSSE2())
|
||||
@ -63,8 +63,8 @@ void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
AS_PUSH_IF86(bx)
|
||||
#else
|
||||
AS2( lea edx, [table])
|
||||
AS2( mov eax, digest)
|
||||
AS2( mov esi, X)
|
||||
AS2( mov eax, state)
|
||||
AS2( mov esi, data)
|
||||
#endif
|
||||
AS2( movq mm0, [eax])
|
||||
AS2( movq mm1, [eax+1*8])
|
||||
@ -213,7 +213,7 @@ void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
AS_POP_IF86(bx)
|
||||
ATT_PREFIX
|
||||
:
|
||||
: "a" (digest), "S" (X), "d" (table)
|
||||
: "a" (state), "S" (data), "d" (table)
|
||||
: "%ecx", "%edi", "memory", "cc"
|
||||
);
|
||||
#endif
|
||||
@ -221,9 +221,9 @@ void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
else
|
||||
#endif
|
||||
{
|
||||
word64 a = digest[0];
|
||||
word64 b = digest[1];
|
||||
word64 c = digest[2];
|
||||
word64 a = state[0];
|
||||
word64 b = state[1];
|
||||
word64 c = state[2];
|
||||
word64 Y[8];
|
||||
|
||||
#define t1 (table)
|
||||
@ -267,15 +267,15 @@ void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
Y[6] += Y[5]; \
|
||||
Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)
|
||||
|
||||
pass(a,b,c,5,X);
|
||||
key_schedule(Y,X);
|
||||
pass(a,b,c,5,data);
|
||||
key_schedule(Y,data);
|
||||
pass(c,a,b,7,Y);
|
||||
key_schedule(Y,Y);
|
||||
pass(b,c,a,9,Y);
|
||||
|
||||
digest[0] = a ^ digest[0];
|
||||
digest[1] = b - digest[1];
|
||||
digest[2] = c + digest[2];
|
||||
state[0] = a ^ state[0];
|
||||
state[1] = b - state[1];
|
||||
state[2] = c + state[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user