tweaks/fixes for 5.6

This commit is contained in:
weidai 2009-03-03 03:28:39 +00:00
parent 7d88bbd9ed
commit 8565900724
11 changed files with 301 additions and 298 deletions

View File

@ -414,6 +414,7 @@ the mailing list.
5.6 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
- added CCM, GCM (with SSE2 assembly), CMAC, and SEED
- added support for variable length IVs
- improved AES speed on x86 and x64
- fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
- fixed HashFilter bug when putMessage=true

View File

@ -104,7 +104,7 @@ void AuthenticatedSymmetricCipherBase::Update(const byte *input, size_t length)
void AuthenticatedSymmetricCipherBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
m_totalMessageLength += length;
if (m_totalMessageLength > MaxMessageLength())
if (m_state >= State_IVSet && m_totalMessageLength > MaxMessageLength())
throw InvalidArgument(AlgorithmName() + ": message length exceeds maximum");
reswitch:

10
ccm.h
View File

@ -73,7 +73,7 @@ protected:
};
//! .
template <class T_BlockCipher, int T_DefaultDigestBitSize, bool T_IsEncryption>
template <class T_BlockCipher, int T_DefaultDigestSize, bool T_IsEncryption>
class CCM_Final : public CCM_Base
{
public:
@ -84,16 +84,16 @@ public:
private:
BlockCipher & AccessBlockCipher() {return m_cipher;}
int DefaultDigestSize() const {return T_DefaultDigestBitSize/8;}
int DefaultDigestSize() const {return T_DefaultDigestSize;}
typename T_BlockCipher::Encryption m_cipher;
};
/// <a href="http://www.cryptolounge.org/wiki/CCM">CCM</a>
template <class T_BlockCipher, int T_DefaultDigestBitSize = 128>
template <class T_BlockCipher, int T_DefaultDigestSize = 16>
struct CCM : public AuthenticatedSymmetricCipherDocumentation
{
typedef CCM_Final<T_BlockCipher, T_DefaultDigestBitSize, true> Encryption;
typedef CCM_Final<T_BlockCipher, T_DefaultDigestBitSize, false> Decryption;
typedef CCM_Final<T_BlockCipher, T_DefaultDigestSize, true> Encryption;
typedef CCM_Final<T_BlockCipher, T_DefaultDigestSize, false> Decryption;
};
NAMESPACE_END

View File

@ -205,6 +205,24 @@ void AuthenticatedSymmetricCipher::SpecifyDataLengths(lword headerLength, lword
UncheckedSpecifyDataLengths(headerLength, messageLength, footerLength);
}
// One-shot authenticated encryption: resynchronize with the given IV,
// declare the AAD/message lengths up front (required by modes for which
// NeedsPrespecifiedDataLengths() returns true), feed the header (AAD),
// encrypt the message into ciphertext, then emit the MAC.
// The MAC is truncated to macSize bytes if macSize < TagSize().
void AuthenticatedSymmetricCipher::EncryptAndAuthenticate(byte *ciphertext, byte *mac, size_t macSize, const byte *iv, int ivLength, const byte *header, size_t headerLength, const byte *message, size_t messageLength)
{
Resynchronize(iv, ivLength);
SpecifyDataLengths(headerLength, messageLength);
Update(header, headerLength);
ProcessString(ciphertext, message, messageLength);
TruncatedFinal(mac, macSize);
}
// One-shot authenticated decryption: mirrors EncryptAndAuthenticate, then
// verifies the MAC.  Returns true iff the macLength-byte (possibly
// truncated) MAC matches.  Note the plaintext is written into 'message'
// before verification completes, so callers MUST check the return value
// before trusting the output.
bool AuthenticatedSymmetricCipher::DecryptAndVerify(byte *message, const byte *mac, size_t macLength, const byte *iv, int ivLength, const byte *header, size_t headerLength, const byte *ciphertext, size_t ciphertextLength)
{
Resynchronize(iv, ivLength);
SpecifyDataLengths(headerLength, ciphertextLength);
Update(header, headerLength);
ProcessString(message, ciphertext, ciphertextLength);
return TruncatedVerify(mac, macLength);
}
unsigned int RandomNumberGenerator::GenerateBit()
{
return GenerateByte() & 1;

View File

@ -377,10 +377,10 @@ public:
//! calls SetKey() with an NameValuePairs object that just specifies "Rounds"
void SetKeyWithRounds(const byte *key, size_t length, int rounds);
//! calls SetKey() with an NameValuePairs object that just specifies "IVWithLength"
//! calls SetKey() with an NameValuePairs object that just specifies "IV"
void SetKeyWithIV(const byte *key, size_t length, const byte *iv, size_t ivLength);
//! calls SetKey() with an NameValuePairs object that just specifies "IVWithLength"
//! calls SetKey() with an NameValuePairs object that just specifies "IV"
void SetKeyWithIV(const byte *key, size_t length, const byte *iv)
{SetKeyWithIV(key, length, iv, IVSize());}
@ -560,9 +560,13 @@ public:
virtual void Restart()
{TruncatedFinal(NULL, 0);}
//! size of the hash returned by Final()
//! size of the hash/digest/MAC returned by Final()
virtual unsigned int DigestSize() const =0;
//! same as DigestSize()
unsigned int TagSize() const {return DigestSize();}
//! block size of underlying compression function, or 0 if not block based
virtual unsigned int BlockSize() const {return 0;}
@ -641,9 +645,6 @@ public:
explicit BadState(const std::string &name, const char *function, const char *state) : Exception(OTHER_ERROR, name + ": " + function + " was called before " + state) {}
};
// redeclare this to avoid compiler ambiguity errors
virtual std::string AlgorithmName() const =0;
//! the maximum length of AAD that can be input before the encrypted data
virtual lword MaxHeaderLength() const =0;
//! the maximum length of encrypted data
@ -655,6 +656,13 @@ public:
virtual bool NeedsPrespecifiedDataLengths() const {return false;}
//! this function only needs to be called if NeedsPrespecifiedDataLengths() returns true
void SpecifyDataLengths(lword headerLength, lword messageLength, lword footerLength=0);
//! encrypt and generate MAC in one call. will truncate MAC if macSize < TagSize()
virtual void EncryptAndAuthenticate(byte *ciphertext, byte *mac, size_t macSize, const byte *iv, int ivLength, const byte *header, size_t headerLength, const byte *message, size_t messageLength);
//! decrypt and verify MAC in one call, returning true iff MAC is valid. will assume MAC is truncated if macLength < TagSize()
virtual bool DecryptAndVerify(byte *message, const byte *mac, size_t macLength, const byte *iv, int ivLength, const byte *header, size_t headerLength, const byte *ciphertext, size_t ciphertextLength);
// redeclare this to avoid compiler ambiguity errors
virtual std::string AlgorithmName() const =0;
protected:
const Algorithm & GetAlgorithm() const {return *static_cast<const MessageAuthenticationCode *>(this);}

View File

@ -397,7 +397,7 @@ void TestAuthenticatedSymmetricCipher(TestData &v, const NameValuePairs &overrid
std::string encrypted, decrypted;
AuthenticatedEncryptionFilter ef(*asc1, new StringSink(encrypted));
AuthenticatedDecryptionFilter df(*asc2, new StringSink(decrypted), AuthenticatedDecryptionFilter::DEFAULT_PADDING, AuthenticatedDecryptionFilter::MAC_AT_BEGIN);
AuthenticatedDecryptionFilter df(*asc2, new StringSink(decrypted), AuthenticatedDecryptionFilter::MAC_AT_BEGIN);
if (asc1->NeedsPrespecifiedDataLengths())
{

View File

@ -540,6 +540,18 @@ size_t ArrayXorSink::Put2(const byte *begin, size_t length, int messageEnd, bool
// *************************************************************
// Buffers input and applies the stream transformation with the requested
// block-padding scheme.  allowAuthenticatedSymmetricCipher lets the
// authenticated-encryption filter wrappers (which build on this filter and
// pass true) bypass the guard below; ordinary callers passing an AEAD
// cipher are rejected so the MAC handling is not silently skipped.
StreamTransformationFilter::StreamTransformationFilter(StreamTransformation &c, BufferedTransformation *attachment, BlockPaddingScheme padding, bool allowAuthenticatedSymmetricCipher)
: FilterWithBufferedInput(attachment)
, m_cipher(c)
{
// a nonzero MinLastBlockSize that is not larger than the mandatory block
// size would make the last-block buffering logic inconsistent
assert(c.MinLastBlockSize() == 0 || c.MinLastBlockSize() > c.MandatoryBlockSize());
// guard: AEAD ciphers must go through the authenticated filters
if (!allowAuthenticatedSymmetricCipher && dynamic_cast<AuthenticatedSymmetricCipher *>(&c) != 0)
throw InvalidArgument("StreamTransformationFilter: please use AuthenticatedEncryptionFilter and AuthenticatedDecryptionFilter for AuthenticatedSymmetricCipher");
IsolatedInitialize(MakeParameters(Name::BlockPaddingScheme(), padding));
}
size_t StreamTransformationFilter::LastBlockSize(StreamTransformation &c, BlockPaddingScheme padding)
{
if (c.MinLastBlockSize() > 0)
@ -550,15 +562,6 @@ size_t StreamTransformationFilter::LastBlockSize(StreamTransformation &c, BlockP
return 0;
}
// Older three-argument constructor: same initialization as the four-argument
// form but without the AEAD guard parameter.
StreamTransformationFilter::StreamTransformationFilter(StreamTransformation &c, BufferedTransformation *attachment, BlockPaddingScheme padding)
: FilterWithBufferedInput(attachment)
, m_cipher(c)
{
// sanity check on the transformation's last-block-size contract
assert(c.MinLastBlockSize() == 0 || c.MinLastBlockSize() > c.MandatoryBlockSize());
IsolatedInitialize(MakeParameters(Name::BlockPaddingScheme(), padding));
}
void StreamTransformationFilter::InitializeDerivedAndReturnNewSizes(const NameValuePairs &parameters, size_t &firstSize, size_t &blockSize, size_t &lastSize)
{
BlockPaddingScheme padding = parameters.GetValueWithDefault(Name::BlockPaddingScheme(), DEFAULT_PADDING);
@ -804,8 +807,8 @@ void HashVerificationFilter::LastPut(const byte *inString, size_t length)
// *************************************************************
AuthenticatedEncryptionFilter::AuthenticatedEncryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment,
BlockPaddingScheme padding, bool putMessage, int truncatedDigestSize, const std::string &macChannel)
: StreamTransformationFilter(c, attachment, padding)
bool putMessage, int truncatedDigestSize, const std::string &macChannel, BlockPaddingScheme padding)
: StreamTransformationFilter(c, attachment, padding, true)
, m_hf(c, new OutputProxy(*this, false), putMessage, truncatedDigestSize, "AAD", macChannel)
{
assert(c.IsForwardTransformation());
@ -847,10 +850,10 @@ void AuthenticatedEncryptionFilter::LastPut(const byte *inString, size_t length)
// *************************************************************
AuthenticatedDecryptionFilter::AuthenticatedDecryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment, BlockPaddingScheme padding, word32 flags, int truncatedDigestSize)
AuthenticatedDecryptionFilter::AuthenticatedDecryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment, word32 flags, int truncatedDigestSize, BlockPaddingScheme padding)
: FilterWithBufferedInput(attachment)
, m_hashVerifier(c, new OutputProxy(*this, false))
, m_streamFilter(c, new OutputProxy(*this, false))
, m_streamFilter(c, new OutputProxy(*this, false), padding, true)
{
assert(!c.IsForwardTransformation() || c.IsSelfInverting());
IsolatedInitialize(MakeParameters(Name::BlockPaddingScheme(), padding)(Name::AuthenticatedDecryptionFilterFlags(), flags)(Name::TruncatedDigestSize(), truncatedDigestSize));

View File

@ -263,7 +263,7 @@ public:
/*! DEFAULT_PADDING means PKCS_PADDING if c.MandatoryBlockSize() > 1 && c.MinLastBlockSize() == 0 (e.g. ECB or CBC mode),
otherwise NO_PADDING (OFB, CFB, CTR, CBC-CTS modes).
See http://www.weidai.com/scan-mirror/csp.html for details of the padding schemes. */
StreamTransformationFilter(StreamTransformation &c, BufferedTransformation *attachment = NULL, BlockPaddingScheme padding = DEFAULT_PADDING);
StreamTransformationFilter(StreamTransformation &c, BufferedTransformation *attachment = NULL, BlockPaddingScheme padding = DEFAULT_PADDING, bool allowAuthenticatedSymmetricCipher = false);
std::string AlgorithmName() const {return m_cipher.AlgorithmName();}
@ -345,7 +345,7 @@ class CRYPTOPP_DLL AuthenticatedEncryptionFilter : public StreamTransformationFi
{
public:
/*! See StreamTransformationFilter for documentation on BlockPaddingScheme */
AuthenticatedEncryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment = NULL, BlockPaddingScheme padding = DEFAULT_PADDING, bool putMessage=false, int truncatedDigestSize=-1, const std::string &macChannel=NULL_CHANNEL);
AuthenticatedEncryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment = NULL, bool putMessage=false, int truncatedDigestSize=-1, const std::string &macChannel=NULL_CHANNEL, BlockPaddingScheme padding = DEFAULT_PADDING);
void IsolatedInitialize(const NameValuePairs &parameters);
byte * ChannelCreatePutSpace(const std::string &channel, size_t &size);
@ -364,7 +364,7 @@ public:
enum Flags {MAC_AT_BEGIN=1, THROW_EXCEPTION=16, DEFAULT_FLAGS = THROW_EXCEPTION};
/*! See StreamTransformationFilter for documentation on BlockPaddingScheme */
AuthenticatedDecryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment = NULL, BlockPaddingScheme padding = DEFAULT_PADDING, word32 flags = DEFAULT_FLAGS, int truncatedDigestSize=-1);
AuthenticatedDecryptionFilter(AuthenticatedSymmetricCipher &c, BufferedTransformation *attachment = NULL, word32 flags = DEFAULT_FLAGS, int truncatedDigestSize=-1, BlockPaddingScheme padding = DEFAULT_PADDING);
std::string AlgorithmName() const {return m_hashVerifier.AlgorithmName();}
byte * ChannelCreatePutSpace(const std::string &channel, size_t &size);

View File

@ -97,11 +97,11 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
{
s_reductionTable[0] = 0;
word16 x = 0x01c2;
s_reductionTable[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, x);
s_reductionTable[1] = ByteReverse(x);
for (int i=2; i<=0x80; i*=2)
{
x <<= 1;
s_reductionTable[i] = ConditionalByteReverse(BIG_ENDIAN_ORDER, x);
s_reductionTable[i] = ByteReverse(x);
for (int j=1; j<i; j++)
s_reductionTable[i+j] = s_reductionTable[i] ^ s_reductionTable[j];
}
@ -198,7 +198,7 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
{
typedef BlockGetAndPut<word64, NativeByteOrder, false, true> Block;
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
word64 *hashBuffer = (word64 *)HashBuffer();
switch (2*(m_buffer.size()>=64*1024)

View File

@ -5,17 +5,20 @@
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/*
The assembly code was rewritten in Feb 2009 by Wei Dai to do counter mode
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
function was also modified to include a trick similar to one in Brian Gladman's
x86 assembly code, doing an 8-bit register move to minimize the number of
register spills. Also switched to compressed tables and copying round keys to
the stack.
The C++ implementation now uses compressed tables if
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
*/
/*
Defense against timing attacks was added in July 2006 by Wei Dai.
July 2006: Defense against timing attacks was added by Wei Dai.
The code now uses smaller tables in the first and last rounds,
and preloads them into L1 cache before usage (by loading at least
@ -75,12 +78,66 @@ using namespace rdtable;
#else
static word64 Te[256];
#endif
static word32 Td[256*4];
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static bool s_TeFilled = false, s_TdFilled = false;
// ************************* Portable Code ************************************
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
a ^= L(T, 3, byte(t)); t >>= 8;\
b ^= L(T, 2, byte(t)); t >>= 8;\
c ^= L(T, 1, byte(t)); t >>= 8;\
d ^= L(T, 0, t);
#define QUARTER_ROUND_LE(t, a, b, c, d) \
tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[d] = ((byte *)(Te+t))[1];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define QUARTER_ROUND_LD(t, a, b, c, d) \
tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
tempBlock[a] = Sd[byte(t)]; t >>= 8;\
tempBlock[b] = Sd[byte(t)]; t >>= 8;\
tempBlock[c] = Sd[byte(t)]; t >>= 8;\
tempBlock[d] = Sd[t];
#endif
#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
@ -108,7 +165,7 @@ void Rijndael::Base::FillEncTable()
}
#endif
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
Te[256] = Te[257] = 0;
#endif
s_TeFilled = true;
@ -119,7 +176,7 @@ void Rijndael::Base::FillDecTable()
for (int i=0; i<256; i++)
{
byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS_
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
Td[i] = word64(y | fb(x))<<32 | y | x;
#else
@ -202,29 +259,16 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
}
#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
/* apply the inverse MixColumn transform to all round keys but the first and the last: */
for (i = 1; i < m_rounds; i++) {
rk += 4;
rk[0] =
Td[0*256+Se[GETBYTE(rk[0], 3)]] ^
Td[1*256+Se[GETBYTE(rk[0], 2)]] ^
Td[2*256+Se[GETBYTE(rk[0], 1)]] ^
Td[3*256+Se[GETBYTE(rk[0], 0)]];
rk[1] =
Td[0*256+Se[GETBYTE(rk[1], 3)]] ^
Td[1*256+Se[GETBYTE(rk[1], 2)]] ^
Td[2*256+Se[GETBYTE(rk[1], 1)]] ^
Td[3*256+Se[GETBYTE(rk[1], 0)]];
rk[2] =
Td[0*256+Se[GETBYTE(rk[2], 3)]] ^
Td[1*256+Se[GETBYTE(rk[2], 2)]] ^
Td[2*256+Se[GETBYTE(rk[2], 1)]] ^
Td[3*256+Se[GETBYTE(rk[2], 0)]];
rk[3] =
Td[0*256+Se[GETBYTE(rk[3], 3)]] ^
Td[1*256+Se[GETBYTE(rk[3], 2)]] ^
Td[2*256+Se[GETBYTE(rk[3], 1)]] ^
Td[3*256+Se[GETBYTE(rk[3], 0)]];
InverseMixColumn(rk[0]);
InverseMixColumn(rk[1]);
InverseMixColumn(rk[2]);
InverseMixColumn(rk[3]);
}
}
@ -232,6 +276,163 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}
// Encrypt one 16-byte block (portable table-based path), XORing the result
// with xorBlock if Block::Put does so for a non-null xorBlock.  Dispatches
// to the SSE2/x64 assembly implementation when it is compiled in and the
// CPU supports it.  Includes the cache-timing countermeasure described in
// the comments at the top of this file.
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (HasSSE2())
{
// hand this single block to the batched assembly routine
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3;
Block::Get(inBlock)(s0)(s1)(s2)(s3);
// initial AddRoundKey; t0..t3 preload the second round key
const word32 *rk = m_key;
s0 ^= rk[0];
s1 ^= rk[1];
s2 ^= rk[2];
s3 ^= rk[3];
t0 = rk[4];
t1 = rk[5];
t2 = rk[6];
t3 = rk[7];
rk += 8;
// timing attack countermeasure. see comments at top for more details.
// u starts at 0 and only ever has &= applied, so it stays 0 and the ORs
// below do not change the state; the loop's real purpose is to touch
// every cache line of Te so the data-dependent lookups in the rounds
// hit a uniformly warmed cache.  The table span preloaded (2048 vs 1024
// bytes) matches the Te layout selected by the unaligned-access macro.
const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
for (i=0; i<2048; i+=cacheLineSize)
#else
for (i=0; i<1024; i+=cacheLineSize)
#endif
u &= *(const word32 *)(((const byte *)Te)+i);
u &= Te[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// first round, using the TL_F table-load form (see macros above)
QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
// Nr - 2 full rounds, two rounds per loop iteration:
unsigned int r = m_rounds/2 - 1;
do
{
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
QUARTER_ROUND_E(t3, s0, s1, s2, s3)
QUARTER_ROUND_E(t2, s3, s0, s1, s2)
QUARTER_ROUND_E(t1, s2, s3, s0, s1)
QUARTER_ROUND_E(t0, s1, s2, s3, s0)
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
QUARTER_ROUND_E(s3, t0, t1, t2, t3)
QUARTER_ROUND_E(s2, t3, t0, t1, t2)
QUARTER_ROUND_E(s1, t2, t3, t0, t1)
QUARTER_ROUND_E(s0, t1, t2, t3, t0)
rk += 8;
} while (--r);
// final round: substitution bytes are read from within the Te entries
// (QUARTER_ROUND_LE), scattered into tempBlock in output byte order
word32 tbw[4];
byte *const tempBlock = (byte *)tbw;
QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
// final AddRoundKey, combined with the optional XOR mask on output
Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
// Decrypt one 16-byte block (portable table-based path), with the same
// cache-timing countermeasures as the encryption side.  No assembly
// dispatch here — only the decryption tables/macros differ.
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3;
Block::Get(inBlock)(s0)(s1)(s2)(s3);
// initial AddRoundKey; t0..t3 preload the second round key
const word32 *rk = m_key;
s0 ^= rk[0];
s1 ^= rk[1];
s2 ^= rk[2];
s3 ^= rk[3];
t0 = rk[4];
t1 = rk[5];
t2 = rk[6];
t3 = rk[7];
rk += 8;
// timing attack countermeasure. see comments at top for more details.
// u stays 0 (only &= is applied), so the ORs are no-ops; the loop just
// pulls every cache line of Td into cache before the keyed lookups.
const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
for (i=0; i<2048; i+=cacheLineSize)
#else
for (i=0; i<1024; i+=cacheLineSize)
#endif
u &= *(const word32 *)(((const byte *)Td)+i);
u &= Td[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// first round, using the TL_F table-load form (see macros above)
QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
// Nr - 2 full rounds, two rounds per loop iteration:
unsigned int r = m_rounds/2 - 1;
do
{
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
QUARTER_ROUND_D(t3, s2, s1, s0, s3)
QUARTER_ROUND_D(t2, s1, s0, s3, s2)
QUARTER_ROUND_D(t1, s0, s3, s2, s1)
QUARTER_ROUND_D(t0, s3, s2, s1, s0)
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
QUARTER_ROUND_D(s3, t2, t1, t0, t3)
QUARTER_ROUND_D(s2, t1, t0, t3, t2)
QUARTER_ROUND_D(s1, t0, t3, t2, t1)
QUARTER_ROUND_D(s0, t3, t2, t1, t0)
rk += 8;
} while (--r);
#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
// timing attack countermeasure. see comments at top for more details
// If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
// QUARTER_ROUND_LD will use Td, which is already preloaded.
u = 0;
for (i=0; i<256; i+=cacheLineSize)
u &= *(const word32 *)(Sd+i);
u &= *(const word32 *)(Sd+252);
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif
// final round: inverse S-box substitution into tempBlock in output order
word32 tbw[4];
byte *const tempBlock = (byte *)tbw;
QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
// final AddRoundKey, combined with the optional XOR mask on output
Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
// ************************* Assembly Code ************************************
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
@ -750,247 +951,6 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
#endif
// Earlier portable AES encryption of one block.  Reads and writes the
// block through direct word32 pointer casts (assumes suitably aligned,
// native-endian data — TODO confirm against callers) and defines its
// table-lookup macros inline, redefining TL between the first round and
// the main rounds.
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (HasSSE2())
{
// hand this single block to the batched assembly routine
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
word32 s0, s1, s2, s3, t0, t1, t2, t3;
const word32 *rk = m_key;
// initial AddRoundKey; input read via word32 casts
s0 = ((const word32 *)inBlock)[0] ^ rk[0];
s1 = ((const word32 *)inBlock)[1] ^ rk[1];
s2 = ((const word32 *)inBlock)[2] ^ rk[2];
s3 = ((const word32 *)inBlock)[3] ^ rk[3];
t0 = rk[4];
t1 = rk[5];
t2 = rk[6];
t3 = rk[7];
rk += 8;
// timing attack countermeasure. see comments at top for more details
// (u stays 0; the loop only preloads the Te table into cache)
const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
for (i=0; i<2048; i+=cacheLineSize)
#else
for (i=0; i<1024; i+=cacheLineSize)
#endif
u &= *(const word32 *)(((const byte *)Te)+i);
u &= Te[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// table-lookup macros: TL selects the byte-order and aligned/unaligned
// load form; QUARTER_ROUND1 reverses the output order on little-endian
#define QUARTER_ROUND(t, a, b, c, d) \
a ^= TL(3, byte(t)); t >>= 8;\
b ^= TL(2, byte(t)); t >>= 8;\
c ^= TL(1, byte(t)); t >>= 8;\
d ^= TL(0, t);
#ifdef IS_LITTLE_ENDIAN
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (6-i)%4+1))
#else
#define TL(i, x) rotrFixed(Te[x], (3-i)*8)
#endif
#define QUARTER_ROUND1(t, a, b, c, d) QUARTER_ROUND(t, d, c, b, a)
#else
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (4-i)%4))
#else
#define TL(i, x) rotrFixed(Te[x], i*8)
#endif
#define QUARTER_ROUND1 QUARTER_ROUND
#endif
// first round
QUARTER_ROUND1(s3, t0, t1, t2, t3)
QUARTER_ROUND1(s2, t3, t0, t1, t2)
QUARTER_ROUND1(s1, t2, t3, t0, t1)
QUARTER_ROUND1(s0, t1, t2, t3, t0)
// redefine TL for the main rounds where the form differs
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) && defined(IS_LITTLE_ENDIAN)
#undef TL
#define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (i+3)%4+1))
#endif
#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#undef TL
#define TL(i, x) Te[i*256 + x]
#endif
// Nr - 2 full rounds, two per loop iteration:
unsigned int r = m_rounds/2 - 1;
do
{
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
QUARTER_ROUND(t3, s0, s1, s2, s3)
QUARTER_ROUND(t2, s3, s0, s1, s2)
QUARTER_ROUND(t1, s2, s3, s0, s1)
QUARTER_ROUND(t0, s1, s2, s3, s0)
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
QUARTER_ROUND(s3, t0, t1, t2, t3)
QUARTER_ROUND(s2, t3, t0, t1, t2)
QUARTER_ROUND(s1, t2, t3, t0, t1)
QUARTER_ROUND(s0, t1, t2, t3, t0)
#undef QUARTER_ROUND
rk += 8;
} while (--r);
word32 tbw[4];
byte *const tempBlock = (byte *)tbw;
word32 *const obw = (word32 *)outBlock;
const word32 *const xbw = (const word32 *)xorBlock;
// final round: substitution bytes read from within the Te entries
#define QUARTER_ROUND(t, a, b, c, d) \
tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
tempBlock[d] = ((byte *)(Te+t))[1];
QUARTER_ROUND(t2, 15, 2, 5, 8)
QUARTER_ROUND(t1, 11, 14, 1, 4)
QUARTER_ROUND(t0, 7, 10, 13, 0)
QUARTER_ROUND(t3, 3, 6, 9, 12)
#undef QUARTER_ROUND
// final AddRoundKey; xorBlock may be null, hence the branch
if (xbw)
{
obw[0] = tbw[0] ^ xbw[0] ^ rk[0];
obw[1] = tbw[1] ^ xbw[1] ^ rk[1];
obw[2] = tbw[2] ^ xbw[2] ^ rk[2];
obw[3] = tbw[3] ^ xbw[3] ^ rk[3];
}
else
{
obw[0] = tbw[0] ^ rk[0];
obw[1] = tbw[1] ^ rk[1];
obw[2] = tbw[2] ^ rk[2];
obw[3] = tbw[3] ^ rk[3];
}
}
// Earlier portable AES decryption of one block.  Same structure as the
// encryption side: word32 pointer casts for I/O (alignment assumed —
// TODO confirm against callers), inline macro definitions, cache preload
// of Td and then of the inverse S-box Sd before the final round.
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
word32 s0, s1, s2, s3, t0, t1, t2, t3;
const word32 *rk = m_key;
// initial AddRoundKey; input read via word32 casts
s0 = ((const word32 *)inBlock)[0] ^ rk[0];
s1 = ((const word32 *)inBlock)[1] ^ rk[1];
s2 = ((const word32 *)inBlock)[2] ^ rk[2];
s3 = ((const word32 *)inBlock)[3] ^ rk[3];
t0 = rk[4];
t1 = rk[5];
t2 = rk[6];
t3 = rk[7];
rk += 8;
// timing attack countermeasure. see comments at top for more details
// (u stays 0; the loop only preloads the Td table into cache)
const int cacheLineSize = GetCacheLineSize();
unsigned int i;
word32 u = 0;
for (i=0; i<1024; i+=cacheLineSize)
u &= *(const word32 *)(((const byte *)Td)+i);
u &= Td[255];
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
// first round
#ifdef IS_BIG_ENDIAN
#define QUARTER_ROUND(t, a, b, c, d) \
a ^= rotrFixed(Td[byte(t)], 24); t >>= 8;\
b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
d ^= Td[t];
#else
#define QUARTER_ROUND(t, a, b, c, d) \
d ^= Td[byte(t)]; t >>= 8;\
c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
a ^= rotrFixed(Td[t], 24);
#endif
QUARTER_ROUND(s3, t2, t1, t0, t3)
QUARTER_ROUND(s2, t1, t0, t3, t2)
QUARTER_ROUND(s1, t0, t3, t2, t1)
QUARTER_ROUND(s0, t3, t2, t1, t0)
#undef QUARTER_ROUND
// Nr - 2 full rounds, two per loop iteration, using the four 256-entry
// Td sub-tables directly:
unsigned int r = m_rounds/2 - 1;
do
{
#define QUARTER_ROUND(t, a, b, c, d) \
a ^= Td[3*256+byte(t)]; t >>= 8;\
b ^= Td[2*256+byte(t)]; t >>= 8;\
c ^= Td[1*256+byte(t)]; t >>= 8;\
d ^= Td[t];
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
QUARTER_ROUND(t3, s2, s1, s0, s3)
QUARTER_ROUND(t2, s1, s0, s3, s2)
QUARTER_ROUND(t1, s0, s3, s2, s1)
QUARTER_ROUND(t0, s3, s2, s1, s0)
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
QUARTER_ROUND(s3, t2, t1, t0, t3)
QUARTER_ROUND(s2, t1, t0, t3, t2)
QUARTER_ROUND(s1, t0, t3, t2, t1)
QUARTER_ROUND(s0, t3, t2, t1, t0)
#undef QUARTER_ROUND
rk += 8;
} while (--r);
// timing attack countermeasure. see comments at top for more details
// (preload the inverse S-box Sd before the final, byte-wise round)
u = 0;
for (i=0; i<256; i+=cacheLineSize)
u &= *(const word32 *)(Sd+i);
u &= *(const word32 *)(Sd+252);
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
word32 tbw[4];
byte *const tempBlock = (byte *)tbw;
word32 *const obw = (word32 *)outBlock;
const word32 *const xbw = (const word32 *)xorBlock;
// final round: inverse S-box substitution into tempBlock in output order
#define QUARTER_ROUND(t, a, b, c, d) \
tempBlock[a] = Sd[byte(t)]; t >>= 8;\
tempBlock[b] = Sd[byte(t)]; t >>= 8;\
tempBlock[c] = Sd[byte(t)]; t >>= 8;\
tempBlock[d] = Sd[t];
QUARTER_ROUND(t2, 7, 2, 13, 8)
QUARTER_ROUND(t1, 3, 14, 9, 4)
QUARTER_ROUND(t0, 15, 10, 5, 0)
QUARTER_ROUND(t3, 11, 6, 1, 12)
#undef QUARTER_ROUND
// final AddRoundKey; xorBlock may be null, hence the branch
if (xbw)
{
obw[0] = tbw[0] ^ xbw[0] ^ rk[0];
obw[1] = tbw[1] ^ xbw[1] ^ rk[1];
obw[2] = tbw[2] ^ xbw[2] ^ rk[2];
obw[3] = tbw[3] ^ xbw[3] ^ rk[3];
}
else
{
obw[0] = tbw[0] ^ rk[0];
obw[1] = tbw[1] ^ rk[1];
obw[2] = tbw[2] ^ rk[2];
obw[3] = tbw[3] ^ rk[3];
}
}
NAMESPACE_END
#endif

View File

@ -158,6 +158,19 @@ bool TestSettings()
pass = false;
}
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
byte testvals[10] = {1,2,2,3,3,3,3,2,2,1};
if (*(word32 *)(testvals+3) == 0x03030303 && *(word64 *)(testvals+1) == W64LIT(0x0202030303030202))
cout << "passed: Your machine allows unaligned data access.\n";
else
{
cout << "FAILED: Unaligned data access gave incorrect results.\n";
pass = false;
}
#else
cout << "passed: CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is not defined. Will restrict to aligned data access.\n";
#endif
if (sizeof(byte) == 1)
cout << "passed: ";
else