Avoid increment during stores of 6x blocks

This provides another 0.1 cpb with GCC
This commit is contained in:
Jeffrey Walton 2017-09-14 21:06:44 -04:00
parent 25efb7a140
commit adea69ab68
No known key found for this signature in database
GPG Key ID: B36AB348921B1838

View File

@ -367,76 +367,60 @@ size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F6 func6, const word32 *su
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
{ {
uint32x4_t be = vld1q_u32(s_one); uint32x4_t be = vld1q_u32(s_one);
block1 = vaddq_u8(block0, vreinterpretq_u8_u32(be)); block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
block2 = vaddq_u8(block1, vreinterpretq_u8_u32(be)); block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
block3 = vaddq_u8(block2, vreinterpretq_u8_u32(be)); block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
block4 = vaddq_u8(block3, vreinterpretq_u8_u32(be)); block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
block5 = vaddq_u8(block4, vreinterpretq_u8_u32(be)); block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
temp = vaddq_u8(block5, vreinterpretq_u8_u32(be)); temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
vst1q_u8(const_cast<byte*>(inBlocks), temp); vst1q_u8(const_cast<byte*>(inBlocks), temp);
} }
else else
{ {
inBlocks += inIncrement; const int inc = static_cast<int>(inIncrement);
block1 = vld1q_u8(inBlocks); block1 = vld1q_u8(inBlocks+1*inc);
inBlocks += inIncrement; block2 = vld1q_u8(inBlocks+2*inc);
block2 = vld1q_u8(inBlocks); block3 = vld1q_u8(inBlocks+3*inc);
inBlocks += inIncrement; block4 = vld1q_u8(inBlocks+4*inc);
block3 = vld1q_u8(inBlocks); block5 = vld1q_u8(inBlocks+5*inc);
inBlocks += inIncrement; inBlocks += 6*inc;
block4 = vld1q_u8(inBlocks);
inBlocks += inIncrement;
block5 = vld1q_u8(inBlocks);
inBlocks += inIncrement;
} }
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
{ {
block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); const int inc = static_cast<int>(xorIncrement);
xorBlocks += xorIncrement; block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
xorBlocks += xorIncrement; block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
xorBlocks += xorIncrement; block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
xorBlocks += xorIncrement; xorBlocks += 6*inc;
block4 = veorq_u8(block4, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block5 = veorq_u8(block5, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
} }
func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{ {
block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); const int inc = static_cast<int>(xorIncrement);
xorBlocks += xorIncrement; block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
xorBlocks += xorIncrement; block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
xorBlocks += xorIncrement; block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
xorBlocks += xorIncrement; xorBlocks += 6*inc;
block4 = veorq_u8(block4, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block5 = veorq_u8(block5, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
} }
vst1q_u8(outBlocks, block0); const int inc = static_cast<int>(outIncrement);
outBlocks += outIncrement; vst1q_u8(outBlocks+0*inc, block0);
vst1q_u8(outBlocks, block1); vst1q_u8(outBlocks+1*inc, block1);
outBlocks += outIncrement; vst1q_u8(outBlocks+2*inc, block2);
vst1q_u8(outBlocks, block2); vst1q_u8(outBlocks+3*inc, block3);
outBlocks += outIncrement; vst1q_u8(outBlocks+4*inc, block4);
vst1q_u8(outBlocks, block3); vst1q_u8(outBlocks+5*inc, block5);
outBlocks += outIncrement;
vst1q_u8(outBlocks, block4);
outBlocks += outIncrement;
vst1q_u8(outBlocks, block5);
outBlocks += outIncrement;
outBlocks += 6*inc;
length -= 6*blockSize; length -= 6*blockSize;
} }
} }
@ -1171,7 +1155,7 @@ size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F6 func6, const word32 *s
block3 = VectorLoad(3*inc, inBlocks); block3 = VectorLoad(3*inc, inBlocks);
block4 = VectorLoad(4*inc, inBlocks); block4 = VectorLoad(4*inc, inBlocks);
block5 = VectorLoad(5*inc, inBlocks); block5 = VectorLoad(5*inc, inBlocks);
inBlocks += 6*inIncrement; inBlocks += 6*inc;
} }
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)