// chacha_avx.cpp - written and placed in the public domain by
//                  Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AVX2 instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate
// instruction sets in some build configurations.
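//
// For example, GCC and Clang builds typically compile this translation
// unit with -mavx2 (or an equivalent option); see the project makefiles
// for the exact flags used.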
//
// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.

#include "pch.h"
#include "config.h"

#include "chacha.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;

// Sun Studio 12.4 is OK; 12.5 and 12.6 produce a compile error.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

// VS2017 and global optimization bug. TODO: figure out when
// we can re-enable full optimizations for VS2017. Also see
// https://github.com/weidai11/cryptopp/issues/649 and
// https://github.com/weidai11/cryptopp/issues/735. Issue 649
// affects AES, but the same problem appears here. Issue 735 is
// the ChaCha AVX2 cut-in, where the bug surfaced again.
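// The pragmas below first turn off all optimizations for this file and
// then re-enable the speed and size optimizations, which leaves MSVC's
// global optimization passes disabled.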
#if (_MSC_VER >= 1910)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but Clang issues a warning based on the type
// and not the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN

#if (CRYPTOPP_AVX2_AVAILABLE)

template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}
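
// Rotations by 8 and 16 bits are specialized below. A rotate by a multiple
// of eight bits is a byte permutation, so a single _mm256_shuffle_epi8
// replaces the shift/shift/or sequence of the generic template.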
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
                                         14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}

template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
                                         13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}

#endif // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_AVX2_AVAILABLE)
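
// ChaCha_OperateKeystream_AVX2 produces eight 64-byte ChaCha blocks
// (512 bytes) of keystream per call. When 'input' is non-NULL the
// keystream is XORed into it; otherwise the raw keystream is written
// to 'output'. A minimal usage sketch, assuming the caller advances
// the block counter in 'state' by 8 blocks afterward:
//
//   word32 state[16];        // constants, key, counter, nonce
//   byte keystream[8*64];
//   ChaCha_OperateKeystream_AVX2(state, NULLPTR, keystream, 20);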

void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m256i state0 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
    const __m256i state1 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
    const __m256i state2 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
    const __m256i state3 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
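
    // Each 128-bit lane of the counter constants below carries its own
    // block counter offset. _mm256_set_epi32 packs its arguments from the
    // most significant element down, so the high lanes of X0..X3 receive
    // offsets 0..3 and the low lanes receive offsets 4..7, giving eight
    // blocks in total.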
    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);

    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi64(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi64(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi64(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi64(state3, CTR3);

    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
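        // Each iteration performs two ChaCha rounds: a column round on the
        // current word layout, then a diagonal round after the rows are
        // rotated by the _mm256_shuffle_epi32 calls below.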
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);
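
        // Rotate the B, C and D rows so the next quarter-round works on
        // the diagonals of the ChaCha state.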
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);
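
        // Undo the row rotations so the state words return to their column
        // positions for the next iteration.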
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }
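
    // Add the original input state (plus the per-lane counter offsets)
    // back into the working state to form the keystream blocks.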
    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi64(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi64(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi64(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi64(X3_3, CTR3);
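
    // Write the blocks out in counter order. _mm256_permute2x128_si256 with
    // control 1 + (3 << 4) gathers the upper 128-bit lanes of its arguments
    // (counter offsets 0..3), while 0 + (2 << 4) gathers the lower lanes
    // (counter offsets 4..7).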
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+0*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+1*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+2*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+3*32))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+4*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+5*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+6*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+7*32))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+8*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+9*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+10*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+11*32))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+12*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+13*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+14*32))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input+15*32))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }
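
    // Clear the upper halves of the YMM registers to avoid AVX/SSE
    // transition penalties.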
    // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    _mm256_zeroupper();
}

#endif // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END