From e613876e9d1a8548ae4405d79e8ab35903621581 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 25 Jul 2024 20:19:12 -0700 Subject: [PATCH] AVX128: Optimize all cases of vpermq Started by cherry-picking some cases from the variants that appeared when running Steam, games, AV1 convolve tests, openssl, ffmpeg, libjpeg-turbo, openh264, libvpx, gemmlowp, libyuv, and dav1d. Then turned it around and optimized them all. Since all variants end up needing to be split into two halves, that effectively means we need to have 16 implementations, plus a couple of special cases for duplicated results. Fixes #3795 --- .../Core/OpcodeDispatcher/AVX_128.cpp | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 8eb423e9d..6551c5b5a 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -1851,38 +1851,35 @@ void OpDispatchBuilder::AVX128_VPERMQ(OpcodeArgs) { RefPair Result {}; - if (Selector == 0x00 || Selector == 0x55) { - // If we're just broadcasting one element in particular across the vector - // then this can be done fairly simply without any individual inserts. - // Low 128-bit version. - const auto Index = Selector & 0b11; - Result.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Index); - Result.High = Result.Low; - } else if (Selector == 0xAA || Selector == 0xFF) { - // High 128-bit version. - const auto Index = (Selector & 0b11) - 2; - Result.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, Index); - Result.High = Result.Low; - } else { - ///< TODO: This can be more optimized. 
- auto ZeroRegister = LoadZeroVector(OpSize::i128Bit); - Ref Selections[4] = { - Src.Low, - Src.Low, - Src.High, - Src.High, - }; - uint8_t SrcIndex {}; - SrcIndex = (Selector >> (0 * 2)) & 0b11; - Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, SrcIndex & 1, ZeroRegister, Selections[SrcIndex]); - SrcIndex = (Selector >> (1 * 2)) & 0b11; - Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, SrcIndex & 1, Result.Low, Selections[SrcIndex]); + // Crack the operation in to two halves and implement per half + uint8_t SelectorLow = Selector & 0b1111; + uint8_t SelectorHigh = (Selector >> 4) & 0b1111; + auto SelectLane = [this](uint8_t Selector, RefPair Src) -> Ref { + LOGMAN_THROW_AA_FMT(Selector < 16, "Selector too large!"); - SrcIndex = (Selector >> (2 * 2)) & 0b11; - Result.High = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, SrcIndex & 1, ZeroRegister, Selections[SrcIndex]); - SrcIndex = (Selector >> (3 * 2)) & 0b11; - Result.High = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, SrcIndex & 1, Result.High, Selections[SrcIndex]); - } + switch (Selector) { + case 0b00'00: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 0); + case 0b00'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.Low, 8); + case 0b00'10: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low); + case 0b00'11: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.High, 8); + case 0b01'00: return Src.Low; + case 0b01'01: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 1); + case 0b01'10: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.High, Src.Low); + case 0b01'11: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low); + case 0b10'00: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High); + case 0b10'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.Low, 8); + case 0b10'10: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 0); + case 0b10'11: return 
_VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.High, 8); + case 0b11'00: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.Low, Src.High); + case 0b11'01: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High); + case 0b11'10: return Src.High; + case 0b11'11: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 1); + default: FEX_UNREACHABLE; + } + }; + + Result.Low = SelectLane(SelectorLow, Src); + Result.High = SelectorLow == SelectorHigh ? Result.Low : SelectLane(SelectorHigh, Src); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); }