mirror of
https://github.com/FEX-Emu/FEX.git
synced 2024-12-15 01:49:00 +00:00
AVX128: Optimize all cases of vpermq
Started by cherry-picking some cases from the variants that appeared when running Steam, games, AV1 convolve tests, openssl, ffmpeg, libjpeg-turbo, openh264, libvpx, gemmlowp, libyuv, and dav1d. Then turned it around and optimized them all: since every variant ends up needing to be split into two halves, we effectively need 16 implementations (one per 4-bit half-selector), plus a couple of special cases for duplicated results. Fixes #3795
This commit is contained in:
parent
f75bd2f09b
commit
e613876e9d
@ -1851,38 +1851,35 @@ void OpDispatchBuilder::AVX128_VPERMQ(OpcodeArgs) {
|
||||
|
||||
RefPair Result {};
|
||||
|
||||
if (Selector == 0x00 || Selector == 0x55) {
|
||||
// If we're just broadcasting one element in particular across the vector
|
||||
// then this can be done fairly simply without any individual inserts.
|
||||
// Low 128-bit version.
|
||||
const auto Index = Selector & 0b11;
|
||||
Result.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Index);
|
||||
Result.High = Result.Low;
|
||||
} else if (Selector == 0xAA || Selector == 0xFF) {
|
||||
// High 128-bit version.
|
||||
const auto Index = (Selector & 0b11) - 2;
|
||||
Result.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, Index);
|
||||
Result.High = Result.Low;
|
||||
} else {
|
||||
///< TODO: This can be more optimized.
|
||||
auto ZeroRegister = LoadZeroVector(OpSize::i128Bit);
|
||||
Ref Selections[4] = {
|
||||
Src.Low,
|
||||
Src.Low,
|
||||
Src.High,
|
||||
Src.High,
|
||||
};
|
||||
uint8_t SrcIndex {};
|
||||
SrcIndex = (Selector >> (0 * 2)) & 0b11;
|
||||
Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, SrcIndex & 1, ZeroRegister, Selections[SrcIndex]);
|
||||
SrcIndex = (Selector >> (1 * 2)) & 0b11;
|
||||
Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, SrcIndex & 1, Result.Low, Selections[SrcIndex]);
|
||||
// Crack the operation into two 128-bit halves and implement each half separately
|
||||
uint8_t SelectorLow = Selector & 0b1111;
|
||||
uint8_t SelectorHigh = (Selector >> 4) & 0b1111;
|
||||
auto SelectLane = [this](uint8_t Selector, RefPair Src) -> Ref {
|
||||
LOGMAN_THROW_AA_FMT(Selector < 16, "Selector too large!");
|
||||
|
||||
SrcIndex = (Selector >> (2 * 2)) & 0b11;
|
||||
Result.High = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, SrcIndex & 1, ZeroRegister, Selections[SrcIndex]);
|
||||
SrcIndex = (Selector >> (3 * 2)) & 0b11;
|
||||
Result.High = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, SrcIndex & 1, Result.High, Selections[SrcIndex]);
|
||||
}
|
||||
switch (Selector) {
|
||||
case 0b00'00: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 0);
|
||||
case 0b00'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.Low, 8);
|
||||
case 0b00'10: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low);
|
||||
case 0b00'11: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.High, 8);
|
||||
case 0b01'00: return Src.Low;
|
||||
case 0b01'01: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 1);
|
||||
case 0b01'10: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.High, Src.Low);
|
||||
case 0b01'11: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low);
|
||||
case 0b10'00: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High);
|
||||
case 0b10'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.Low, 8);
|
||||
case 0b10'10: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 0);
|
||||
case 0b10'11: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.High, 8);
|
||||
case 0b11'00: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.Low, Src.High);
|
||||
case 0b11'01: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High);
|
||||
case 0b11'10: return Src.High;
|
||||
case 0b11'11: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 1);
|
||||
default: FEX_UNREACHABLE;
|
||||
}
|
||||
};
|
||||
|
||||
Result.Low = SelectLane(SelectorLow, Src);
|
||||
Result.High = SelectorLow == SelectorHigh ? Result.Low : SelectLane(SelectorHigh, Src);
|
||||
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user