Merge pull request #3770 from alyssarosenzweig/opt/vzeroall

Tiny opt for vzeroall
This commit is contained in:
Ryan Houdek 2024-06-27 10:25:35 -07:00 committed by GitHub
commit ad4d4c9e67
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 21 deletions

View File

@ -780,15 +780,15 @@ void OpDispatchBuilder::AVX128_VZERO(OpcodeArgs) {
if (IsVZEROALL) {
// NOTE: Despite the name being VZEROALL, this will still only ever
// zero out up to the first 16 registers (even on AVX-512, where we have 32 registers)
Ref ZeroVector;
for (uint32_t i = 0; i < NumRegs; i++) {
// Explicitly not caching named vector zero. This ensures that every register gets movi #0.0 directly.
Ref ZeroVector = LoadUncachedZeroVector(OpSize::i128Bit);
ZeroVector = LoadUncachedZeroVector(OpSize::i128Bit);
AVX128_StoreXMMRegister(i, ZeroVector, false);
}
// More efficient for non-SRA upper-halves to cache the constant and store directly.
const auto ZeroVector = LoadZeroVector(OpSize::i128Bit);
// More efficient for non-SRA upper-halves to use a cached constant and store directly.
for (uint32_t i = 0; i < NumRegs; i++) {
AVX128_StoreXMMRegister(i, ZeroVector, true);
}

View File

@ -1655,7 +1655,7 @@
]
},
"vzeroall": {
"ExpectedInstructionCount": 33,
"ExpectedInstructionCount": 32,
"Comment": [
"Might be able to use DC ZVA",
"Map 1 0b01 0x77 L=1"
@ -1677,23 +1677,22 @@
"movi v29.2d, #0x0",
"movi v30.2d, #0x0",
"movi v31.2d, #0x0",
"movi v2.2d, #0x0",
"str q2, [x28, #256]",
"str q2, [x28, #240]",
"str q2, [x28, #224]",
"str q2, [x28, #208]",
"str q2, [x28, #192]",
"str q2, [x28, #176]",
"str q2, [x28, #160]",
"str q2, [x28, #144]",
"str q2, [x28, #128]",
"str q2, [x28, #112]",
"str q2, [x28, #96]",
"str q2, [x28, #80]",
"str q2, [x28, #64]",
"str q2, [x28, #48]",
"str q2, [x28, #32]",
"str q2, [x28, #16]"
"str q31, [x28, #256]",
"str q31, [x28, #240]",
"str q31, [x28, #224]",
"str q31, [x28, #208]",
"str q31, [x28, #192]",
"str q31, [x28, #176]",
"str q31, [x28, #160]",
"str q31, [x28, #144]",
"str q31, [x28, #128]",
"str q31, [x28, #112]",
"str q31, [x28, #96]",
"str q31, [x28, #80]",
"str q31, [x28, #64]",
"str q31, [x28, #48]",
"str q31, [x28, #32]",
"str q31, [x28, #16]"
]
},
"vcmpps xmm0, xmm1, xmm2, 0x00": {