mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-01-05 23:52:45 +00:00
X86DAGToDAGISel::matchBitExtract() with truncation (PR36419)
Summary:
Previously, in D54095, I added support for extracting the `lshr` from `X` when we are going to produce `BEXTR`.
That was good, but the fix was partial, there was still [[ https://bugs.llvm.org/show_bug.cgi?id=36419 | PR36419 ]].
That pattern can also appear, roughly, when you have a large (64-bit) storage and then consume bits from it.
It would not be unexpected to then do the further computations in 32-bit width.
And then the current code breaks, as the tests show.
The basic idea/pattern here is the following:
1. We have `i64` input
2. We perform `i64` right-shift on it.
3. We `trunc`ate that shifted value
4. We do all further work (masking) in `i32`
Since we see `trunc`ation and not `lshr`, we give up, and stop trying to extract that right-shift.
BUT. The mask is `i32`, therefore we can extend both of the operands of the masking (`and`) to `i64`
and truncate the result after masking: https://rise4fun.com/Alive/K4B
```
Name: @bextr64_32_b1 -> @bextr64_32_b0
%shiftedval = lshr i64 %val, %numskipbits
%truncshiftedval = trunc i64 %shiftedval to i32
%widenumlowbits1 = zext i8 %numlowbits to i32
%notmask1 = shl nsw i32 -1, %widenumlowbits1
%mask1 = xor i32 %notmask1, -1
%res = and i32 %truncshiftedval, %mask1
=>
%shiftedval = lshr i64 %val, %numskipbits
%widenumlowbits = zext i8 %numlowbits to i64
%notmask = shl nsw i64 -1, %widenumlowbits
%mask = xor i64 %notmask, -1
%wideres = and i64 %shiftedval, %mask
%res = trunc i64 %wideres to i32
```
Thus, we are again able to extract that `lshr` into `BEXTR`'s control.
Now, the perf (via `llvm-exegesis`) of the snippet suggests that it is not a good idea:
```
$ cat /tmp/old.s
# bextr64_32_b1
# LLVM-EXEGESIS-LIVEIN RSI
# LLVM-EXEGESIS-LIVEIN EDX
# LLVM-EXEGESIS-LIVEIN RDI
movq %rsi, %rcx
shrq %cl, %rdi
shll $8, %edx
bextrl %edx, %edi, %eax
$ cat /tmp/old.s | ./bin/llvm-exegesis -mode=latency -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-1e0082.o
---
mode: latency
key:
instructions:
- 'MOV64rr RCX RSI'
- 'SHR64rCL RDI RDI'
- 'SHL32ri EDX EDX i_0x8'
- 'BEXTR32rr EAX EDI EDX'
config: ''
register_initial_values: []
cpu_name: bdver2
llvm_triple: x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
- { key: latency, value: 0.6638, per_snippet_value: 2.6552 }
error: ''
info: ''
assembled_snippet: 4889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C7C3
...
$ cat /tmp/old.s | ./bin/llvm-exegesis -mode=uops -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-43e346.o
---
mode: uops
key:
instructions:
- 'MOV64rr RCX RSI'
- 'SHR64rCL RDI RDI'
- 'SHL32ri EDX EDX i_0x8'
- 'BEXTR32rr EAX EDI EDX'
config: ''
register_initial_values: []
cpu_name: bdver2
llvm_triple: x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
- { key: PdFPU0, value: 0, per_snippet_value: 0 }
- { key: PdFPU1, value: 0, per_snippet_value: 0 }
- { key: PdFPU2, value: 0, per_snippet_value: 0 }
- { key: PdFPU3, value: 0, per_snippet_value: 0 }
- { key: NumMicroOps, value: 1.2571, per_snippet_value: 5.0284 }
error: ''
info: ''
assembled_snippet: 4889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C7C3
...
```
vs
```
$ cat /tmp/new.s
# bextr64_32_b1
# LLVM-EXEGESIS-LIVEIN RDX
# LLVM-EXEGESIS-LIVEIN SIL
# LLVM-EXEGESIS-LIVEIN RDI
shlq $8, %rdx
movzbl %sil, %eax
orq %rdx, %rax
bextrq %rax, %rdi, %rax
$ cat /tmp/new.s | ./bin/llvm-exegesis -mode=latency -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-8944f1.o
---
mode: latency
key:
instructions:
- 'SHL64ri RDX RDX i_0x8'
- 'MOVZX32rr8 EAX SIL'
- 'OR64rr RAX RAX RDX'
- 'BEXTR64rr RAX RDI RAX'
config: ''
register_initial_values: []
cpu_name: bdver2
llvm_triple: x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
- { key: latency, value: 0.7454, per_snippet_value: 2.9816 }
error: ''
info: ''
assembled_snippet: 48C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C7C3
...
$ cat /tmp/new.s | ./bin/llvm-exegesis -mode=uops -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-da403c.o
---
mode: uops
key:
instructions:
- 'SHL64ri RDX RDX i_0x8'
- 'MOVZX32rr8 EAX SIL'
- 'OR64rr RAX RAX RDX'
- 'BEXTR64rr RAX RDI RAX'
config: ''
register_initial_values: []
cpu_name: bdver2
llvm_triple: x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
- { key: PdFPU0, value: 0, per_snippet_value: 0 }
- { key: PdFPU1, value: 0, per_snippet_value: 0 }
- { key: PdFPU2, value: 0, per_snippet_value: 0 }
- { key: PdFPU3, value: 0, per_snippet_value: 0 }
- { key: NumMicroOps, value: 1.2571, per_snippet_value: 5.0284 }
error: ''
info: ''
assembled_snippet: 48C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C7C3
...
```
^ latency increased (worse).
Except //maybe// not really.
Like with all synthetic benchmarks, they //may// be misleading.
Let's take a look at an actual real-world hot path.
In this case it's 'my' [[ https://github.com/darktable-org/rawspeed | RawSpeed ]]'s `BitStream<>::peekBitsNoFill()`, in the [[ https://github.com/darktable-org/rawspeed/blob/e3316dc851/src/librawspeed/decompressors/VC5Decompressor.cpp#L814 | GoPro VC5 decompressor ]]:
```
raw.pixls.us-unique/GoPro/HERO6 Black$ /usr/src/googlebenchmark/tools/compare.py -a benchmarks ~/rawspeed/build-clangs1-{old,new}/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR
RUNNING: /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR --benchmark_display_aggregates_only=true --benchmark_out=/tmp/tmplwbKEM
2018-12-22 21:23:03
Running /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench
Run on (8 X 4012.81 MHz CPU s)
CPU Caches:
L1 Data 16K (x8)
L1 Instruction 64K (x4)
L2 Unified 2048K (x4)
L3 Unified 8192K (x1)
Load Average: 3.41, 2.41, 2.03
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations CPUTime,s CPUTime/WallTime Pixels Pixels/CPUTime Pixels/WallTime Raws/CPUTime Raws/WallTime WallTime,s
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_mean 40 ms 40 ms 128 0.322244 7.96974 12M 37.4457M 298.534M 3.12047 24.8778 0.040465
GOPR9172.GPR/threads:8/real_time_median 39 ms 39 ms 128 0.312606 7.99155 12M 38.387M 306.788M 3.19891 25.5656 0.039115
GOPR9172.GPR/threads:8/real_time_stddev 4 ms 3 ms 128 0.0271557 0.130575 0 2.4941M 21.3909M 0.207842 1.78257 3.81081m
RUNNING: /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR --benchmark_display_aggregates_only=true --benchmark_out=/tmp/tmpWAkan9
2018-12-22 21:23:08
Running /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench
Run on (8 X 4013.1 MHz CPU s)
CPU Caches:
L1 Data 16K (x8)
L1 Instruction 64K (x4)
L2 Unified 2048K (x4)
L3 Unified 8192K (x1)
Load Average: 3.78, 2.50, 2.06
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations CPUTime,s CPUTime/WallTime Pixels Pixels/CPUTime Pixels/WallTime Raws/CPUTime Raws/WallTime WallTime,s
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_mean 39 ms 39 ms 128 0.311533 7.97323 12M 38.6828M 308.471M 3.22356 25.706 0.0390928
GOPR9172.GPR/threads:8/real_time_median 38 ms 38 ms 128 0.304231 7.99005 12M 39.4437M 315.527M 3.28698 26.294 0.0380316
GOPR9172.GPR/threads:8/real_time_stddev 3 ms 3 ms 128 0.0229149 0.133814 0 2.26225M 19.1421M 0.188521 1.59517 3.13671m
Comparing /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench to /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench
Benchmark Time CPU Time Old Time New CPU Old CPU New
--------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 128 vs 128
GOPR9172.GPR/threads:8/real_time_mean -0.0339 -0.0316 40 39 40 39
GOPR9172.GPR/threads:8/real_time_median -0.0277 -0.0274 39 38 39 38
GOPR9172.GPR/threads:8/real_time_stddev -0.1769 -0.1267 4 3 3 3
```
I.e. this results in a //roughly// 3% perf improvement (about 3% lower run time).
While this will help [[ https://bugs.llvm.org/show_bug.cgi?id=36419 | PR36419 ]], it won't address it fully.
Reviewers: RKSimon, craig.topper, andreadb, spatel
Reviewed By: craig.topper
Subscribers: courbet, llvm-commits
Differential Revision: https://reviews.llvm.org/D56052
llvm-svn: 351253
This commit is contained in:
parent
f6627ce834
commit
fb4eed381d
@ -2946,25 +2946,37 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
|
||||
|
||||
SDLoc DL(Node);
|
||||
|
||||
// If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically*
|
||||
// shifted (potentially with one-use trunc inbetween),
|
||||
// and if so look past one-use truncation.
|
||||
MVT XVT = NVT;
|
||||
if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
|
||||
X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) {
|
||||
assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
|
||||
X = X.getOperand(0);
|
||||
XVT = X.getSimpleValueType();
|
||||
assert(XVT == MVT::i64 && "Expected truncation from i64");
|
||||
}
|
||||
|
||||
SDValue OrigNBits = NBits;
|
||||
if (NBits.getValueType() != NVT) {
|
||||
if (NBits.getValueType() != XVT) {
|
||||
// Truncate the shift amount.
|
||||
NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
|
||||
insertDAGNode(*CurDAG, OrigNBits, NBits);
|
||||
|
||||
// Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit)
|
||||
// Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit)
|
||||
// register. All the other bits are undefined, we do not care about them.
|
||||
SDValue ImplDef =
|
||||
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
|
||||
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
|
||||
insertDAGNode(*CurDAG, OrigNBits, ImplDef);
|
||||
NBits =
|
||||
CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
|
||||
CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits);
|
||||
insertDAGNode(*CurDAG, OrigNBits, NBits);
|
||||
}
|
||||
|
||||
if (Subtarget->hasBMI2()) {
|
||||
// Great, just emit the the BZHI..
|
||||
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
|
||||
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
|
||||
ReplaceNode(Node, Extract.getNode());
|
||||
SelectCode(Extract.getNode());
|
||||
return true;
|
||||
@ -2979,7 +2991,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
|
||||
// Shift NBits left by 8 bits, thus producing 'control'.
|
||||
// This makes the low 8 bits to be zero.
|
||||
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
|
||||
SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8);
|
||||
SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8);
|
||||
insertDAGNode(*CurDAG, OrigNBits, Control);
|
||||
|
||||
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
|
||||
@ -2992,16 +3004,23 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
|
||||
|
||||
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
|
||||
SDValue OrigShiftAmt = ShiftAmt;
|
||||
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, NVT, ShiftAmt);
|
||||
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt);
|
||||
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
|
||||
|
||||
// And now 'or' these low 8 bits of shift amount into the 'control'.
|
||||
Control = CurDAG->getNode(ISD::OR, DL, NVT, Control, ShiftAmt);
|
||||
Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt);
|
||||
insertDAGNode(*CurDAG, OrigNBits, Control);
|
||||
}
|
||||
|
||||
// And finally, form the BEXTR itself.
|
||||
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control);
|
||||
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
|
||||
|
||||
// The 'X' was originally truncated. Do that now.
|
||||
if (XVT != NVT) {
|
||||
insertDAGNode(*CurDAG, OrigNBits, Extract);
|
||||
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
|
||||
}
|
||||
|
||||
ReplaceNode(Node, Extract.getNode());
|
||||
SelectCode(Extract.getNode());
|
||||
|
||||
|
@ -1828,11 +1828,12 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_a1:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_a1:
|
||||
@ -2074,11 +2075,12 @@ define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_a2:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_a2:
|
||||
@ -3660,11 +3662,12 @@ define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_b1:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_b1:
|
||||
@ -3762,11 +3765,12 @@ define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_b2:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_b2:
|
||||
@ -6131,11 +6135,12 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_c1:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_c1:
|
||||
@ -6230,11 +6235,12 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_c2:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_c2:
|
||||
@ -7756,11 +7762,12 @@ define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
|
||||
;
|
||||
; X64-BMI1NOTBM-LABEL: bextr64_32_d1:
|
||||
; X64-BMI1NOTBM: # %bb.0:
|
||||
; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi
|
||||
; X64-BMI1NOTBM-NEXT: shll $8, %edx
|
||||
; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
|
||||
; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
|
||||
; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
|
||||
; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
|
||||
; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
|
||||
; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
|
||||
; X64-BMI1NOTBM-NEXT: retq
|
||||
;
|
||||
; X64-BMI1BMI2-LABEL: bextr64_32_d1:
|
||||
|
Loading…
Reference in New Issue
Block a user