mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-02-03 16:03:21 +00:00
Add 256-bit load/store recognition and matching in several places.
llvm-svn: 135171
This commit is contained in:
parent
2dc4b55bd8
commit
6778597deb
@ -301,12 +301,17 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
|
||||
{ X86::MOVAPDrr, X86::MOVAPDmr, 0, 16 },
|
||||
{ X86::MOVAPSrr, X86::MOVAPSmr, 0, 16 },
|
||||
{ X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
|
||||
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, 0, 32 },
|
||||
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, 0, 32 },
|
||||
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, 0, 32 },
|
||||
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
|
||||
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
|
||||
{ X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
|
||||
{ X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
|
||||
{ X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
|
||||
{ X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
|
||||
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, 0, 0 },
|
||||
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, 0, 0 },
|
||||
{ X86::MUL16r, X86::MUL16m, 1, 0 },
|
||||
{ X86::MUL32r, X86::MUL32m, 1, 0 },
|
||||
{ X86::MUL64r, X86::MUL64m, 1, 0 },
|
||||
@ -411,10 +416,13 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
|
||||
{ X86::MOV8rr, X86::MOV8rm, 0 },
|
||||
{ X86::MOVAPDrr, X86::MOVAPDrm, 16 },
|
||||
{ X86::MOVAPSrr, X86::MOVAPSrm, 16 },
|
||||
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, 32 },
|
||||
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, 32 },
|
||||
{ X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
|
||||
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
|
||||
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
|
||||
{ X86::MOVDQArr, X86::MOVDQArm, 16 },
|
||||
{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, 16 },
|
||||
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
|
||||
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
|
||||
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
|
||||
@ -425,6 +433,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
|
||||
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
|
||||
{ X86::MOVUPDrr, X86::MOVUPDrm, 16 },
|
||||
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
|
||||
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
|
||||
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
|
||||
{ X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
|
||||
{ X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
|
||||
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
|
||||
@ -787,6 +797,9 @@ static bool isFrameLoadOpcode(int Opcode) {
|
||||
case X86::MOVAPSrm:
|
||||
case X86::MOVAPDrm:
|
||||
case X86::MOVDQArm:
|
||||
case X86::VMOVAPSYrm:
|
||||
case X86::VMOVAPDYrm:
|
||||
case X86::VMOVDQAYrm:
|
||||
case X86::MMX_MOVD64rm:
|
||||
case X86::MMX_MOVQ64rm:
|
||||
return true;
|
||||
@ -808,6 +821,9 @@ static bool isFrameStoreOpcode(int Opcode) {
|
||||
case X86::MOVAPSmr:
|
||||
case X86::MOVAPDmr:
|
||||
case X86::MOVDQAmr:
|
||||
case X86::VMOVAPSYmr:
|
||||
case X86::VMOVAPDYmr:
|
||||
case X86::VMOVDQAYmr:
|
||||
case X86::MMX_MOVD64mr:
|
||||
case X86::MMX_MOVQ64mr:
|
||||
case X86::MMX_MOVNTQmr:
|
||||
@ -926,6 +942,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
|
||||
case X86::MOVUPSrm:
|
||||
case X86::MOVAPDrm:
|
||||
case X86::MOVDQArm:
|
||||
case X86::VMOVAPSYrm:
|
||||
case X86::VMOVUPSYrm:
|
||||
case X86::VMOVAPDYrm:
|
||||
case X86::VMOVDQAYrm:
|
||||
case X86::MMX_MOVD64rm:
|
||||
case X86::MMX_MOVQ64rm:
|
||||
case X86::FsMOVAPSrm:
|
||||
@ -1975,6 +1995,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
||||
Opc = X86::MOV8rr;
|
||||
} else if (X86::VR128RegClass.contains(DestReg, SrcReg))
|
||||
Opc = X86::MOVAPSrr;
|
||||
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
|
||||
Opc = X86::VMOVAPSYrr;
|
||||
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
|
||||
Opc = X86::MMX_MOVQ64rr;
|
||||
else
|
||||
@ -2064,6 +2086,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
|
||||
return load ? X86::MOVAPSrm : X86::MOVAPSmr;
|
||||
else
|
||||
return load ? X86::MOVUPSrm : X86::MOVUPSmr;
|
||||
case 32:
|
||||
assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
|
||||
// If stack is realigned we can use aligned stores.
|
||||
if (isStackAligned)
|
||||
return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
|
||||
else
|
||||
return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2853,6 +2882,11 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
|
||||
case X86::MOVAPDrm:
|
||||
case X86::MOVDQArm:
|
||||
case X86::MOVDQUrm:
|
||||
case X86::VMOVAPSYrm:
|
||||
case X86::VMOVUPSYrm:
|
||||
case X86::VMOVAPDYrm:
|
||||
case X86::VMOVDQAYrm:
|
||||
case X86::VMOVDQUYrm:
|
||||
break;
|
||||
}
|
||||
switch (Opc2) {
|
||||
@ -2875,6 +2909,11 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
|
||||
case X86::MOVAPDrm:
|
||||
case X86::MOVDQArm:
|
||||
case X86::MOVDQUrm:
|
||||
case X86::VMOVAPSYrm:
|
||||
case X86::VMOVUPSYrm:
|
||||
case X86::VMOVAPDYrm:
|
||||
case X86::VMOVDQAYrm:
|
||||
case X86::VMOVDQUYrm:
|
||||
break;
|
||||
}
|
||||
|
||||
@ -3053,6 +3092,13 @@ static const unsigned ReplaceableInstrs[][3] = {
|
||||
{ X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI },
|
||||
{ X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
|
||||
{ X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
|
||||
// AVX 256-bit support
|
||||
{ X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
|
||||
{ X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
|
||||
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
|
||||
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
|
||||
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
|
||||
{ X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
|
||||
};
|
||||
|
||||
// FIXME: Some shuffle and unpack instructions have equivalents in different
|
||||
|
@ -2022,7 +2022,10 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
|
||||
}
|
||||
|
||||
// The same as done above but for AVX. The 128-bit versions are the
|
||||
// same, but re-encoded. The 256-bit does not support PI version.
|
||||
// same, but re-encoded. The 256-bit does not support PI version, and
|
||||
// doesn't need it because on sandy bridge the register is set to zero
|
||||
// at the rename stage without using any execution unit, so SET0PSY
|
||||
// and SET0PDY can be used for vector int instructions without penalty.
|
||||
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
|
||||
// JIT implementation, it does not expand the instructions below like
|
||||
// X86MCInstLower does.
|
||||
@ -2037,8 +2040,8 @@ def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
|
||||
def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
|
||||
[(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
|
||||
let ExeDomain = SSEPackedInt in
|
||||
def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
|
||||
[(set VR128:$dst, (v4i32 immAllZerosV))]>;
|
||||
def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
|
||||
[(set VR128:$dst, (v4i32 immAllZerosV))]>;
|
||||
}
|
||||
|
||||
def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
|
||||
@ -3831,6 +3834,8 @@ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
|
||||
(CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
|
||||
|
||||
// Use movaps / movups for SSE integer load / store (one byte shorter).
|
||||
// The instructions selected below are then converted to MOVDQA/MOVDQU
|
||||
// during the SSE domain pass.
|
||||
let Predicates = [HasSSE1] in {
|
||||
def : Pat<(alignedloadv4i32 addr:$src),
|
||||
(MOVAPSrm addr:$src)>;
|
||||
@ -3859,8 +3864,9 @@ let Predicates = [HasSSE1] in {
|
||||
(MOVUPSmr addr:$dst, VR128:$src)>;
|
||||
}
|
||||
|
||||
// Use vmovaps/vmovups for AVX 128-bit integer load/store (one byte shorter).
|
||||
// Use vmovaps/vmovups for AVX integer load/store.
|
||||
let Predicates = [HasAVX] in {
|
||||
// 128-bit load/store
|
||||
def : Pat<(alignedloadv4i32 addr:$src),
|
||||
(VMOVAPSrm addr:$src)>;
|
||||
def : Pat<(loadv4i32 addr:$src),
|
||||
@ -3886,6 +3892,24 @@ let Predicates = [HasAVX] in {
|
||||
(VMOVUPSmr addr:$dst, VR128:$src)>;
|
||||
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
|
||||
(VMOVUPSmr addr:$dst, VR128:$src)>;
|
||||
|
||||
// 256-bit load/store
|
||||
def : Pat<(alignedloadv4i64 addr:$src),
|
||||
(VMOVAPSYrm addr:$src)>;
|
||||
def : Pat<(loadv4i64 addr:$src),
|
||||
(VMOVUPSYrm addr:$src)>;
|
||||
def : Pat<(alignedloadv8i32 addr:$src),
|
||||
(VMOVAPSYrm addr:$src)>;
|
||||
def : Pat<(loadv8i32 addr:$src),
|
||||
(VMOVUPSYrm addr:$src)>;
|
||||
def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
|
||||
(VMOVAPSYmr addr:$dst, VR256:$src)>;
|
||||
def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
|
||||
(VMOVAPSYmr addr:$dst, VR256:$src)>;
|
||||
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
|
||||
(VMOVUPSYmr addr:$dst, VR256:$src)>;
|
||||
def : Pat<(store (v8i32 VR256:$src), addr:$dst),
|
||||
(VMOVUPSYmr addr:$dst, VR256:$src)>;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
24
llvm/test/CodeGen/X86/avx-load-store.ll
Normal file
24
llvm/test/CodeGen/X86/avx-load-store.ll
Normal file
@ -0,0 +1,24 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
|
||||
|
||||
; CHECK: vmovaps
|
||||
; CHECK: vmovaps
|
||||
; CHECK: vmovapd
|
||||
; CHECK: vmovapd
|
||||
; CHECK: vmovaps
|
||||
; CHECK: vmovaps
|
||||
define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp {
|
||||
entry:
|
||||
%0 = bitcast double* %d to <4 x double>*
|
||||
%tmp1.i = load <4 x double>* %0, align 32
|
||||
%1 = bitcast float* %f to <8 x float>*
|
||||
%tmp1.i17 = load <8 x float>* %1, align 32
|
||||
%tmp1.i16 = load <4 x i64>* %i, align 32
|
||||
tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind
|
||||
store <4 x double> %tmp1.i, <4 x double>* %0, align 32
|
||||
store <8 x float> %tmp1.i17, <8 x float>* %1, align 32
|
||||
store <4 x i64> %tmp1.i16, <4 x i64>* %i, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
|
||||
|
Loading…
x
Reference in New Issue
Block a user