mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-27 06:35:30 +00:00
R600: Fix inconsistency in rsq instructions.
R600 was using a clamped version of rsq, but SI was not. Add a new rsq_clamped intrinsic and use the rsq intrinsics consistently across both targets. It's unclear to me from the documentation what behavior the R600 instructions have, so I assume they have the legacy behavior described by the SI documents. For R600, use RECIPSQRT_IEEE for both llvm.AMDGPU.legacy.rsq and llvm.AMDGPU.rsq. R600 also has RECIPSQRT_FF, but I'm not sure how it fits in here. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211637 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0029534141
commit
95eb45c5d9
@ -66,4 +66,7 @@ def int_AMDGPU_rcp : GCCBuiltin<"__builtin_amdgpu_rcp">,
|
||||
def int_AMDGPU_rsq : GCCBuiltin<"__builtin_amdgpu_rsq">,
|
||||
Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
|
||||
def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
|
||||
Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
|
||||
} // End TargetPrefix = "AMDGPU"
|
||||
|
@ -805,6 +805,12 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
case Intrinsic::AMDGPU_rsq:
|
||||
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
||||
|
||||
case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
|
||||
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
|
||||
|
||||
case Intrinsic::AMDGPU_rsq_clamped:
|
||||
return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
|
||||
|
||||
case AMDGPUIntrinsic::AMDGPU_imax:
|
||||
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
@ -2052,6 +2058,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(TRIG_PREOP)
|
||||
NODE_NAME_CASE(RCP)
|
||||
NODE_NAME_CASE(RSQ)
|
||||
NODE_NAME_CASE(RSQ_LEGACY)
|
||||
NODE_NAME_CASE(RSQ_CLAMPED)
|
||||
NODE_NAME_CASE(DOT4)
|
||||
NODE_NAME_CASE(BFE_U32)
|
||||
NODE_NAME_CASE(BFE_I32)
|
||||
|
@ -192,6 +192,8 @@ enum {
|
||||
// For f64, max error 2^29 ULP, handles denormals.
|
||||
RCP,
|
||||
RSQ,
|
||||
RSQ_LEGACY,
|
||||
RSQ_CLAMPED,
|
||||
DOT4,
|
||||
BFE_U32, // Extract range of bits with zero extension to 32-bits.
|
||||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
|
@ -43,6 +43,12 @@ def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
|
||||
// out = 1.0 / sqrt(a)
|
||||
def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
|
||||
|
||||
// out = 1.0 / sqrt(a), legacy variant (selected to V_RSQ_LEGACY_F32 on SI and RECIPSQRT_IEEE on R600)
|
||||
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
|
||||
|
||||
// out = 1.0 / sqrt(a), with the result clamped to +/- max_float.
|
||||
def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
|
||||
|
||||
// out = max(a, b) a and b are floats
|
||||
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
|
||||
[SDNPCommutative, SDNPAssociative]
|
||||
|
@ -524,10 +524,17 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
|
||||
(RcpInst $src)
|
||||
>;
|
||||
|
||||
class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
|
||||
(AMDGPUrcp (fsqrt vt:$src)),
|
||||
(RsqInst $src)
|
||||
>;
|
||||
multiclass RsqPat<Instruction RsqInst, ValueType vt> {
|
||||
def : Pat <
|
||||
(fdiv FP_ONE, (fsqrt vt:$src)),
|
||||
(RsqInst $src)
|
||||
>;
|
||||
|
||||
def : Pat <
|
||||
(AMDGPUrcp (fsqrt vt:$src)),
|
||||
(RsqInst $src)
|
||||
>;
|
||||
}
|
||||
|
||||
include "R600Instructions.td"
|
||||
include "R700Instructions.td"
|
||||
|
@ -24,6 +24,14 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
||||
def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
||||
def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
||||
|
||||
// This is named backwards (instead of rsq_legacy) so we don't have
|
||||
// to define it with the public builtin intrinsics. This is a
|
||||
// workaround for how intrinsic names are parsed. If the name is
|
||||
// llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
|
||||
// llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangles the name.
|
||||
def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
|
||||
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
|
||||
|
@ -814,6 +814,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
|
||||
case Intrinsic::r600_read_tidig_z:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
||||
AMDGPU::T0_Z, VT);
|
||||
case Intrinsic::AMDGPU_rsq:
|
||||
// XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
|
||||
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
|
||||
}
|
||||
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
|
||||
break;
|
||||
|
@ -1079,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
|
||||
let Itinerary = TransALU;
|
||||
}
|
||||
|
||||
// Clamped to maximum.
|
||||
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
|
||||
inst, "RECIPSQRT_CLAMPED", AMDGPUrsq
|
||||
inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
|
||||
> {
|
||||
let Itinerary = TransALU;
|
||||
}
|
||||
|
||||
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
|
||||
inst, "RECIPSQRT_IEEE", []
|
||||
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
|
||||
inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
|
||||
> {
|
||||
let Itinerary = TransALU;
|
||||
}
|
||||
|
||||
// TODO: There is also RECIPSQRT_FF which clamps to zero.
|
||||
|
||||
class SIN_Common <bits<11> inst> : R600_1OP <
|
||||
inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
|
||||
let Trig = 1;
|
||||
|
@ -1123,22 +1123,26 @@ defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
|
||||
[(set f32:$dst, (AMDGPUrcp f32:$src0))]
|
||||
>;
|
||||
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
|
||||
defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
|
||||
defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
|
||||
[(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
|
||||
>;
|
||||
defm V_RSQ_LEGACY_F32 : VOP1_32 <
|
||||
0x0000002d, "V_RSQ_LEGACY_F32",
|
||||
[(set f32:$dst, (AMDGPUrsq f32:$src0))]
|
||||
[(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
|
||||
>;
|
||||
defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
|
||||
[(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))]
|
||||
[(set f32:$dst, (AMDGPUrsq f32:$src0))]
|
||||
>;
|
||||
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
|
||||
[(set f64:$dst, (AMDGPUrcp f64:$src0))]
|
||||
>;
|
||||
defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
|
||||
defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
|
||||
[(set f64:$dst, (fdiv FP_ONE, (fsqrt f64:$src0)))]
|
||||
[(set f64:$dst, (AMDGPUrsq f64:$src0))]
|
||||
>;
|
||||
defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
|
||||
[(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
|
||||
>;
|
||||
defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
|
||||
defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
|
||||
[(set f32:$dst, (fsqrt f32:$src0))]
|
||||
>;
|
||||
@ -1781,8 +1785,8 @@ def : Pat <
|
||||
|
||||
def : RcpPat<V_RCP_F32_e32, f32>;
|
||||
def : RcpPat<V_RCP_F64_e32, f64>;
|
||||
def : RsqPat<V_RSQ_F32_e32, f32>;
|
||||
def : RsqPat<V_RSQ_F64_e32, f64>;
|
||||
defm : RsqPat<V_RSQ_F32_e32, f32>;
|
||||
defm : RsqPat<V_RSQ_F64_e32, f64>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOP2 Patterns
|
||||
|
13
test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
Normal file
13
test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
Normal file
@ -0,0 +1,13 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @rsq_legacy_f32
|
||||
; SI: V_RSQ_LEGACY_F32_e32
|
||||
; EG: RECIPSQRT_IEEE
|
||||
define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
|
||||
store float %rsq, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
11
test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
Normal file
11
test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
Normal file
@ -0,0 +1,11 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @rsq_clamped_f64
|
||||
; SI: V_RSQ_CLAMP_F64_e32
|
||||
define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
|
||||
%rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
|
||||
store double %rsq_clamped, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
14
test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
Normal file
14
test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
Normal file
@ -0,0 +1,14 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
|
||||
declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @rsq_clamped_f32
|
||||
; SI: V_RSQ_CLAMP_F32_e32
|
||||
; EG: RECIPSQRT_CLAMPED
|
||||
define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
|
||||
store float %rsq_clamped, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
13
test/CodeGen/R600/llvm.AMDGPU.rsq.ll
Normal file
13
test/CodeGen/R600/llvm.AMDGPU.rsq.ll
Normal file
@ -0,0 +1,13 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @rsq_f32
|
||||
; SI: V_RSQ_F32_e32
|
||||
; EG: RECIPSQRT_IEEE
|
||||
define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
|
||||
store float %rsq, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user