mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-24 12:19:53 +00:00
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate: `divb %sil; shrw $8, %ax; movzbl %al, %eax`. That was to avoid an H-reg access, which is problematic mainly because it isn't possible in REX-prefixed instructions. This patch optimizes that to: `divb %sil; movzbl %ah, %eax`. To do that, we explicitly extend AH, and extract the L-subreg from the resulting register. The extension is done using the NOREX variants of MOVZX. To support signed operations, MOVSX_NOREX is also added. Further, this introduces a new SDNode type, [us]divrem_ext_hreg (X86ISD::UDIVREM8_ZEXT_HREG and X86ISD::SDIVREM8_SEXT_HREG), which is then lowered to a sequence containing a single zext (rather than 2). Differential Revision: http://reviews.llvm.org/D6064 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221176 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f16d6b1ff1
commit
40453da779
@ -2412,11 +2412,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
|
||||
}
|
||||
|
||||
case ISD::SDIVREM:
|
||||
case ISD::UDIVREM: {
|
||||
case ISD::UDIVREM:
|
||||
case X86ISD::SDIVREM8_SEXT_HREG:
|
||||
case X86ISD::UDIVREM8_ZEXT_HREG: {
|
||||
SDValue N0 = Node->getOperand(0);
|
||||
SDValue N1 = Node->getOperand(1);
|
||||
|
||||
bool isSigned = Opcode == ISD::SDIVREM;
|
||||
bool isSigned = (Opcode == ISD::SDIVREM ||
|
||||
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
|
||||
if (!isSigned) {
|
||||
switch (NVT.SimpleTy) {
|
||||
default: llvm_unreachable("Unsupported VT!");
|
||||
@ -2532,33 +2535,43 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
|
||||
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
|
||||
}
|
||||
|
||||
// Prevent use of AH in a REX instruction by referencing AX instead.
|
||||
// Shift it down 8 bits.
|
||||
// Prevent use of AH in a REX instruction by explicitly copying it to
|
||||
// an ABCD_L register.
|
||||
//
|
||||
// The current assumption of the register allocator is that isel
|
||||
// won't generate explicit references to the GPR8_NOREX registers. If
|
||||
// won't generate explicit references to the GR8_ABCD_H registers. If
|
||||
// the allocator and/or the backend get enhanced to be more robust in
|
||||
// that regard, this can be, and should be, removed.
|
||||
if (HiReg == X86::AH && Subtarget->is64Bit() &&
|
||||
!SDValue(Node, 1).use_empty()) {
|
||||
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
||||
X86::AX, MVT::i16, InFlag);
|
||||
InFlag = Result.getValue(2);
|
||||
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
|
||||
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
|
||||
unsigned AHExtOpcode =
|
||||
isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
|
||||
|
||||
// If we also need AL (the quotient), get it by extracting a subreg from
|
||||
// Result. The fast register allocator does not like multiple CopyFromReg
|
||||
// nodes using aliasing registers.
|
||||
if (!SDValue(Node, 0).use_empty())
|
||||
ReplaceUses(SDValue(Node, 0),
|
||||
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
|
||||
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
|
||||
MVT::Glue, AHCopy, InFlag);
|
||||
SDValue Result(RNode, 0);
|
||||
InFlag = SDValue(RNode, 1);
|
||||
|
||||
// Shift AX right by 8 bits instead of using AH.
|
||||
Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
|
||||
Result,
|
||||
CurDAG->getTargetConstant(8, MVT::i8)),
|
||||
0);
|
||||
ReplaceUses(SDValue(Node, 1),
|
||||
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
|
||||
if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
|
||||
Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
|
||||
if (Node->getValueType(1) == MVT::i64) {
|
||||
// It's not possible to directly movsx AH to a 64bit register, because
|
||||
// the latter needs the REX prefix, but the former can't have it.
|
||||
assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
|
||||
"Unexpected i64 sext of h-register");
|
||||
Result =
|
||||
SDValue(CurDAG->getMachineNode(
|
||||
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
|
||||
CurDAG->getTargetConstant(0, MVT::i64), Result,
|
||||
CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
Result =
|
||||
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
|
||||
}
|
||||
ReplaceUses(SDValue(Node, 1), Result);
|
||||
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
|
||||
}
|
||||
// Copy the division (low) result, if it is needed.
|
||||
if (!SDValue(Node, 0).use_empty()) {
|
||||
|
@ -19080,6 +19080,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case X86ISD::SBB: return "X86ISD::SBB";
|
||||
case X86ISD::SMUL: return "X86ISD::SMUL";
|
||||
case X86ISD::UMUL: return "X86ISD::UMUL";
|
||||
case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
|
||||
case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
|
||||
case X86ISD::INC: return "X86ISD::INC";
|
||||
case X86ISD::DEC: return "X86ISD::DEC";
|
||||
case X86ISD::OR: return "X86ISD::OR";
|
||||
@ -24278,13 +24280,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
|
||||
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const X86Subtarget *Subtarget) {
|
||||
SDValue N0 = N->getOperand(0);
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
// (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
|
||||
// (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
|
||||
// This exposes the sext to the sdivrem lowering, so that it directly extends
|
||||
// from AH (which we otherwise need to do contortions to access).
|
||||
if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
|
||||
N0.getValueType() == MVT::i8 && VT == MVT::i32) {
|
||||
SDLoc dl(N);
|
||||
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
|
||||
SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
|
||||
N0.getOperand(0), N0.getOperand(1));
|
||||
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
|
||||
return R.getValue(1);
|
||||
}
|
||||
|
||||
if (!DCI.isBeforeLegalizeOps())
|
||||
return SDValue();
|
||||
|
||||
if (!Subtarget->hasFp256())
|
||||
return SDValue();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT.isVector() && VT.getSizeInBits() == 256) {
|
||||
SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
|
||||
if (R.getNode())
|
||||
@ -24377,6 +24395,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
|
||||
return R;
|
||||
}
|
||||
|
||||
// (i8,i32 zext (udivrem (i8 x, i8 y))) ->
|
||||
// (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
|
||||
// This exposes the zext to the udivrem lowering, so that it directly extends
|
||||
// from AH (which we otherwise need to do contortions to access).
|
||||
if (N0.getOpcode() == ISD::UDIVREM &&
|
||||
N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
|
||||
(VT == MVT::i32 || VT == MVT::i64)) {
|
||||
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
|
||||
SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
|
||||
N0.getOperand(0), N0.getOperand(1));
|
||||
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
|
||||
return R.getValue(1);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
@ -304,6 +304,10 @@ namespace llvm {
|
||||
// 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS
|
||||
SMUL8, UMUL8,
|
||||
|
||||
// 8-bit divrems that zero-extend (UDIVREM8) or sign-extend (SDIVREM8) the high result (AH).
|
||||
UDIVREM8_ZEXT_HREG,
|
||||
SDIVREM8_SEXT_HREG,
|
||||
|
||||
// MUL_IMM - X86 specific multiply by immediate.
|
||||
MUL_IMM,
|
||||
|
||||
|
@ -97,13 +97,23 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
|
||||
let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
|
||||
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
|
||||
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
|
||||
"movz{bl|x}\t{$src, $dst|$dst, $src}",
|
||||
"movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
|
||||
[], IIC_MOVZX>, TB, Sched<[WriteALU]>;
|
||||
let mayLoad = 1 in
|
||||
def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
|
||||
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
|
||||
"movz{bl|x}\t{$src, $dst|$dst, $src}",
|
||||
"movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
|
||||
[], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
|
||||
|
||||
def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
|
||||
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
|
||||
"movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
|
||||
[], IIC_MOVSX>, TB, Sched<[WriteALU]>;
|
||||
let mayLoad = 1 in
|
||||
def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
|
||||
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
|
||||
"movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
|
||||
[], IIC_MOVSX>, TB, Sched<[WriteALULd]>;
|
||||
}
|
||||
|
||||
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
|
||||
|
100
test/CodeGen/X86/divrem8_ext.ll
Normal file
100
test/CodeGen/X86/divrem8_ext.ll
Normal file
@ -0,0 +1,100 @@
|
||||
; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64
|
||||
; RUN: llc -march=x86 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.10.0"
|
||||
|
||||
define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_udivrem_zext_ah
|
||||
; CHECK: divb
|
||||
; CHECK: movzbl %ah, [[REG_REM:%[a-z0-9]+]]
|
||||
; CHECK: movb %al, ([[REG_ZPTR:%[a-z0-9]+]])
|
||||
; CHECK: movl [[REG_REM]], %eax
|
||||
; CHECK: ret
|
||||
%div = udiv i8 %x, %y
|
||||
store i8 %div, i8* @z
|
||||
%1 = urem i8 %x, %y
|
||||
ret i8 %1
|
||||
}
|
||||
|
||||
define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_urem_zext_ah
|
||||
; CHECK: divb
|
||||
; CHECK: movzbl %ah, %eax
|
||||
; CHECK: ret
|
||||
%1 = urem i8 %x, %y
|
||||
ret i8 %1
|
||||
}
|
||||
|
||||
define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_urem_noext_ah
|
||||
; CHECK: divb [[REG_X:%[a-z0-9]+]]
|
||||
; CHECK: movzbl %ah, %eax
|
||||
; CHECK: addb [[REG_X]], %al
|
||||
; CHECK: ret
|
||||
%1 = urem i8 %x, %y
|
||||
%2 = add i8 %1, %y
|
||||
ret i8 %2
|
||||
}
|
||||
|
||||
define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_urem_zext64_ah
|
||||
; CHECK: divb
|
||||
; CHECK: movzbl %ah, %eax
|
||||
; CHECK-32: xorl %edx, %edx
|
||||
; CHECK: ret
|
||||
%1 = urem i8 %x, %y
|
||||
%2 = zext i8 %1 to i64
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_sdivrem_sext_ah
|
||||
; CHECK: cbtw
|
||||
; CHECK: idivb
|
||||
; CHECK: movsbl %ah, [[REG_REM:%[a-z0-9]+]]
|
||||
; CHECK: movb %al, ([[REG_ZPTR]])
|
||||
; CHECK: movl [[REG_REM]], %eax
|
||||
; CHECK: ret
|
||||
%div = sdiv i8 %x, %y
|
||||
store i8 %div, i8* @z
|
||||
%1 = srem i8 %x, %y
|
||||
ret i8 %1
|
||||
}
|
||||
|
||||
define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_srem_sext_ah
|
||||
; CHECK: cbtw
|
||||
; CHECK: idivb
|
||||
; CHECK: movsbl %ah, %eax
|
||||
; CHECK: ret
|
||||
%1 = srem i8 %x, %y
|
||||
ret i8 %1
|
||||
}
|
||||
|
||||
define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_srem_noext_ah
|
||||
; CHECK: cbtw
|
||||
; CHECK: idivb [[REG_X:%[a-z0-9]+]]
|
||||
; CHECK: movsbl %ah, %eax
|
||||
; CHECK: addb [[REG_X]], %al
|
||||
; CHECK: ret
|
||||
%1 = srem i8 %x, %y
|
||||
%2 = add i8 %1, %y
|
||||
ret i8 %2
|
||||
}
|
||||
|
||||
define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
|
||||
; CHECK-LABEL: test_srem_sext64_ah
|
||||
; CHECK: cbtw
|
||||
; CHECK: idivb
|
||||
; CHECK: movsbl %ah, %eax
|
||||
; CHECK-32: movl %eax, %edx
|
||||
; CHECK-32: sarl $31, %edx
|
||||
; CHECK-64: movsbq %al, %rax
|
||||
; CHECK: ret
|
||||
%1 = srem i8 %x, %y
|
||||
%2 = sext i8 %1 to i64
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
@z = external global i8
|
Loading…
Reference in New Issue
Block a user