Optimized instruction sequence for sitofp operation on X86-32

Optimized sitofp i64 %x to double. The current sequence movl %ecx, 8(%esp) movl %edx, 12(%esp) fildll 8(%esp) is replaced with: movd %ecx, %xmm0 movd %edx, %xmm1 punpckldq %xmm1, %xmm0 movq %xmm0, 8(%esp) Differential Revision: http://reviews.llvm.org/D15946 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257285 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-03 11:08:32 +00:00 · 2016-01-10 09:41:22 +00:00 · 2016-01-10 09:41:22 +00:00 · d6de44078b
commit d6de44078b
parent ca4af1ae34
3 changed files with 86 additions and 15 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -265,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
-  }
+  } else if (!Subtarget->is64Bit())
+    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
@ -12672,13 +12673,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
    return Op;
  }

+  SDValue ValueToStore = Op.getOperand(0);
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
-      DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
+      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
      false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
@ -13051,7 +13060,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+  SDValue ValueToStore = Op.getOperand(0);
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
@ -19536,24 +19551,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

-  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+      SrcVT == MVT::i64) {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

-    SDValue InVec = Op->getOperand(0);
-    SDLoc dl(Op);
-    unsigned NumElts = SrcVT.getVectorNumElements();
-    MVT SVT = SrcVT.getVectorElementType();
-
-    // Widen the vector in input in the case of MVT::v2i32.
-    // Example: from MVT::v2i32 to MVT::v4i32.
+    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
-    for (unsigned i = 0, e = NumElts; i != e; ++i)
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
-                                 DAG.getIntPtrConstant(i, dl)));
+    SDLoc dl(Op);
+    unsigned NumElts;
+    MVT SVT;
+    if (SrcVT.isVector()) {
+      NumElts = SrcVT.getVectorNumElements();
+      SVT = SrcVT.getVectorElementType();

+      // Widen the vector in input in the case of MVT::v2i32.
+      // Example: from MVT::v2i32 to MVT::v4i32.
+      for (unsigned i = 0, e = NumElts; i != e; ++i)
+        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+                                   DAG.getIntPtrConstant(i, dl)));
+    } else {
+      assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() &&
+             "Unexpected source type in LowerBITCAST");
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(0, dl)));
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(1, dl)));
+      NumElts = 2;
+      SVT = MVT::i32;
+    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 13

 define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind  {
 entry:
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@ -74,9 +74,16 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
 }

 ; CHECK-LABEL: u64_to_f
+; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512_32: fildll
+
 ; AVX512_64: vcvtusi2ssq
+
+; SSE2_32: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32: movq %xmm0, {{[0-9]+}}(%esp)
 ; SSE2_32: fildll
+
 ; SSE2_64: cvtsi2ssq
 ; X87: fildll
 define float @u64_to_f(i64 %a) nounwind {
@ -95,6 +102,24 @@ define float @s64_to_f(i64 %a) nounwind {
  ret float %r
 }

+; CHECK-LABEL: s64_to_f_2
+; SSE2_32:    movd %ecx, %xmm0
+; SSE2_32:    movd %eax, %xmm1
+; SSE2_32:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2_32:    movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32:    fildll {{[0-9]+}}(%esp)
+
+; AVX512_32:    vmovd %eax, %xmm0
+; AVX512_32:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32:    vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32:    fildll {{[0-9]+}}(%esp)
+
+define float @s64_to_f_2(i64 %a) nounwind {
+  %a1 = add i64 %a, 5
+  %r = sitofp i64 %a1 to float
+  ret float %r
+}
+
 ; CHECK-LABEL: u64_to_d
 ; AVX512_32: vpunpckldq
 ; AVX512_64: vcvtusi2sdq
@ -117,6 +142,24 @@ define double @s64_to_d(i64 %a) nounwind {
  ret double %r
 }

+; CHECK-LABEL: s64_to_d_2
+; SSE2_32: movd %ecx, %xmm0
+; SSE2_32: movd %eax, %xmm1
+; SSE2_32: punpckldq %xmm0, %xmm1
+; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32: fildll
+
+; AVX512_32:    vmovd %eax, %xmm0
+; AVX512_32:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32:    vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: fildll
+
+define double @s64_to_d_2(i64 %a) nounwind {
+  %b = add i64 %a, 5
+  %f = sitofp i64 %b to double
+  ret double %f
+}
+
 ; CHECK-LABEL: u64_to_x
 ; CHECK: fildll
 define x86_fp80 @u64_to_x(i64 %a) nounwind {