From 338268c67fbb7252702bf400495771068750466b Mon Sep 17 00:00:00 2001
From: David Goodwin <david_goodwin@apple.com>
Date: Mon, 10 Aug 2009 22:17:39 +0000
Subject: [PATCH] Use NEON for single-precision int<->FP conversions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78604 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMInstrFormats.td | 12 +++++--
 lib/Target/ARM/ARMInstrNEON.td    | 60 ++++++++++++++++++++++++-------
 lib/Target/ARM/ARMInstrVFP.td     | 12 +++----
 test/CodeGen/ARM/fsitos.ll        | 12 +++++++
 test/CodeGen/ARM/ftosizs.ll       | 12 +++++++
 test/CodeGen/ARM/ftouizs.ll       | 12 +++++++
 test/CodeGen/ARM/fuitos.ll        | 12 +++++++
 7 files changed, 111 insertions(+), 21 deletions(-)
 create mode 100644 test/CodeGen/ARM/fsitos.ll
 create mode 100644 test/CodeGen/ARM/ftosizs.ll
 create mode 100644 test/CodeGen/ARM/ftouizs.ll
 create mode 100644 test/CodeGen/ARM/fuitos.ll
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 5bc7212881c..deff83b1c99 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1118,7 +1118,7 @@ class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
   let Inst{7-4}   = opcod3;
 }
 
-// Single precision, unary if no NEON
+// Single precision unary, if no NEON
 // Same as ASuI except not available if NEON is enabled
 class ASuIn<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
             InstrItinClass itin,    string opc, string asm, list<dag> pattern>
@@ -1135,7 +1135,7 @@ class ASbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{11-8}  = 0b1010;
 }
 
-// Single precision, binary if no NEON
+// Single precision binary, if no NEON
 // Same as ASbI except not available if NEON is enabled
 class ASbIn<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
@@ -1154,6 +1154,14 @@ class AVConv1I<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
   let Inst{6}     = 1;
 }
 
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
 class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
                InstrItinClass itin,
                string opc, string asm, list<dag> pattern>
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 0d48aa65075..77bea683520 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -324,6 +324,20 @@ class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$src), NoItinerary, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
 
+// Basic 2-register operations, scalar single-precision.
+class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+            bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+            ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src),
+        NoItinerary, !strconcat(OpcodeStr, "\t$dst, $src"), "", []>;
+
+class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst>
+  : NEONFPPat<(ResTy (OpNode SPR:$a)),
+       (EXTRACT_SUBREG
+           (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+        arm_ssubreg_0)>;
+
 // Basic 2-register intrinsics, both double- and quad-register.
 class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
               bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
@@ -338,7 +352,7 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$src), NoItinerary, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
 
-// Basic 2-register operations, scalar single-precision
+// Basic 2-register intrinsics, scalar single-precision
 class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
               bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
@@ -1981,6 +1995,11 @@ let neverHasSideEffects = 1 in
 def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd,1>;
 def : N3VDsPat<fadd, VADDfd_sfp>;
 
+// Vector Sub Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>;
+def : N3VDsPat<fsub, VSUBfd_sfp>;
+
 // Vector Multiply Operations used for single-precision FP
 let neverHasSideEffects = 1 in
 def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>;
@@ -1989,31 +2008,46 @@ def : N3VDsPat<fmul, VMULfd_sfp>;
 // Vector Multiply-Accumulate/Subtract used for single-precision FP
 let neverHasSideEffects = 1 in
 def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32,fmul,fadd>;
-def : N3VDMulOpsPat<fmul, fadd, VMLAfd>;
+def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>;
 
 let neverHasSideEffects = 1 in
 def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32,fmul,fsub>;
-def : N3VDMulOpsPat<fmul, fsub, VMLSfd>;
+def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>;
 
-// Vector Sub Operations used for single-precision FP
-let neverHasSideEffects = 1 in
-def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>;
-def : N3VDsPat<fsub, VSUBfd_sfp>;
-
-// Vector Absolute for single-precision FP
+// Vector Absolute used for single-precision FP
 let neverHasSideEffects = 1 in
 def  VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
                            v2f32, v2f32, int_arm_neon_vabsf>;
 def : N2VDIntsPat<fabs, VABSfd_sfp>;
 
-// Vector Negate for single-precision FP
-
+// Vector Negate used for single-precision FP
 let neverHasSideEffects = 1 in
 def  VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
-                    (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), NoItinerary,
-                    "vneg.f32\t$dst, $src", "", []>;
+                        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), NoItinerary,
+                        "vneg.f32\t$dst, $src", "", []>;
 def : N2VDIntsPat<fneg, VNEGf32d_sfp>;
 
+// Vector Convert between single-precision FP and integer
+let neverHasSideEffects = 1 in
+def  VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+                          v2i32, v2f32, fp_to_sint>;
+def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+                          v2i32, v2f32, fp_to_uint>;
+def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTs2fd_sfp : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+                         v2f32, v2i32, sint_to_fp>;
+def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTu2fd_sfp : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+                         v2f32, v2i32, uint_to_fp>;
+def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 9eb11475830..f7c16bb9c22 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -263,7 +263,7 @@ def FSITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
   let Inst{7} = 1;
 }
 
-def FSITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
+def FSITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
                  IIC_fpALU, "fsitos", " $dst, $a",
                  [(set SPR:$dst, (arm_sitof SPR:$a))]> {
   let Inst{7} = 1;
@@ -273,7 +273,7 @@ def FUITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
                  IIC_fpALU, "fuitod", " $dst, $a",
                  [(set DPR:$dst, (arm_uitof SPR:$a))]>;
 
-def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
+def FUITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
                  IIC_fpALU, "fuitos", " $dst, $a",
                  [(set SPR:$dst, (arm_uitof SPR:$a))]>;
 
@@ -287,8 +287,8 @@ def FTOSIZD : AVConv1I<0b11101011, 0b1101, 0b1011,
   let Inst{7} = 1; // Z bit
 }
 
-def FTOSIZS : AVConv1I<0b11101011, 0b1101, 0b1010,
-                       (outs SPR:$dst), (ins SPR:$a),
+def FTOSIZS : AVConv1In<0b11101011, 0b1101, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
                  IIC_fpALU, "ftosizs", " $dst, $a",
                  [(set SPR:$dst, (arm_ftosi SPR:$a))]> {
   let Inst{7} = 1; // Z bit
@@ -301,8 +301,8 @@ def FTOUIZD : AVConv1I<0b11101011, 0b1100, 0b1011,
   let Inst{7} = 1; // Z bit
 }
 
-def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
-                       (outs SPR:$dst), (ins SPR:$a),
+def FTOUIZS : AVConv1In<0b11101011, 0b1100, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
                  IIC_fpALU, "ftouizs", " $dst, $a",
                  [(set SPR:$dst, (arm_ftoui SPR:$a))]> {
   let Inst{7} = 1; // Z bit
diff --git a/test/CodeGen/ARM/fsitos.ll b/test/CodeGen/ARM/fsitos.ll
new file mode 100644
index 00000000000..7420504df6f
--- /dev/null
+++ b/test/CodeGen/ARM/fsitos.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | grep -E {fsitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,+neonfp | grep -E {vcvt.f32.s32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,-neonfp | grep -E {fsitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a8 | grep -E {vcvt.f32.s32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a9 | grep -E {fsitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(i32 %a, i32 %b) {
+entry:
+        %0 = add i32 %a, %b
+        %1 = sitofp i32 %0 to float
+	ret float %1
+}
diff --git a/test/CodeGen/ARM/ftosizs.ll b/test/CodeGen/ARM/ftosizs.ll
new file mode 100644
index 00000000000..5ab77909d58
--- /dev/null
+++ b/test/CodeGen/ARM/ftosizs.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | grep -E {ftosizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,+neonfp | grep -E {vcvt.s32.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,-neonfp | grep -E {ftosizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a8 | grep -E {vcvt.s32.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a9 | grep -E {ftosizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define i32 @test(float %a, float %b) {
+entry:
+        %0 = fadd float %a, %b
+        %1 = fptosi float %0 to i32
+	ret i32 %1
+}
diff --git a/test/CodeGen/ARM/ftouizs.ll b/test/CodeGen/ARM/ftouizs.ll
new file mode 100644
index 00000000000..1cc5115352e
--- /dev/null
+++ b/test/CodeGen/ARM/ftouizs.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | grep -E {ftouizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,+neonfp | grep -E {vcvt.u32.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,-neonfp | grep -E {ftouizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a8 | grep -E {vcvt.u32.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a9 | grep -E {ftouizs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define i32 @test(float %a, float %b) {
+entry:
+        %0 = fadd float %a, %b
+        %1 = fptoui float %0 to i32
+	ret i32 %1
+}
diff --git a/test/CodeGen/ARM/fuitos.ll b/test/CodeGen/ARM/fuitos.ll
new file mode 100644
index 00000000000..0b70dc73641
--- /dev/null
+++ b/test/CodeGen/ARM/fuitos.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | grep -E {fuitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,+neonfp | grep -E {vcvt.f32.u32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon,-neonfp | grep -E {fuitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a8 | grep -E {vcvt.f32.u32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llvm-as < %s | llc -march=arm -mcpu=cortex-a9 | grep -E {fuitos\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(i32 %a, i32 %b) {
+entry:
+        %0 = add i32 %a, %b
+        %1 = uitofp i32 %0 to float
+	ret float %1
+}