[ARM,MVE] Fix vreinterpretq in big-endian mode.

Summary:
In big-endian MVE, the simple vector load/store instructions (i.e.
both contiguous and non-widening) don't all store the bytes of a
register to memory in the same order: it matters whether you did a
VSTRB.8, VSTRH.16 or VSTRW.32. Put another way, the in-register
formats of different vector types relate to each other in a different
way from the in-memory formats.

So, if you want to 'bitcast' or 'reinterpret' one vector type as
another, you have to carefully specify which you mean: did you want to
reinterpret the //register// format of one type as that of the other,
or the //memory// format?
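
To make the distinction concrete, here is a sketch of my own (not part of
this patch; it assumes an MVE-enabled target and the standard <arm_mve.h>
header): the memory-format reinterpretation is what you get by storing a
vector with one lane size and reloading it with another.

    #include <arm_mve.h>
    #include <stdint.h>

    /* Reinterpret a v4i32 as a v8i16 via the *memory* format: a word-sized
     * store followed by a halfword-sized reload. In little-endian mode this
     * round trip leaves the register bits unchanged; in big-endian mode it
     * permutes them, which is why folding it away requires a vrev32.16. */
    static inline int16x8_t reinterpret_via_memory(int32x4_t v)
    {
        union { int32_t w[4]; int16_t h[8]; } tmp;
        vst1q_s32(tmp.w, v);     /* lane size 32: vstrw.32 */
        return vld1q_s16(tmp.h); /* lane size 16: vldrh.16 */
    }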

The ACLE `vreinterpretq` intrinsics are specified to reinterpret the
register format. But I had implemented them as LLVM IR bitcasts, which
are specified for all types as a reinterpretation of the memory format.
So a `vreinterpretq` intrinsic, applied to values already in registers,
would code-generate incorrectly if compiled big-endian: instead of
emitting no code, it would emit a `vrev`.
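
For example (my own illustration, not taken from the test suite), this
function should compile down to nothing but a return, in either endianness,
because `vreinterpretq_*` acts on the register contents; the old
bitcast-based lowering instead produced a spurious `vrev32.16` when compiled
big-endian.

    #include <arm_mve.h>

    /* The bits of q0 pass through unchanged: no data-processing instruction
     * should be emitted for this reinterpretation, little- or big-endian. */
    int16x8_t reinterpret_in_register(int32x4_t v)
    {
        return vreinterpretq_s16_s32(v);
    }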

To fix this, I've introduced a new IR intrinsic to perform a
register-format reinterpretation: `@llvm.arm.mve.vreinterpretq`. It's
implemented by a trivial isel pattern that expects the input in an
MQPR register, and just returns it unchanged.

In the clang codegen, I only emit this new intrinsic where it's
actually needed: I prefer a bitcast wherever it will have the right
effect, because LLVM understands bitcasts better. So we still generate
bitcasts in little-endian mode, and even in big-endian when you're
casting between two vector types with the same lane size.
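
As a rough sketch of the resulting behaviour (function names here are mine),
a reinterpretation between types with the same lane size stays a bitcast even
in big-endian mode, while a lane-size-changing one uses the new intrinsic
there:

    #include <arm_mve.h>

    /* Same lane size (s32 -> u32): register and memory formats agree in both
     * endiannesses, so clang keeps emitting a plain LLVM IR bitcast. */
    uint32x4_t cast_same_lane_size(int32x4_t v)
    {
        return vreinterpretq_u32_s32(v);
    }

    /* Different lane size (s32 -> s16): when compiling big-endian, clang now
     * emits a call to @llvm.arm.mve.vreinterpretq rather than a bitcast, so
     * that no vrev instruction is generated. */
    int16x8_t cast_different_lane_size(int32x4_t v)
    {
        return vreinterpretq_s16_s32(v);
    }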

For testing, I've moved all the codegen tests of vreinterpretq out
into their own file, so that they can have a different set of RUN
lines to check both big- and little-endian.

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D73786
Simon Tatham 2020-02-03 09:35:36 +00:00
parent e3f9be3c6f
commit 6709634733
5 changed files with 87 additions and 1 deletion


@@ -795,6 +795,8 @@ def int_arm_mve_pred_i2v : Intrinsic<
[llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_pred_v2i : Intrinsic<
[llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vreinterpretq : Intrinsic<
[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
list<IntrinsicProperty> props = [],


@@ -1605,6 +1605,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
case ARMISD::VCMP: return "ARMISD::VCMP";
case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
case ARMISD::VTST: return "ARMISD::VTST";
@@ -3777,6 +3778,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_mve_pred_v2i:
return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
Op.getOperand(1));
case Intrinsic::arm_mve_vreinterpretq:
return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
Op.getOperand(1));
}
}


@@ -131,6 +131,7 @@ class VectorType;
LE, // Low-overhead loops, Loop End
PREDICATE_CAST, // Predicate cast for MVE i1 types
VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
VCMP, // Vector compare.
VCMPZ, // Vector compare to zero.


@@ -3959,9 +3959,23 @@ let Predicates = [HasMVEInt] in {
// example when moving between rGPR and VPR.P0 as part of predicate vector
// shuffles. We also sometimes need to cast between different predicate
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
// vector register as a different vector type, without changing the contents of
// the register. It differs from 'bitconvert' in that bitconvert reinterprets
// the _memory_ storage format of the vector, whereas vector_reg_cast
// reinterprets the _register_ format - and in big-endian, the memory and
// register formats are different, so they are different operations.
//
// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
// whereas 'bitconvert' will map it to the high byte in big-endian mode,
// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
// no code at all if the vector is already in a register.
def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
let Predicates = [HasMVEInt] in {
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -3973,6 +3987,10 @@ let Predicates = [HasMVEInt] in {
def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}
foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
}
// end of MVE compares


@@ -295,3 +295,64 @@ entry:
%3 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2
ret <4 x i32> %3
}
; Test case demonstrating that 'bitcast' reinterprets the memory format of a
; vector, as if stored and then loaded. So if it has to go between two
; operations treating a register as having different lane sizes, then in
; big-endian mode, it has to emit a vrev32.16, which is equivalent to the
; effect that vstrw.32 followed by vldrh.16 would have.
define arm_aapcs_vfpcc void @test_bitcast(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_bitcast:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: test_bitcast:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
; CHECK-BE-NEXT: vrev32.16 q0, q0
; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
%vin = load <4 x i32>, <4 x i32>* %in, align 8
%vdbl = mul <4 x i32> %vin, %vin
%cast = bitcast <4 x i32> %vdbl to <8 x i16>
%cdbl = mul <8 x i16> %cast, %cast
store <8 x i16> %cdbl, <8 x i16>* %out, align 8
ret void
}
; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
; which is defined to reinterpret the in-register format, so it generates no
; instruction in either endianness.
define arm_aapcs_vfpcc void @test_vreinterpretq(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_vreinterpretq:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: test_vreinterpretq:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
%vin = load <4 x i32>, <4 x i32>* %in, align 8
%vdbl = mul <4 x i32> %vin, %vin
%cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
%cdbl = mul <8 x i16> %cast, %cast
store <8 x i16> %cdbl, <8 x i16>* %out, align 8
ret void
}
declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)