From 6709634733848c42dea9f0648a982199e513cdd4 Mon Sep 17 00:00:00 2001
From: Simon Tatham
Date: Mon, 3 Feb 2020 09:35:36 +0000
Subject: [PATCH] [ARM,MVE] Fix vreinterpretq in big-endian mode.

Summary:
In big-endian MVE, the simple vector load/store instructions (i.e. both
contiguous and non-widening) don't all store the bytes of a register to
memory in the same order: it matters whether you did a VSTRB.8, VSTRH.16
or VSTRW.32. Put another way, the in-register formats of different vector
types relate to each other in a different way from the in-memory formats.

So, if you want to 'bitcast' or 'reinterpret' one vector type as another,
you have to carefully specify which you mean: did you want to reinterpret
the //register// format of one type as that of the other, or the
//memory// format?

The ACLE `vreinterpretq` intrinsics are specified to reinterpret the
register format. But I had implemented them as an LLVM IR bitcast, which
is specified for all types as a reinterpretation of the memory format. So
a `vreinterpretq` intrinsic, applied to values already in registers,
would code-generate incorrectly if compiled big-endian: instead of
emitting no code, it would emit a `vrev`.

To fix this, I've introduced a new IR intrinsic to perform a
register-format reinterpretation: `@llvm.arm.mve.vreinterpretq`. It's
implemented by a trivial isel pattern that expects the input in an MQPR
register, and just returns it unchanged.

In the clang codegen, I only emit this new intrinsic where it's actually
needed: I prefer a bitcast wherever it will have the right effect,
because LLVM understands bitcasts better. So we still generate bitcasts
in little-endian mode, and even in big-endian when you're casting between
two vector types with the same lane size.

For testing, I've moved all the codegen tests of vreinterpretq out into
their own file, so that they can have a different set of RUN lines to
check both big- and little-endian.
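
As an illustrative sketch (not itself part of this patch; the function
names are made up for the example), here are the two lowerings described
above for a cast from <4 x i32> to <8 x i16>:

  ; Little-endian, or equal lane sizes: a plain bitcast already has the
  ; right effect, because the memory and register formats agree.
  define <8 x i16> @reinterpret_le(<4 x i32> %v) {
  entry:
    %r = bitcast <4 x i32> %v to <8 x i16>
    ret <8 x i16> %r
  }

  ; Big-endian with differing lane sizes: the new intrinsic asks for a
  ; register-format reinterpretation, which lowers to
  ; ARMISD::VECTOR_REG_CAST and ultimately to no instructions at all.
  define <8 x i16> @reinterpret_be(<4 x i32> %v) {
  entry:
    %r = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %v)
    ret <8 x i16> %r
  }

  declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)
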
Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D73786
---
 include/llvm/IR/IntrinsicsARM.td   |  2 +
 lib/Target/ARM/ARMISelLowering.cpp |  4 ++
 lib/Target/ARM/ARMISelLowering.h   |  1 +
 lib/Target/ARM/ARMInstrMVE.td      | 20 +++++++++-
 test/CodeGen/Thumb2/mve-be.ll      | 61 ++++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index 353e0207e29..02a14939d9b 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -795,6 +795,8 @@ def int_arm_mve_pred_i2v : Intrinsic<
   [llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_mve_pred_v2i : Intrinsic<
   [llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_arm_mve_vreinterpretq : Intrinsic<
+  [llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 
 multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
                                list<IntrinsicProperty> props = [],
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index c6abeb017ad..d5cc1e759d8 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1605,6 +1605,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
 
   case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
+  case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
   case ARMISD::VCMP:          return "ARMISD::VCMP";
   case ARMISD::VCMPZ:         return "ARMISD::VCMPZ";
   case ARMISD::VTST:          return "ARMISD::VTST";
@@ -3777,6 +3778,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   case Intrinsic::arm_mve_pred_v2i:
     return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1));
+  case Intrinsic::arm_mve_vreinterpretq:
+    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
   }
 }
 
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 78fde2f77ec..57817549d74 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -131,6 +131,7 @@ class VectorType;
     LE,             // Low-overhead loops, Loop End
 
     PREDICATE_CAST, // Predicate cast for MVE i1 types
+    VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
 
     VCMP,           // Vector compare.
    VCMPZ,          // Vector compare to zero.
diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index 69f496a7f79..2b86c9091fe 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -3959,9 +3959,23 @@ let Predicates = [HasMVEInt] in {
 // example when moving between rGPR and VPR.P0 as part of predicate vector
 // shuffles. We also sometimes need to cast between different predicate
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
 
+// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas vector_reg_cast
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
+// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
+// no code at all if the vector is already in a register.
+def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -3973,6 +3987,10 @@ let Predicates = [HasMVEInt] in {
       def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
                 (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
   }
+
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 }
 
 // end of MVE compares
diff --git a/test/CodeGen/Thumb2/mve-be.ll b/test/CodeGen/Thumb2/mve-be.ll
index 7f355396a4c..3db11f1e429 100644
--- a/test/CodeGen/Thumb2/mve-be.ll
+++ b/test/CodeGen/Thumb2/mve-be.ll
@@ -295,3 +295,64 @@ entry:
   %3 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2
   ret <4 x i32> %3
 }
+
+; Test case demonstrating that 'bitcast' reinterprets the memory format of a
+; vector, as if stored and then loaded. So if it has to go between two
+; operations treating a register as having different lane sizes, then in
+; big-endian mode, it has to emit a vrev32.16, which is equivalent to the
+; effect that vstrw.32 followed by vldrh.16 would have.
+define arm_aapcs_vfpcc void @test_bitcast(<4 x i32>* readonly %in, <8 x i16>* %out) {
+; CHECK-LE-LABEL: test_bitcast:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
+; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
+; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: test_bitcast:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
+; CHECK-BE-NEXT:    vrev32.16 q0, q0
+; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
+; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %vin = load <4 x i32>, <4 x i32>* %in, align 8
+  %vdbl = mul <4 x i32> %vin, %vin
+  %cast = bitcast <4 x i32> %vdbl to <8 x i16>
+  %cdbl = mul <8 x i16> %cast, %cast
+  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
+  ret void
+}
+
+; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
+; which is defined to reinterpret the in-register format, so it generates no
+; instruction in either endianness.
+define arm_aapcs_vfpcc void @test_vreinterpretq(<4 x i32>* readonly %in, <8 x i16>* %out) {
+; CHECK-LE-LABEL: test_vreinterpretq:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
+; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
+; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: test_vreinterpretq:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
+; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
+; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
+; CHECK-BE-NEXT:    bx lr
+entry:
+  %vin = load <4 x i32>, <4 x i32>* %in, align 8
+  %vdbl = mul <4 x i32> %vin, %vin
+  %cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
+  %cdbl = mul <8 x i16> %cast, %cast
+  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
+  ret void
+}
+
+declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)
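
As a further illustrative sketch (again not part of the patch; the
function name is made up), a cast between two vector types with the same
lane size, such as <4 x i32> and <4 x float>, can remain a plain IR
bitcast even in big-endian mode, because the memory and register formats
of the two types then relate to each other in the same way:

  ; Same lane size, so the bitcast needs no vrev in either endianness.
  define arm_aapcs_vfpcc <4 x float> @reinterpret_same_lane_size(<4 x i32> %v) {
  entry:
    %r = bitcast <4 x i32> %v to <4 x float>
    ret <4 x float> %r
  }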