[ARM,MVE] Intrinsics for variable shift instructions.

This batch of intrinsics fills in all the shift instructions that take
a variable shift distance in a register, instead of an immediate. Some
of these instructions take a single shift distance in a scalar
register and apply it to all lanes; others take a vector of per-lane
distances.

These instructions are all basically one family, varying in whether
they saturate out-of-range values, and whether they round when bits
are shifted off the bottom. I've implemented them at the IR level by a
much smaller family of IR intrinsics, which take flag parameters to
indicate saturating and/or rounding (along with the usual one to
specify signed/unsigned integers).

An oddity is that all of them are //left// shift instructions – but if
you pass a negative shift count, they'll shift right. So the vector
shift distances are always vectors of //signed// integers, regardless
of whether you're considering the other input vector to be signed
or unsigned. Also, even the simplest `vshlq` instruction in this
family (neither saturating nor rounding) has to be implemented as an
IR intrinsic, because the ordinary LLVM IR `shl` operation does not
give a well-defined result for an out-of-range shift count (it yields
a poison value).

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D72329
This commit is contained in:
Simon Tatham 2020-01-08 13:37:12 +00:00
parent 5371ba4ab1
commit 0f41bbd87f
3 changed files with 1394 additions and 12 deletions
include/llvm/IR
lib/Target/ARM
test/CodeGen/Thumb2/mve-intrinsics

@ -955,6 +955,13 @@ defm int_arm_mve_vshrn: MVEPredicated<
llvm_i32_ty /*unsigned-out*/, llvm_i32_ty /*unsigned-in*/,
llvm_i32_ty /*top-half*/]>;
// Variable-distance shift intrinsics. Both take the vector to be shifted,
// a shift distance, and three i32 flag operands in the order
// saturate, round, unsigned.
//
// vshl_scalar: the shift distance is a single i32 operand.
// NOTE(review): declared with MVEPredicated rather than MVEPredicatedM;
// the selection pattern for the _predicated form passes a mask but no
// separate inactive-lanes vector. Confirm against the definitions of
// those two multiclasses, which are not visible in this hunk.
defm int_arm_mve_vshl_scalar: MVEPredicated<
[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty /*shiftcount*/,
llvm_i32_ty /*saturate*/, llvm_i32_ty /*round*/, llvm_i32_ty /*unsigned*/]>;
// vshl_vector: the shift distances are supplied as a second vector operand.
// That operand is declared as its own llvm_anyvector_ty instead of
// LLVMMatchType<0>, so its type is not tied to the value vector's type here.
defm int_arm_mve_vshl_vector: MVEPredicatedM<
[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty /*shiftcounts*/,
llvm_i32_ty /*saturate*/, llvm_i32_ty /*round*/, llvm_i32_ty /*unsigned*/]>;
// MVE scalar shifts.
class ARM_MVE_qrshift_single<list<LLVMType> value,
list<LLVMType> saturate = []> :

@ -2727,13 +2727,32 @@ class MVE_shift_by_vec<string iname, string suffix, bit U,
let validForTailPredication = 1;
}
// Defines one shift-by-vector instruction for the vector type described by
// VTI, together with the two selection patterns that map the
// int_arm_mve_vshl_vector intrinsic (plain and predicated) onto it.
//
//   iname - mnemonic string forwarded to MVE_shift_by_vec.
//   VTI   - vector type info; supplies Suffix, Unsigned, Size, Vec and Pred.
//   q, r  - forwarded to MVE_shift_by_vec and also matched, as i32
//           constants, against the intrinsic's saturate and round operands.
multiclass MVE_shift_by_vec_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
// The instruction record itself; the empty name means it is named by the
// enclosing defm prefix, which is what NAME refers to below.
def "" : MVE_shift_by_vec<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
// Unpredicated form: the flag operands must equal (q, r, VTI.Unsigned) for
// this instruction to be selected.
def : Pat<(VTI.Vec (int_arm_mve_vshl_vector
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh)))>;
// Predicated form: the intrinsic additionally carries a predicate mask and
// an $inactive vector, both passed through to the instruction after
// ARMVCCThen.
def : Pat<(VTI.Vec (int_arm_mve_vshl_vector_predicated
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
// Instantiates the shift-by-vector instruction named iname for the six
// element types s8, s16, s32, u8, u16 and u32. bit_4 and bit_8 are passed
// straight through to each per-type definition.
//
// NOTE(review): this view lists each suffix twice, once as a bare
// `def ... : MVE_shift_by_vec<...>` and once as
// `defm ... : MVE_shift_by_vec_p<...>`. Presumably the diff markers were
// stripped and the `def` lines are the removed side of the hunk, replaced
// by the `defm` lines that also emit selection patterns. Confirm against
// the raw diff before treating this as the final file content.
multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
def s8 : MVE_shift_by_vec<iname, "s8", 0b0, 0b00, bit_4, bit_8>;
def s16 : MVE_shift_by_vec<iname, "s16", 0b0, 0b01, bit_4, bit_8>;
def s32 : MVE_shift_by_vec<iname, "s32", 0b0, 0b10, bit_4, bit_8>;
def u8 : MVE_shift_by_vec<iname, "u8", 0b1, 0b00, bit_4, bit_8>;
def u16 : MVE_shift_by_vec<iname, "u16", 0b1, 0b01, bit_4, bit_8>;
def u32 : MVE_shift_by_vec<iname, "u32", 0b1, 0b10, bit_4, bit_8>;
defm s8 : MVE_shift_by_vec_p<iname, MVE_v16s8, bit_4, bit_8>;
defm s16 : MVE_shift_by_vec_p<iname, MVE_v8s16, bit_4, bit_8>;
defm s32 : MVE_shift_by_vec_p<iname, MVE_v4s32, bit_4, bit_8>;
defm u8 : MVE_shift_by_vec_p<iname, MVE_v16u8, bit_4, bit_8>;
defm u16 : MVE_shift_by_vec_p<iname, MVE_v8u16, bit_4, bit_8>;
defm u32 : MVE_shift_by_vec_p<iname, MVE_v4u32, bit_4, bit_8>;
}
// Plain vshl: both flag bits are 0, so the patterns match the intrinsic
// with saturate = 0 and round = 0.
defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>;
@ -4542,13 +4561,31 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
let validForTailPredication = 1;
}
// Defines one shift-by-scalar-register instruction for the vector type
// described by VTI, together with the two selection patterns that map the
// int_arm_mve_vshl_scalar intrinsic (plain and predicated) onto it.
//
//   iname - mnemonic string forwarded to MVE_VxSHL_qr.
//   VTI   - vector type info; supplies Suffix, Unsigned, Size, Vec and Pred.
//   q, r  - forwarded to MVE_VxSHL_qr and also matched, as i32 constants,
//           against the intrinsic's saturate and round operands.
multiclass MVE_VxSHL_qr_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
// The instruction record itself; the empty name means it is named by the
// enclosing defm prefix, which is what NAME refers to below.
def "" : MVE_VxSHL_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
// Unpredicated form: vector input plus an i32 shift count in an rGPR.
def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (i32 rGPR:$sh)))>;
// Predicated form: only a predicate mask is added. Unlike the
// shift-by-vector patterns there is no separate $inactive operand here.
def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar_predicated
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
}
// Instantiates the shift-by-scalar-register instruction named iname for the
// six element types s8, s16, s32, u8, u16 and u32. bit_7 and bit_17 are
// passed straight through to each per-type definition.
//
// NOTE(review): this view lists each suffix twice, once as a bare
// `def ... : MVE_VxSHL_qr<...>` and once as
// `defm ... : MVE_VxSHL_qr_p<...>`. Presumably the diff markers were
// stripped and the `def` lines are the removed side of the hunk, replaced
// by the `defm` lines that also emit selection patterns. Confirm against
// the raw diff before treating this as the final file content.
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
def s8 : MVE_VxSHL_qr<iname, "s8", 0b0, 0b00, bit_7, bit_17>;
def s16 : MVE_VxSHL_qr<iname, "s16", 0b0, 0b01, bit_7, bit_17>;
def s32 : MVE_VxSHL_qr<iname, "s32", 0b0, 0b10, bit_7, bit_17>;
def u8 : MVE_VxSHL_qr<iname, "u8", 0b1, 0b00, bit_7, bit_17>;
def u16 : MVE_VxSHL_qr<iname, "u16", 0b1, 0b01, bit_7, bit_17>;
def u32 : MVE_VxSHL_qr<iname, "u32", 0b1, 0b10, bit_7, bit_17>;
defm s8 : MVE_VxSHL_qr_p<iname, MVE_v16s8, bit_7, bit_17>;
defm s16 : MVE_VxSHL_qr_p<iname, MVE_v8s16, bit_7, bit_17>;
defm s32 : MVE_VxSHL_qr_p<iname, MVE_v4s32, bit_7, bit_17>;
defm u8 : MVE_VxSHL_qr_p<iname, MVE_v16u8, bit_7, bit_17>;
defm u16 : MVE_VxSHL_qr_p<iname, MVE_v8u16, bit_7, bit_17>;
defm u32 : MVE_VxSHL_qr_p<iname, MVE_v4u32, bit_7, bit_17>;
}
// Plain vshl: both flag bits are 0, so the patterns match the intrinsic
// with saturate = 0 and round = 0.
defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>;

File diff suppressed because it is too large Load Diff