llvm-mirror/test/Bitcode/neon-intrinsics.ll
Evan Cheng ed09135349 Add intrinsics @llvm.arm.neon.vmulls and @llvm.arm.neon.vmullu.* back. Frontends
was lowering them to sext / uxt + mul instructions. Unfortunately the
optimization passes may hoist the extensions out of the loop and separate them.
When that happens, the long multiplication instructions can be broken into
several scalar instructions, causing significant performance issue.

Note the vmla and vmls intrinsics are not added back. Frontend will codegen them
as intrinsics vmull* + add / sub. Also note the isel optimizations for catching
mul + sext / zext are not changed either.

First part of rdar://8832507, rdar://9203134

llvm-svn: 128502
2011-03-29 23:06:19 +00:00

207 lines
4.2 KiB
LLVM

; RUN: llvm-dis < %s.bc | FileCheck %s
; vmovls should be auto-upgraded to sext
; CHECK: vmovls8
; CHECK-NOT: arm.neon.vmovls.v8i16
; CHECK: sext <8 x i8>
; CHECK: vmovls16
; CHECK-NOT: arm.neon.vmovls.v4i32
; CHECK: sext <4 x i16>
; CHECK: vmovls32
; CHECK-NOT: arm.neon.vmovls.v2i64
; CHECK: sext <2 x i32>
; vmovlu should be auto-upgraded to zext
; CHECK: vmovlu8
; CHECK-NOT: arm.neon.vmovlu.v8i16
; CHECK: zext <8 x i8>
; CHECK: vmovlu16
; CHECK-NOT: arm.neon.vmovlu.v4i32
; CHECK: zext <4 x i16>
; CHECK: vmovlu32
; CHECK-NOT: arm.neon.vmovlu.v2i64
; CHECK: zext <2 x i32>
; vaddl/vaddw should be auto-upgraded to add with sext/zext
; CHECK: vaddls16
; CHECK-NOT: arm.neon.vaddls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: add <4 x i32>
; CHECK: vaddlu32
; CHECK-NOT: arm.neon.vaddlu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: add <2 x i64>
; CHECK: vaddws8
; CHECK-NOT: arm.neon.vaddws.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: add <8 x i16>
; CHECK: vaddwu16
; CHECK-NOT: arm.neon.vaddwu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: add <4 x i32>
; vsubl/vsubw should be auto-upgraded to subtract with sext/zext
; CHECK: vsubls16
; CHECK-NOT: arm.neon.vsubls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: sub <4 x i32>
; CHECK: vsublu32
; CHECK-NOT: arm.neon.vsublu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: sub <2 x i64>
; CHECK: vsubws8
; CHECK-NOT: arm.neon.vsubws.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: sub <8 x i16>
; CHECK: vsubwu16
; CHECK-NOT: arm.neon.vsubwu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: sub <4 x i32>
; vmull* intrinsics will remain intrinsics
; CHECK: vmulls8
; CHECK: arm.neon.vmulls.v8i16
; CHECK: vmullu16
; CHECK: arm.neon.vmullu.v4i32
; CHECK: vmullp8
; CHECK: arm.neon.vmullp.v8i16
; vmlal should be auto-upgraded to multiply/add with sext/zext
; CHECK: vmlals32
; CHECK-NOT: arm.neon.vmlals.v2i64
; CHECK: sext <2 x i32>
; CHECK-NEXT: sext <2 x i32>
; CHECK-NEXT: mul <2 x i64>
; CHECK-NEXT: add <2 x i64>
; CHECK: vmlalu8
; CHECK-NOT: arm.neon.vmlalu.v8i16
; CHECK: zext <8 x i8>
; CHECK-NEXT: zext <8 x i8>
; CHECK-NEXT: mul <8 x i16>
; CHECK-NEXT: add <8 x i16>
; vmlsl should be auto-upgraded to multiply/sub with sext/zext
; CHECK: vmlsls16
; CHECK-NOT: arm.neon.vmlsls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: mul <4 x i32>
; CHECK-NEXT: sub <4 x i32>
; CHECK: vmlslu32
; CHECK-NOT: arm.neon.vmlslu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: mul <2 x i64>
; CHECK-NEXT: sub <2 x i64>
; vaba should be auto-upgraded to vabd + add
; CHECK: vabas32
; CHECK-NOT: arm.neon.vabas.v2i32
; CHECK: arm.neon.vabds.v2i32
; CHECK-NEXT: add <2 x i32>
; CHECK: vabaQu8
; CHECK-NOT: arm.neon.vabau.v16i8
; CHECK: arm.neon.vabdu.v16i8
; CHECK-NEXT: add <16 x i8>
; vabal should be auto-upgraded to vabd with zext + add
; CHECK: vabals16
; CHECK-NOT: arm.neon.vabals.v4i32
; CHECK: arm.neon.vabds.v4i16
; CHECK-NEXT: zext <4 x i16>
; CHECK-NEXT: add <4 x i32>
; CHECK: vabalu32
; CHECK-NOT: arm.neon.vabalu.v2i64
; CHECK: arm.neon.vabdu.v2i32
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: add <2 x i64>
; vabdl should be auto-upgraded to vabd with zext
; CHECK: vabdls8
; CHECK-NOT: arm.neon.vabdls.v8i16
; CHECK: arm.neon.vabds.v8i8
; CHECK-NEXT: zext <8 x i8>
; CHECK: vabdlu16
; CHECK-NOT: arm.neon.vabdlu.v4i32
; CHECK: arm.neon.vabdu.v4i16
; CHECK-NEXT: zext <4 x i16>
; vmovn should be auto-upgraded to trunc
; CHECK: vmovni16
; CHECK-NOT: arm.neon.vmovn.v8i8
; CHECK: trunc <8 x i16>
; CHECK: vmovni32
; CHECK-NOT: arm.neon.vmovn.v4i16
; CHECK: trunc <4 x i32>
; CHECK: vmovni64
; CHECK-NOT: arm.neon.vmovn.v2i32
; CHECK: trunc <2 x i64>
; vld* and vst* intrinsic calls need an alignment argument (defaulted to 1)
; CHECK: vld1i8
; CHECK: i32 1
; CHECK: vld2Qi16
; CHECK: i32 1
; CHECK: vld3i32
; CHECK: i32 1
; CHECK: vld4Qf
; CHECK: i32 1
; CHECK: vst1i8
; CHECK: i32 1
; CHECK: vst2Qi16
; CHECK: i32 1
; CHECK: vst3i32
; CHECK: i32 1
; CHECK: vst4Qf
; CHECK: i32 1
; CHECK: vld2laneQi16
; CHECK: i32 1
; CHECK: vld3lanei32
; CHECK: i32 1
; CHECK: vld4laneQf
; CHECK: i32 1
; CHECK: vst2laneQi16
; CHECK: i32 1
; CHECK: vst3lanei32
; CHECK: i32 1
; CHECK: vst4laneQf
; CHECK: i32 1