llvm/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
Evan Cheng 376642ed62 Some enhancements for memcpy / memset inline expansion.
1. Teach it to use overlapping unaligned load / store to copy / set the trailing
   bytes. e.g. On 86, use two pairs of movups / movaps for 17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is. e.g.
   x86 and ARM.
3. When memcpy from a constant string, do *not* replace the load with a constant
   if it's not possible to materialize an integer immediate with a single
   instruction (required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if target hooks indicates they
   are "fast".
5. Update ARM target hooks to use unaligned load / stores. e.g. vld1.8 / vst1.8.
   Also increase the threshold to something reasonable (8 for memset, 4 pairs
   for memcpy).

This significantly improves Dhrystone, up to 50% on ARM iOS devices.

rdar://12760078


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-10 23:21:26 +00:00

30 lines
902 B
LLVM

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://7396984
@str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
define void @t(i32 %count) ssp nounwind {
entry:
; CHECK: t:
; CHECK: movups L_str+12(%rip), %xmm0
; CHECK: movups L_str(%rip), %xmm1
%tmp0 = alloca [60 x i8], align 1
%tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
br label %bb1
bb1:
; CHECK: LBB0_1:
; CHECK: movups %xmm0, 12(%rsp)
; CHECK: movaps %xmm1, (%rsp)
%tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
%tmp3 = add i32 %tmp2, 1
%tmp4 = icmp eq i32 %tmp3, %count
br i1 %tmp4, label %bb2, label %bb1
bb2:
ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind