[XCore] Target optimized library function __memcpy_4()

Summary: If the src, dst and size of a memcpy are known to be 4 byte aligned we can call __memcpy_4() instead of memcpy(). Reviewers: robertlytton Reviewed By: robertlytton CC: llvm-commits Differential Revision: http://llvm-reviews.chandlerc.com/D2871 llvm-svn: 202395
2024-12-11 13:37:07 +00:00 · 2014-02-27 13:39:07 +00:00 · 2014-02-27 13:39:07 +00:00 · 5ac74685fd
commit 5ac74685fd
parent 75c16f2bf4
4 changed files with 75 additions and 1 deletions
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@ -21,3 +21,36 @@ XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
 // Out-of-line destructor; its body is empty.
 XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {}
 // Lower a memcpy for XCore.
 //
 // When the copy is not forced inline, Align is a multiple of 4 and the two
 // low bits of Size are known to be zero, a call to the library routine
 // __memcpy_4 is emitted and the second element of the pair produced by
 // LowerCallTo is returned. In every other case an empty SDValue is returned
 // so that the target-independent code calls memcpy instead.
 //
 // isVolatile, DstPtrInfo and SrcPtrInfo are not consulted by this hook.
 SDValue XCoreSelectionDAGInfo::
 EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                         SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
                         bool isVolatile, bool AlwaysInline,
                         MachinePointerInfo DstPtrInfo,
                         MachinePointerInfo SrcPtrInfo) const
 {
   // A library call is only an option when inlining is not mandatory and the
   // alignment is a multiple of 4.
   // NOTE(review): Align == 0 also satisfies this test - confirm callers never
   // pass 0 to mean "unknown alignment".
   if (AlwaysInline || (Align & 3) != 0)
     return SDValue();

   // The byte count must provably have its two low bits clear as well.
   const unsigned SizeBits = Size.getValueType().getSizeInBits();
   const APInt LowTwoBits(SizeBits, 3);
   if (!DAG.MaskedValueIsZero(Size, LowTwoBits))
     return SDValue();

   const TargetLowering &TLI = *DAG.getTarget().getTargetLowering();

   // All three operands (dst, src, size) are passed with the pointer-sized
   // integer type.
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
   Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
   const SDValue Operands[] = { Dst, Src, Size };
   for (unsigned Idx = 0; Idx != 3; ++Idx) {
     Entry.Node = Operands[Idx];
     Args.push_back(Entry);
   }

   // Build the call to __memcpy_4 using the calling convention registered for
   // the MEMCPY libcall; the call's return value is not used.
   Type *VoidTy = Type::getVoidTy(*DAG.getContext());
   SDValue Callee = DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy());
   TargetLowering::CallLoweringInfo
   CLI(Chain, VoidTy, false, false, false, false,
       0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false,
       /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
       Callee, Args, DAG, dl);

   return TLI.LowerCallTo(CLI).second;
 }
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@ -24,6 +24,15 @@ class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
  /// Constructs the XCore SelectionDAG info object; takes the XCore target
  /// machine \p TM.
  explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
  /// Destructor (defined out of line in XCoreSelectionDAGInfo.cpp).
  ~XCoreSelectionDAGInfo();
  /// XCore hook for lowering a memcpy of \p Size bytes from \p Src to \p Dst.
  ///
  /// The definition in XCoreSelectionDAGInfo.cpp emits a call to __memcpy_4
  /// when \p AlwaysInline is false, \p Align is a multiple of 4 and the two
  /// low bits of \p Size are known to be zero. Otherwise it returns an empty
  /// SDValue so that the target-independent code calls memcpy.
  ///
  /// Parameter names mirror the out-of-line definition (Dst, Src, Size).
  virtual SDValue
  EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                          SDValue Chain,
                          SDValue Dst, SDValue Src,
                          SDValue Size, unsigned Align, bool isVolatile,
                          bool AlwaysInline,
                          MachinePointerInfo DstPtrInfo,
                          MachinePointerInfo SrcPtrInfo) const;
 };
 }
--- a/test/CodeGen/XCore/byVal.ll
+++ b/test/CodeGen/XCore/byVal.ll
@ -20,7 +20,7 @@ entry:
 ; CHECK: ldaw r5, sp[1]
 ; CHECK: ldc r2, 40
 ; CHECK: mov r0, r5
-; CHECK: bl memcpy
+; CHECK: bl __memcpy_4
 ; CHECK: mov r0, r5
 ; CHECK: bl f1
 ; CHECK: mov r0, r4
--- a/test/CodeGen/XCore/memcpy.ll
+++ b/test/CodeGen/XCore/memcpy.ll
@ -0,0 +1,32 @@
 ; RUN: llc < %s -march=xcore | FileCheck %s
 ; Optimize memcpy to __memcpy_4 if src, dst and size are all 4 byte aligned.
 define void @f1(i8* %dst, i8* %src, i32 %n) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK: bl __memcpy_4
 entry:
  ; Shifting %n left by 2 leaves the two low bits of the size zero, so the
  ; size is a multiple of 4; the alignment operand of the call is 4 as well.
  %0 = shl i32 %n, 2
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 4, i1 false)
  ret void
 }
 ; Can't optimize - size is not a multiple of 4.
 define void @f2(i8* %dst, i8* %src, i32 %n) nounwind {
 ; CHECK-LABEL: f2:
 ; CHECK: bl memcpy
 entry:
  ; %n is used as the size unmodified, so nothing is known about its low bits
  ; even though the alignment operand is 4.
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %n, i32 4, i1 false)
  ret void
 }
 ; Can't optimize - alignment is not a multiple of 4.
 define void @f3(i8* %dst, i8* %src, i32 %n) nounwind {
 ; CHECK-LABEL: f3:
 ; CHECK: bl memcpy
 entry:
  ; The size is a multiple of 4 (shifted left by 2), but the alignment operand
  ; of the call is only 2.
  %0 = shl i32 %n, 2
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 2, i1 false)
  ret void
 }
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind