[AVR] Expand large shifts early in IR

This patch makes sure shift instructions such as this one: %result = shl i32 %n, %amount are expanded just before the IR to SelectionDAG conversion to a loop so that calls to non-existing library functions such as __ashlsi3 are avoided. The generated code is currently pretty bad but there's a lot of room for improvement: the shift itself can be done in just four instructions. Differential Revision: https://reviews.llvm.org/D96677
2024-11-30 17:21:10 +00:00 · 2021-02-14 23:29:59 +01:00 · 2021-02-14 23:29:59 +01:00 · 6aa9e746eb
commit 6aa9e746eb
parent 431a941465
5 changed files with 250 additions and 0 deletions
--- a/llvm/lib/Target/AVR/AVR.h
+++ b/llvm/lib/Target/AVR/AVR.h
@ -22,6 +22,7 @@ namespace llvm {
 class AVRTargetMachine;
 class FunctionPass;

+Pass *createAVRShiftExpandPass();
 FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
                               CodeGenOpt::Level OptLevel);
 FunctionPass *createAVRExpandPseudoPass();
@ -30,6 +31,7 @@ FunctionPass *createAVRRelaxMemPass();
 FunctionPass *createAVRDynAllocaSRPass();
 FunctionPass *createAVRBranchSelectionPass();

+void initializeAVRShiftExpandPass(PassRegistry &);
 void initializeAVRExpandPseudoPass(PassRegistry&);
 void initializeAVRRelaxMemPass(PassRegistry&);

--- a/llvm/lib/Target/AVR/AVRShiftExpand.cpp
+++ b/llvm/lib/Target/AVR/AVRShiftExpand.cpp
@ -0,0 +1,147 @@
+//===- AVRShift.cpp - Shift Expansion Pass --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Expand 32-bit shift instructions (shl, lshr, ashr) to inline loops, just
+/// like avr-gcc. This must be done in IR because otherwise the type legalizer
+/// will turn 32-bit shifts into (non-existing) library calls such as __ashlsi3.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+
+using namespace llvm;
+
+namespace {
+
+class AVRShiftExpand : public FunctionPass {
+public:
+  static char ID;
+
+  AVRShiftExpand() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "AVR Shift Expansion"; }
+
+private:
+  void expand(BinaryOperator *BI);
+};
+
+} // end of anonymous namespace
+
+char AVRShiftExpand::ID = 0;
+
+INITIALIZE_PASS(AVRShiftExpand, "avr-shift-expand", "AVR Shift Expansion",
+                false, false)
+
+Pass *llvm::createAVRShiftExpandPass() { return new AVRShiftExpand(); }
+
+bool AVRShiftExpand::runOnFunction(Function &F) {
+  SmallVector<BinaryOperator *, 1> ShiftInsts;
+  auto &Ctx = F.getContext();
+  for (Instruction &I : instructions(F)) {
+    if (!I.isShift())
+      // Only expand shift instructions (shl, lshr, ashr).
+      continue;
+    if (I.getType() != Type::getInt32Ty(Ctx))
+      // Only expand plain i32 types.
+      continue;
+    if (isa<ConstantInt>(I.getOperand(1)))
+      // Only expand when the shift amount is not known.
+      // Known shift amounts are (currently) better expanded inline.
+      continue;
+    ShiftInsts.push_back(cast<BinaryOperator>(&I));
+  }
+
+  // The expanding itself needs to be done separately as expand() will remove
+  // these instructions. Removing instructions while iterating over a basic
+  // block is not a great idea.
+  for (auto *I : ShiftInsts) {
+    expand(I);
+  }
+
+  // Return whether this function expanded any shift instructions.
+  return ShiftInsts.size() > 0;
+}
+
+void AVRShiftExpand::expand(BinaryOperator *BI) {
+  auto &Ctx = BI->getContext();
+  IRBuilder<> Builder(BI);
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  Type *Int8Ty = Type::getInt8Ty(Ctx);
+  Value *Int8Zero = ConstantInt::get(Int8Ty, 0);
+
+  // Split the current basic block at the point of the existing shift
+  // instruction and insert a new basic block for the loop.
+  BasicBlock *BB = BI->getParent();
+  Function *F = BB->getParent();
+  BasicBlock *EndBB = BB->splitBasicBlock(BI, "shift.done");
+  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "shift.loop", F, EndBB);
+
+  // Truncate the shift amount to i8, which is trivially lowered to a single
+  // AVR register.
+  Builder.SetInsertPoint(&BB->back());
+  Value *ShiftAmount = Builder.CreateTrunc(BI->getOperand(1), Int8Ty);
+
+  // Replace the unconditional branch that splitBasicBlock created with a
+  // conditional branch.
+  Value *Cmp1 = Builder.CreateICmpEQ(ShiftAmount, Int8Zero);
+  Builder.CreateCondBr(Cmp1, EndBB, LoopBB);
+  BB->back().eraseFromParent();
+
+  // Create the loop body starting with PHI nodes.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *ShiftAmountPHI = Builder.CreatePHI(Int8Ty, 2);
+  ShiftAmountPHI->addIncoming(ShiftAmount, BB);
+  PHINode *ValuePHI = Builder.CreatePHI(Int32Ty, 2);
+  ValuePHI->addIncoming(BI->getOperand(0), BB);
+
+  // Subtract the shift amount by one, as we're shifting one this loop
+  // iteration.
+  Value *ShiftAmountSub =
+      Builder.CreateSub(ShiftAmountPHI, ConstantInt::get(Int8Ty, 1));
+  ShiftAmountPHI->addIncoming(ShiftAmountSub, LoopBB);
+
+  // Emit the actual shift instruction. The difference is that this shift
+  // instruction has a constant shift amount, which can be emitted inline
+  // without a library call.
+  Value *ValueShifted;
+  switch (BI->getOpcode()) {
+  case Instruction::Shl:
+    ValueShifted = Builder.CreateShl(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  case Instruction::LShr:
+    ValueShifted = Builder.CreateLShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  case Instruction::AShr:
+    ValueShifted = Builder.CreateAShr(ValuePHI, ConstantInt::get(Int32Ty, 1));
+    break;
+  default:
+    llvm_unreachable("asked to expand an instruction that is not a shift");
+  }
+  ValuePHI->addIncoming(ValueShifted, LoopBB);
+
+  // Branch to either the loop again (if there is more to shift) or to the
+  // basic block after the loop (if all bits are shifted).
+  Value *Cmp2 = Builder.CreateICmpEQ(ShiftAmountSub, Int8Zero);
+  Builder.CreateCondBr(Cmp2, EndBB, LoopBB);
+
+  // Collect the resulting value. This is necessary in the IR but won't produce
+  // any actual instructions.
+  Builder.SetInsertPoint(BI);
+  PHINode *Result = Builder.CreatePHI(Int32Ty, 2);
+  Result->addIncoming(BI->getOperand(0), BB);
+  Result->addIncoming(ValueShifted, LoopBB);
+
+  // Replace the original shift instruction.
+  BI->replaceAllUsesWith(Result);
+  BI->eraseFromParent();
+}
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@ -65,6 +65,7 @@ public:
    return getTM<AVRTargetMachine>();
  }

+  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
@ -76,6 +77,15 @@ TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new AVRPassConfig(*this, PM);
 }

+void AVRPassConfig::addIRPasses() {
+  // Expand instructions like
+  //   %result = shl i32 %n, %amount
+  // to a loop so that library calls are avoided.
+  addPass(createAVRShiftExpandPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
  // Register the target.
  RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
@ -83,6 +93,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
  auto &PR = *PassRegistry::getPassRegistry();
  initializeAVRExpandPseudoPass(PR);
  initializeAVRRelaxMemPass(PR);
+  initializeAVRShiftExpandPass(PR);
 }

 const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
--- a/llvm/lib/Target/AVR/CMakeLists.txt
+++ b/llvm/lib/Target/AVR/CMakeLists.txt
@ -24,6 +24,7 @@ add_llvm_target(AVRCodeGen
  AVRMCInstLower.cpp
  AVRRelaxMemOperations.cpp
  AVRRegisterInfo.cpp
+  AVRShiftExpand.cpp
  AVRSubtarget.cpp
  AVRTargetMachine.cpp
  AVRTargetObjectFile.cpp
--- a/llvm/test/CodeGen/AVR/shift-expand.ll
+++ b/llvm/test/CodeGen/AVR/shift-expand.ll
@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -avr-shift-expand -S %s -o - | FileCheck %s
+
+; The avr-shift-expand pass expands large shifts with a non-constant shift
+; amount to a loop. These loops avoid generating a (non-existing) builtin such
+; as __ashlsi3.
+
+target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
+target triple = "avr"
+
+define i32 @shl(i32 %value, i32 %amount) addrspace(1) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]]
+; CHECK:       shift.loop:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP5]] = sub i8 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP6]] = shl i32 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]]
+; CHECK:       shift.done:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %result = shl i32 %value, %amount
+  ret i32 %result
+}
+
+define i32 @lshr(i32 %value, i32 %amount) addrspace(1) {
+; CHECK-LABEL: @lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]]
+; CHECK:       shift.loop:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP5]] = sub i8 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP6]] = lshr i32 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]]
+; CHECK:       shift.done:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %result = lshr i32 %value, %amount
+  ret i32 %result
+}
+
+define i32 @ashr(i32 %0, i32 %1) addrspace(1) {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP1:%.*]] to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]]
+; CHECK:       shift.loop:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i8 [ [[TMP3]], [[TMP2:%.*]] ], [ [[TMP7:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ [[TMP0:%.*]], [[TMP2]] ], [ [[TMP8:%.*]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    [[TMP7]] = sub i8 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8]] = ashr i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]]
+; CHECK:       shift.done:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP0]], [[TMP2]] ], [ [[TMP8]], [[SHIFT_LOOP]] ]
+; CHECK-NEXT:    ret i32 [[TMP10]]
+;
+  %3 = ashr i32 %0, %1
+  ret i32 %3
+}
+
+; This function is not modified because it is not an i32.
+define i40 @shl40(i40 %value, i40 %amount) addrspace(1) {
+; CHECK-LABEL: @shl40(
+; CHECK-NEXT:    [[RESULT:%.*]] = shl i40 [[VALUE:%.*]], [[AMOUNT:%.*]]
+; CHECK-NEXT:    ret i40 [[RESULT]]
+;
+  %result = shl i40 %value, %amount
+  ret i40 %result
+}
+
+; This function isn't either, although perhaps it should.
+define i24 @shl24(i24 %value, i24 %amount) addrspace(1) {
+; CHECK-LABEL: @shl24(
+; CHECK-NEXT:    [[RESULT:%.*]] = shl i24 [[VALUE:%.*]], [[AMOUNT:%.*]]
+; CHECK-NEXT:    ret i24 [[RESULT]]
+;
+  %result = shl i24 %value, %amount
+  ret i24 %result
+}