Teach the instruction selector how to transform 'array' GEP computations into X86

scaled indexes. This allows us to compile GEPs like this: int* %test([10 x { int, { int } }]* %X, int %Idx) { %Idx = cast int %Idx to long %X = getelementptr [10 x { int, { int } }]* %X, long 0, long %Idx, ubyte 1, ubyte 0 ret int* %X } into a single address computation: test: mov %EAX, DWORD PTR [%ESP + 4] mov %ECX, DWORD PTR [%ESP + 8] lea %EAX, DWORD PTR [%EAX + 8*%ECX + 4] ret Before, it generated: test: mov %EAX, DWORD PTR [%ESP + 4] mov %ECX, DWORD PTR [%ESP + 8] shl %ECX, 3 add %EAX, %ECX lea %EAX, DWORD PTR [%EAX + 4] ret This is useful for things like int/float/double arrays, as the indexing can be folded into the loads and stores, reducing register pressure and decreasing the pressure on the decode unit. With these changes, I expect our performance on 256.bzip2 and gzip to improve a lot. On bzip2, for example, we go from this: 10665 asm-printer - Number of machine instrs printed 40 ra-local - Number of loads/stores folded into instructions 1708 ra-local - Number of loads added 1532 ra-local - Number of stores added 1354 twoaddressinstruction - Number of instructions added 1354 twoaddressinstruction - Number of two-address instructions 2794 x86-peephole - Number of peephole optimization performed to this: 9873 asm-printer - Number of machine instrs printed 41 ra-local - Number of loads/stores folded into instructions 1710 ra-local - Number of loads added 1521 ra-local - Number of stores added 789 twoaddressinstruction - Number of instructions added 789 twoaddressinstruction - Number of two-address instructions 2142 x86-peephole - Number of peephole optimization performed ... and these types of instructions are often in tight loops. Linear scan is also helped, but not as much. 
It goes from: 8787 asm-printer - Number of machine instrs printed 2389 liveintervals - Number of identity moves eliminated after coalescing 2288 liveintervals - Number of interval joins performed 3522 liveintervals - Number of intervals after coalescing 5810 liveintervals - Number of original intervals 700 spiller - Number of loads added 487 spiller - Number of stores added 303 spiller - Number of register spills 1354 twoaddressinstruction - Number of instructions added 1354 twoaddressinstruction - Number of two-address instructions 363 x86-peephole - Number of peephole optimization performed to: 7982 asm-printer - Number of machine instrs printed 1759 liveintervals - Number of identity moves eliminated after coalescing 1658 liveintervals - Number of interval joins performed 3282 liveintervals - Number of intervals after coalescing 4940 liveintervals - Number of original intervals 635 spiller - Number of loads added 452 spiller - Number of stores added 288 spiller - Number of register spills 789 twoaddressinstruction - Number of instructions added 789 twoaddressinstruction - Number of two-address instructions 258 x86-peephole - Number of peephole optimization performed Though I'm not complaining about the drop in the number of intervals. :) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@11820 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-16 23:19:37 +00:00 · 2004-02-25 07:00:55 +00:00 · 2004-02-25 07:00:55 +00:00 · 5f2c7b1975
commit 5f2c7b1975
parent b6bac51351
2 changed files with 46 additions and 48 deletions
--- a/lib/Target/X86/InstSelectSimple.cpp
+++ b/lib/Target/X86/InstSelectSimple.cpp
@@ -2438,11 +2438,30 @@ void ISel::getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
      assert(idx->getType() == Type::LongTy && "Bad GEP array index!");

      // If idx is a constant, fold it into the offset.
+      unsigned TypeSize = TD.getTypeSize(SqTy->getElementType());
      if (ConstantSInt *CSI = dyn_cast<ConstantSInt>(idx)) {
-        Disp += TD.getTypeSize(SqTy->getElementType())*CSI->getValue();
+        Disp += TypeSize*CSI->getValue();
      } else {
-        // If we can't handle it, return.
-        return;
+        // If the index reg is already taken, we can't handle this index.
+        if (IndexReg) return;
+
+        // If this is a size that we can handle, then add the index as 
+        switch (TypeSize) {
+        case 1: case 2: case 4: case 8:
+          // These are all acceptable scales on X86.
+          Scale = TypeSize;
+          break;
+        default:
+          // Otherwise, we can't handle this scale
+          return;
+        }
+
+        if (CastInst *CI = dyn_cast<CastInst>(idx))
+          if (CI->getOperand(0)->getType() == Type::IntTy ||
+              CI->getOperand(0)->getType() == Type::UIntTy)
+            idx = CI->getOperand(0);
+
+        IndexReg = MBB ? getReg(idx, MBB, IP) : 1;
      }

      GEPOps.pop_back();        // Consume a GEP operand
@@ -2456,7 +2475,7 @@ void ISel::getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
  // FIXME: When addressing modes are more powerful/correct, we could load
  // global addresses directly as 32-bit immediates.
  assert(BaseReg == 0);
-  BaseReg = MBB ? getReg(GEPOps[0], MBB, IP) : 0;
+  BaseReg = MBB ? getReg(GEPOps[0], MBB, IP) : 1;
  GEPOps.pop_back();        // Consume the last GEP operand
 }

@@ -2538,26 +2557,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
      }
      break;                // we are now done

-    } else if (const StructType *StTy = dyn_cast<StructType>(GEPTypes.back())) {
-      // It's a struct access.  CUI is the index into the structure,
-      // which names the field. This index must have unsigned type.
-      const ConstantUInt *CUI = cast<ConstantUInt>(GEPOps.back());
-      GEPOps.pop_back();        // Consume a GEP operand
-      GEPTypes.pop_back();
-
-      // Use the TargetData structure to pick out what the layout of the
-      // structure is in memory.  Since the structure index must be constant, we
-      // can get its value and use it to find the right byte offset from the
-      // StructLayout class's list of structure member offsets.
-      unsigned idxValue = CUI->getValue();
-      unsigned FieldOff = TD.getStructLayout(StTy)->MemberOffsets[idxValue];
-      if (FieldOff) {
-        unsigned Reg = makeAnotherReg(Type::UIntTy);
-        // Emit an ADD to add FieldOff to the basePtr.
-        BMI(MBB, IP, X86::ADDri32, 2, TargetReg).addReg(Reg).addZImm(FieldOff);
-        --IP;            // Insert the next instruction before this one.
-        TargetReg = Reg; // Codegen the rest of the GEP into this
-      }
    } else {
      // It's an array or pointer access: [ArraySize x ElementType].
      const SequentialType *SqTy = cast<SequentialType>(GEPTypes.back());
--- a/lib/Target/X86/X86ISelSimple.cpp
+++ b/lib/Target/X86/X86ISelSimple.cpp
@@ -2438,11 +2438,30 @@ void ISel::getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
      assert(idx->getType() == Type::LongTy && "Bad GEP array index!");

      // If idx is a constant, fold it into the offset.
+      unsigned TypeSize = TD.getTypeSize(SqTy->getElementType());
      if (ConstantSInt *CSI = dyn_cast<ConstantSInt>(idx)) {
-        Disp += TD.getTypeSize(SqTy->getElementType())*CSI->getValue();
+        Disp += TypeSize*CSI->getValue();
      } else {
-        // If we can't handle it, return.
-        return;
+        // If the index reg is already taken, we can't handle this index.
+        if (IndexReg) return;
+
+        // If this is a size that we can handle, then add the index as 
+        switch (TypeSize) {
+        case 1: case 2: case 4: case 8:
+          // These are all acceptable scales on X86.
+          Scale = TypeSize;
+          break;
+        default:
+          // Otherwise, we can't handle this scale
+          return;
+        }
+
+        if (CastInst *CI = dyn_cast<CastInst>(idx))
+          if (CI->getOperand(0)->getType() == Type::IntTy ||
+              CI->getOperand(0)->getType() == Type::UIntTy)
+            idx = CI->getOperand(0);
+
+        IndexReg = MBB ? getReg(idx, MBB, IP) : 1;
      }

      GEPOps.pop_back();        // Consume a GEP operand
@@ -2456,7 +2475,7 @@ void ISel::getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
  // FIXME: When addressing modes are more powerful/correct, we could load
  // global addresses directly as 32-bit immediates.
  assert(BaseReg == 0);
-  BaseReg = MBB ? getReg(GEPOps[0], MBB, IP) : 0;
+  BaseReg = MBB ? getReg(GEPOps[0], MBB, IP) : 1;
  GEPOps.pop_back();        // Consume the last GEP operand
 }

@@ -2538,26 +2557,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
      }
      break;                // we are now done

-    } else if (const StructType *StTy = dyn_cast<StructType>(GEPTypes.back())) {
-      // It's a struct access.  CUI is the index into the structure,
-      // which names the field. This index must have unsigned type.
-      const ConstantUInt *CUI = cast<ConstantUInt>(GEPOps.back());
-      GEPOps.pop_back();        // Consume a GEP operand
-      GEPTypes.pop_back();
-
-      // Use the TargetData structure to pick out what the layout of the
-      // structure is in memory.  Since the structure index must be constant, we
-      // can get its value and use it to find the right byte offset from the
-      // StructLayout class's list of structure member offsets.
-      unsigned idxValue = CUI->getValue();
-      unsigned FieldOff = TD.getStructLayout(StTy)->MemberOffsets[idxValue];
-      if (FieldOff) {
-        unsigned Reg = makeAnotherReg(Type::UIntTy);
-        // Emit an ADD to add FieldOff to the basePtr.
-        BMI(MBB, IP, X86::ADDri32, 2, TargetReg).addReg(Reg).addZImm(FieldOff);
-        --IP;            // Insert the next instruction before this one.
-        TargetReg = Reg; // Codegen the rest of the GEP into this
-      }
    } else {
      // It's an array or pointer access: [ArraySize x ElementType].
      const SequentialType *SqTy = cast<SequentialType>(GEPTypes.back());