[CodeGen] In narrowExtractedVectorLoad bail out for scalable vectors
In narrowExtractedVectorLoad there is an optimisation that tries to combine extract_subvector with a narrowing vector load. At the moment this produces warnings due to incorrect calls to getVectorNumElements() for scalable vector types. I've got this working for scalable vectors too, when the extract subvector index is a multiple of the minimum number of elements.

I have added a new variant of MachineFunction::getMachineMemOperand that copies an existing MachineMemOperand but replaces the pointer info with a null version, since we cannot currently represent scaled offsets.

I've added a new test for this particular case in:

  CodeGen/AArch64/sve-extract-subvector.ll

Differential Revision: https://reviews.llvm.org/D83950
parent 850e6d4aa1
commit 8fe44dff5f
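As a worked illustration of the offset rule described in the commit message (a standalone sketch, not part of the patch; the concrete numbers assume the <vscale x 8 x float> to <vscale x 4 x float> case exercised by the new test below):

    // Standalone C++ sketch: extracting the subvector at index 4 from a
    // <vscale x 8 x float> load, producing a <vscale x 4 x float>. The index
    // must be a multiple of the result's *minimum* element count, so the byte
    // offset is a whole number of scalable vector registers and maps onto
    // AArch64's "[xN, #imm, mul vl]" addressing mode.
    #include <cassert>
    #include <iostream>

    int main() {
      unsigned Index = 4;      // constant index operand of extract_subvector
      unsigned MinNumElts = 4; // minimum element count of the extracted type
      unsigned BytesPerVl = MinNumElts * sizeof(float); // 16 bytes per "vl"

      assert(Index % MinNumElts == 0 &&
             "extract_subvector index must be a multiple of MinNumElts");

      unsigned VlMultiplier = Index / MinNumElts; // multiplier of "mul vl"
      std::cout << "offset = vscale x " << VlMultiplier * BytesPerVl
                << " bytes, i.e. [x0, #" << VlMultiplier << ", mul vl]\n";
      return 0;
    }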
include/llvm/CodeGen/MachineFunction.h
@@ -815,6 +815,14 @@ public:
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           int64_t Offset, uint64_t Size);
 
+  /// getMachineMemOperand - Allocate a new MachineMemOperand by copying
+  /// an existing one, replacing only the MachinePointerInfo and size.
+  /// MachineMemOperands are owned by the MachineFunction and need not be
+  /// explicitly deallocated.
+  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
+                                          MachinePointerInfo &PtrInfo,
+                                          uint64_t Size);
+
   /// Allocate a new MachineMemOperand by copying an existing one,
   /// replacing only AliasAnalysis information. MachineMemOperands are owned
   /// by the MachineFunction and need not be explicitly deallocated.
lib/CodeGen/MachineFunction.cpp
@@ -474,6 +474,13 @@ MachineMemOperand *MachineFunction::getMachineMemOperand(
                                            SSID, Ordering, FailureOrdering);
 }
 
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+    const MachineMemOperand *MMO, MachinePointerInfo &PtrInfo, uint64_t Size) {
+  return new (Allocator) MachineMemOperand(
+      PtrInfo, MMO->getFlags(), Size, MMO->getBaseAlign(), AAMDNodes(), nullptr,
+      MMO->getSyncScopeID(), MMO->getOrdering(), MMO->getFailureOrdering());
+}
+
 MachineMemOperand *
 MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
                                       int64_t Offset, uint64_t Size) {
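To make the intent of the new overload concrete, here is a minimal usage sketch (assuming, as in the DAGCombiner change below, a LoadSDNode *Ld, a MachineFunction &MF, a TypeSize Offset and a uint64_t StoreSize are already in scope):

    // If the byte offset is scalable it cannot be represented in the
    // MachinePointerInfo, so copy the MMO with a pointer info that keeps only
    // the address space; otherwise the existing (MMO, Offset, Size) overload
    // is still the right tool.
    MachineMemOperand *NewMMO;
    if (Offset.isScalable()) {
      MachinePointerInfo MPI(Ld->getPointerInfo().getAddrSpace());
      NewMMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize);
    } else {
      NewMMO = MF.getMachineMemOperand(Ld->getMemOperand(),
                                       Offset.getFixedSize(), StoreSize);
    }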
lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19338,19 +19338,15 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
     return SDValue();
 
   unsigned Index = ExtIdx->getZExtValue();
-  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumElts = VT.getVectorMinNumElements();
 
-  // If the index is a multiple of the extract element count, we can offset the
-  // address by the store size multiplied by the subvector index. Otherwise if
-  // the scalar type is byte sized, we can just use the index multiplied by
-  // the element size in bytes as the offset.
-  unsigned Offset;
-  if (Index % NumElts == 0)
-    Offset = (Index / NumElts) * VT.getStoreSize();
-  else if (VT.getScalarType().isByteSized())
-    Offset = Index * VT.getScalarType().getStoreSize();
-  else
-    return SDValue();
+  // The definition of EXTRACT_SUBVECTOR states that the index must be a
+  // multiple of the minimum number of elements in the result type.
+  assert(Index % NumElts == 0 && "The extract subvector index is not a "
+                                 "multiple of the result's element count");
+
+  // It's fine to use TypeSize here as we know the offset will not be negative.
+  TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
@@ -19359,14 +19355,21 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // The narrow load will be offset from the base address of the old load if
   // we are extracting from something besides index 0 (little-endian).
   SDLoc DL(Extract);
-  SDValue BaseAddr = Ld->getBasePtr();
 
   // TODO: Use "BaseIndexOffset" to make this more effective.
-  SDValue NewAddr =
-      DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
+  SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);
+
+  uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
   MachineFunction &MF = DAG.getMachineFunction();
-  MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
-                                                   VT.getStoreSize());
+  MachineMemOperand *MMO;
+  if (Offset.isScalable()) {
+    MachinePointerInfo MPI =
+        MachinePointerInfo(Ld->getPointerInfo().getAddrSpace());
+    MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize);
+  } else
+    MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(),
+                                  StoreSize);
+
   SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
   DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
   return NewLd;
test/CodeGen/AArch64/sve-extract-subvector.ll
@@ -64,7 +64,19 @@ define <vscale x 2 x float> @extract_hi_nxv2f32_nxv4f32(<vscale x 4 x float> %z0
   ret <vscale x 2 x float> %ext
 }
 
+define <vscale x 4 x float> @load_extract_nxv4f32_nxv8f32(<vscale x 8 x float>* %p) {
+; CHECK-LABEL: load_extract_nxv4f32_nxv8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %tmp1 = load <vscale x 8 x float>, <vscale x 8 x float>* %p, align 16
+  %tmp2 = call <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv8f32(<vscale x 8 x float> %tmp1, i32 1)
+  ret <vscale x 4 x float> %tmp2
+}
+
 declare <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv4i64(<vscale x 4 x i64>, i32)
 declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8>, i32)
 declare <vscale x 2 x float> @llvm.aarch64.sve.tuple.get.nxv4f32(<vscale x 4 x float>, i32)
 declare <vscale x 4 x half> @llvm.aarch64.sve.tuple.get.nxv8f16(<vscale x 8 x half>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.tuple.get.nxv8f32(<vscale x 8 x float>, i32)