[IR] Enable load/store/alloca for arrays of scalable vectors.

Differential Revision: https://reviews.llvm.org/D158517
Paul Walker 2023-08-11 14:39:17 +01:00
parent 02d27eac0f
commit c7d65e4466
22 changed files with 324 additions and 56 deletions

View File

@ -742,16 +742,16 @@ an optional list of attached :ref:`metadata <metadata>`.
Variables and aliases can have a
:ref:`Thread Local Storage Model <tls_model>`.
:ref:`Scalable vectors <t_vector>` cannot be global variables or members of
arrays because their size is unknown at compile time. They are allowed in
structs to facilitate intrinsics returning multiple values. Generally, structs
containing scalable vectors are not considered "sized" and cannot be used in
loads, stores, allocas, or GEPs. The only exception to this rule is for structs
that contain scalable vectors of the same type (e.g. ``{<vscale x 2 x i32>,
<vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>,
<vscale x 2 x i64>}`` doesn't). These kinds of structs (we may call them
homogeneous scalable vector structs) are considered sized and can be used in
loads, stores, allocas, but not GEPs.
Globals cannot be or contain :ref:`Scalable vectors <t_vector>` because their
size is unknown at compile time. They are allowed in structs to facilitate
intrinsics returning multiple values. Generally, structs containing scalable
vectors are not considered "sized" and cannot be used in loads, stores, allocas,
or GEPs. The only exception to this rule is for structs that contain scalable
vectors of the same type (e.g. ``{<vscale x 2 x i32>, <vscale x 2 x i32>}``
contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}``
doesn't). These kinds of structs (we may call them homogeneous scalable vector
structs) are considered sized and can be used in loads, stores, allocas, but
not GEPs.
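As a minimal sketch of the rules above (type, value, and function names are illustrative; the accepted operations mirror the tests added later in this commit):

%hsvs = type { <vscale x 2 x i32>, <vscale x 2 x i32> }   ; homogeneous scalable vector struct
%arr = type [3 x <vscale x 2 x double>]                    ; array of scalable vectors

define void @example(ptr %src) {
  %dst = alloca %arr, align 8        ; allowed: alloca of an array of scalable vectors
  %v = load %arr, ptr %src           ; allowed: load
  store %arr %v, ptr %dst, align 8   ; allowed: store
  %s = load %hsvs, ptr %src          ; allowed: homogeneous structs are sized too, but not GEP-able
  ret void
}

; Still invalid: globals cannot be or contain scalable vectors.
; @g = global %arr zeroinitializer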
Syntax::

View File

@ -209,8 +209,7 @@ public:
/// Return true if this is a target extension type with a scalable layout.
bool isScalableTargetExtTy() const;
/// Return true if this is a scalable vector type or a target extension type
/// with a scalable layout.
/// Return true if this is a type whose size is a known multiple of vscale.
bool isScalableTy() const;
/// Return true if this is a FP type or a vector of FP.

View File

@ -4934,7 +4934,7 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr,
return UndefValue::get(GEPTy);
bool IsScalableVec =
isa<ScalableVectorType>(SrcTy) || any_of(Indices, [](const Value *V) {
SrcTy->isScalableTy() || any_of(Indices, [](const Value *V) {
return isa<ScalableVectorType>(V->getType());
});

View File

@ -127,9 +127,7 @@ bool GEPOperator::accumulateConstantOffset(
auto end = generic_gep_type_iterator<decltype(Index.end())>::end(Index.end());
for (auto GTI = begin, GTE = end; GTI != GTE; ++GTI) {
// Scalable vectors are multiplied by a runtime constant.
bool ScalableType = false;
if (isa<ScalableVectorType>(GTI.getIndexedType()))
ScalableType = true;
bool ScalableType = GTI.getIndexedType()->isScalableTy();
Value *V = GTI.getOperand();
StructType *STy = GTI.getStructTypeOrNull();
@ -189,7 +187,7 @@ bool GEPOperator::collectOffset(
for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
GTI != GTE; ++GTI) {
// Scalable vectors are multiplied by a runtime constant.
bool ScalableType = isa<ScalableVectorType>(GTI.getIndexedType());
bool ScalableType = GTI.getIndexedType()->isScalableTy();
Value *V = GTI.getOperand();
StructType *STy = GTI.getStructTypeOrNull();
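For context, the broader isScalableTy() check matters here because a constant index over an array of scalable vectors still yields a vscale-scaled byte offset, so it cannot be folded into a fixed constant. A small sketch (function name is hypothetical):

define ptr @offset_is_scalable(ptr %p) {
  ; The byte offset of %q is vscale * 64 (4 elements, each vscale * 16 bytes):
  ; a runtime multiple of vscale, not a compile-time constant, even though
  ; the GEP index itself is the constant 1.
  %q = getelementptr [4 x <vscale x 4 x i32>], ptr %p, i64 1
  ret ptr %q
}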

View File

@ -58,6 +58,8 @@ bool Type::isIntegerTy(unsigned Bitwidth) const {
}
bool Type::isScalableTy() const {
if (const auto *ATy = dyn_cast<ArrayType>(this))
return ATy->getElementType()->isScalableTy();
if (const auto *STy = dyn_cast<StructType>(this)) {
SmallPtrSet<Type *, 4> Visited;
return STy->containsScalableVectorType(&Visited);
@ -658,8 +660,7 @@ ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) {
bool ArrayType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
!ElemTy->isTokenTy() && !ElemTy->isX86_AMXTy() &&
!isa<ScalableVectorType>(ElemTy);
!ElemTy->isTokenTy() && !ElemTy->isX86_AMXTy();
}
//===----------------------------------------------------------------------===//

View File

@ -850,17 +850,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
}
// Scalable vectors cannot be global variables, since we don't know
// the runtime size. If the global is an array containing scalable vectors,
// that will be caught by the isValidElementType methods in StructType or
// ArrayType instead.
Check(!isa<ScalableVectorType>(GV.getValueType()),
"Globals cannot contain scalable vectors", &GV);
if (auto *STy = dyn_cast<StructType>(GV.getValueType())) {
SmallPtrSet<Type *, 4> Visited;
Check(!STy->containsScalableVectorType(&Visited),
"Globals cannot contain scalable vectors", &GV);
}
// the runtime size.
Check(!GV.getValueType()->isScalableTy(),
"Globals cannot contain scalable types", &GV);
// Check if it's a target extension type that disallows being used as a
// global.
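With the single isScalableTy() check, the direct, array, and struct cases are all rejected with the same message. Roughly (global names are illustrative; compare the verifier test updated at the end of this diff):

; All three are rejected with "Globals cannot contain scalable types".
@direct = global <vscale x 4 x i32> zeroinitializer
@in_array = global [8 x <vscale x 4 x i32>] zeroinitializer
@in_struct = global { i32, <vscale x 4 x i32> } zeroinitializer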

View File

@ -390,7 +390,7 @@ static bool collectSRATypes(DenseMap<uint64_t, GlobalPart> &Parts,
}
// Scalable types not currently supported.
if (isa<ScalableVectorType>(Ty))
if (Ty->isScalableTy())
return false;
auto IsStored = [](Value *V, Constant *Initializer) {

View File

@ -804,7 +804,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
return nullptr;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(ET);
TypeSize EltSize = DL.getTypeAllocSize(ET);
const auto Align = LI.getAlign();
auto *Addr = LI.getPointerOperand();
@ -812,7 +812,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
auto *Zero = ConstantInt::get(IdxType, 0);
Value *V = PoisonValue::get(T);
uint64_t Offset = 0;
TypeSize Offset = TypeSize::get(0, ET->isScalableTy());
for (uint64_t i = 0; i < NumElements; i++) {
Value *Indices[2] = {
Zero,
@ -820,9 +820,9 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
};
auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices),
Name + ".elt");
auto EltAlign = commonAlignment(Align, Offset.getKnownMinValue());
auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
commonAlignment(Align, Offset),
Name + ".unpack");
EltAlign, Name + ".unpack");
L->setAAMetadata(LI.getAAMetadata());
V = IC.Builder.CreateInsertValue(V, L, i);
Offset += EltSize;
@ -957,7 +957,7 @@ static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
Type *SourceElementType = GEPI->getSourceElementType();
// Size information about scalable vectors is not available, so we cannot
// deduce whether indexing at n is undefined behaviour or not. Bail out.
if (isa<ScalableVectorType>(SourceElementType))
if (SourceElementType->isScalableTy())
return false;
Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
@ -1323,7 +1323,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
return false;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(AT->getElementType());
TypeSize EltSize = DL.getTypeAllocSize(AT->getElementType());
const auto Align = SI.getAlign();
SmallString<16> EltName = V->getName();
@ -1335,7 +1335,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
auto *IdxType = Type::getInt64Ty(T->getContext());
auto *Zero = ConstantInt::get(IdxType, 0);
uint64_t Offset = 0;
TypeSize Offset = TypeSize::get(0, AT->getElementType()->isScalableTy());
for (uint64_t i = 0; i < NumElements; i++) {
Value *Indices[2] = {
Zero,
@ -1344,7 +1344,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
auto *Ptr =
IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices), AddrName);
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
auto EltAlign = commonAlignment(Align, Offset);
auto EltAlign = commonAlignment(Align, Offset.getKnownMinValue());
Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
NS->setAAMetadata(SI.getAAMetadata());
Offset += EltSize;
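Tracking Offset as a TypeSize lets these unpacking loops handle scalable element types; the per-element alignment is derived from the offset's known-minimum byte value, which is safe because the actual offset here is that minimum scaled by vscale. A sketch of the expected rewrite for element 1 of a [2 x <vscale x 4 x i32>] load (mirroring the new InstCombine test later in this diff):

define <vscale x 4 x i32> @unpacked(ptr %x) {
  ; Element 1 lives at byte offset vscale x 16; its known-minimum value (16)
  ; is what the element load's alignment is computed from.
  %elt1 = getelementptr inbounds [2 x <vscale x 4 x i32>], ptr %x, i64 0, i64 1
  %val = load <vscale x 4 x i32>, ptr %elt1, align 16
  ret <vscale x 4 x i32> %val
}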

View File

@ -2005,7 +2005,7 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0);
if (NumVarIndices != Src->getNumIndices()) {
// FIXME: getIndexedOffsetInType() does not handle scalable vectors.
if (isa<ScalableVectorType>(BaseType))
if (BaseType->isScalableTy())
return nullptr;
SmallVector<Value *> ConstantIndices;
@ -2118,7 +2118,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value *, 8> Indices(GEP.indices());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
bool IsGEPSrcEleScalable = GEPEltType->isScalableTy();
if (Value *V = simplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(),
SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);

View File

@ -830,7 +830,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
if (GTI.isSequential()) {
// Constant offsets of scalable types are not really constant.
if (isa<ScalableVectorType>(GTI.getIndexedType()))
if (GTI.getIndexedType()->isScalableTy())
continue;
// Tries to extract a constant offset from this GEP index.
@ -1019,7 +1019,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
if (GTI.isSequential()) {
// Constant offsets of scalable types are not really constant.
if (isa<ScalableVectorType>(GTI.getIndexedType()))
if (GTI.getIndexedType()->isScalableTy())
continue;
// Splits this GEP index into a variadic part and a constant offset, and

View File

@ -0,0 +1,106 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
%my_subtype = type <vscale x 2 x double>
%my_type = type [3 x %my_subtype]
define void @array_1D(ptr %addr) #0 {
; CHECK-LABEL: array_1D:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%ret = alloca %my_type, align 8
%val = load %my_type, ptr %addr
store %my_type %val, ptr %ret, align 8
ret void
}
define %my_subtype @array_1D_extract(ptr %addr) #0 {
; CHECK-LABEL: array_1D_extract:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%ret = alloca %my_type, align 8
%val = load %my_type, ptr %addr
%elt = extractvalue %my_type %val, 1
ret %my_subtype %elt
}
define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
; CHECK-LABEL: array_1D_insert:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [sp]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%ret = alloca %my_type, align 8
%val = load %my_type, ptr %addr
%ins = insertvalue %my_type %val, %my_subtype %elt, 1
store %my_type %ins, ptr %ret, align 8
ret void
}
define void @array_2D(ptr %addr) #0 {
; CHECK-LABEL: array_2D:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-6
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #5, mul vl]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z5.d }, p0, [sp]
; CHECK-NEXT: st1d { z4.d }, p0, [sp, #5, mul vl]
; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%ret = alloca [2 x %my_type], align 8
%val = load [2 x %my_type], ptr %addr
store [2 x %my_type] %val, ptr %ret, align 8
ret void
}
attributes #0 = { "target-features"="+sve" }

View File

@ -0,0 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs --riscv-no-aliases < %s | FileCheck %s
target triple = "riscv64-unknown-unknown-elf"
%my_type = type [3 x <vscale x 1 x double>]
define void @test(ptr %addr) {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrrs a1, vlenb, zero
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; CHECK-NEXT: csrrs a1, vlenb, zero
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: vl1re64.v v8, (a2)
; CHECK-NEXT: slli a2, a1, 1
; CHECK-NEXT: vl1re64.v v9, (a0)
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: vl1re64.v v10, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs1r.v v9, (a0)
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: vs1r.v v10, (a2)
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vs1r.v v8, (a0)
; CHECK-NEXT: csrrs a0, vlenb, zero
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: jalr zero, 0(ra)
entry:
%ret = alloca %my_type, align 8
%val = load %my_type, ptr %addr
store %my_type %val, ptr %ret, align 8
ret void
}

View File

@ -1,8 +0,0 @@
; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
;; Arrays cannot contain scalable vectors; make sure we detect them even
;; when nested inside other aggregates.
%ty = type { i64, [4 x <vscale x 256 x i1>] }
; CHECK: error: invalid array element type
; CHECK: %ty = type { i64, [4 x <vscale x 256 x i1>] }

View File

@ -52,6 +52,12 @@ define void @gep_cse_offset_canonicalization(ptr %p, i64 %idx, i64 %idx2) {
; CHECK-NEXT: call void @use(ptr [[GEP5]])
; CHECK-NEXT: call void @use(ptr [[GEP5_SAME]])
; CHECK-NEXT: call void @use(ptr [[GEP5_DIFFERENT]])
; CHECK-NEXT: [[GEP6:%.*]] = getelementptr [4 x <vscale x 4 x i32>], ptr [[P]], i64 [[IDX]], i64 1
; CHECK-NEXT: [[GEP6_SAME:%.*]] = getelementptr [4 x <vscale x 4 x float>], ptr [[P]], i64 [[IDX]], i64 1
; CHECK-NEXT: [[GEP6_DIFFERENT:%.*]] = getelementptr [4 x <vscale x 4 x float>], ptr [[P]], i64 [[IDX2]], i64 1
; CHECK-NEXT: call void @use(ptr [[GEP6]])
; CHECK-NEXT: call void @use(ptr [[GEP6_SAME]])
; CHECK-NEXT: call void @use(ptr [[GEP6_DIFFERENT]])
; CHECK-NEXT: ret void
;
%gep1 = getelementptr i64, ptr %p, i64 1
@ -89,6 +95,12 @@ define void @gep_cse_offset_canonicalization(ptr %p, i64 %idx, i64 %idx2) {
call void @use(ptr %gep5)
call void @use(ptr %gep5.same)
call void @use(ptr %gep5.different)
%gep6 = getelementptr [4 x <vscale x 4 x i32>], ptr %p, i64 %idx, i64 1
%gep6.same = getelementptr [4 x <vscale x 4 x float>], ptr %p, i64 %idx, i64 1
%gep6.different = getelementptr [4 x <vscale x 4 x float>], ptr %p, i64 %idx2, i64 1
call void @use(ptr %gep6)
call void @use(ptr %gep6.same)
call void @use(ptr %gep6.different)
ret void
}

View File

@ -0,0 +1,15 @@
; RUN: opt -passes=globalopt < %s
; Ensure we don't ICE by trying to optimize a scalable vector load of a global
; variable.
%struct.xxx = type <{ [96 x i8] }>
@.bss = internal unnamed_addr global %struct.xxx zeroinitializer, align 32
define dso_local void @foo() local_unnamed_addr align 16 {
L.entry:
store [4 x <vscale x 2 x double>] zeroinitializer, ptr @.bss, align 1
%0 = load [4 x <vscale x 2 x double>], ptr @.bss, align 8
unreachable
}

View File

@ -18,3 +18,10 @@ define void @can_replace_gep_idx_with_zero_typesize(i64 %n, ptr %a, i64 %b) {
call void @do_something(<vscale x 4 x i32> %tmp)
ret void
}
define void @can_replace_gep_idx_with_zero_typesize_2(i64 %n, ptr %a, i64 %b) {
%idx = getelementptr [2 x <vscale x 4 x i32>], ptr %a, i64 %b, i64 0
%tmp = load <vscale x 4 x i32>, ptr %idx
call void @do_something(<vscale x 4 x i32> %tmp)
ret void
}

View File

@ -298,6 +298,17 @@ define ptr @geps_combinable_scalable(ptr %a, i64 %idx) {
ret ptr %a3
}
define ptr @geps_combinable_scalable_vector_array(ptr %a, i64 %idx) {
; CHECK-LABEL: @geps_combinable_scalable_vector_array(
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds [4 x <vscale x 2 x i32>], ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds i8, ptr [[A2]], i64 4
; CHECK-NEXT: ret ptr [[A3]]
;
%a2 = getelementptr inbounds [4 x <vscale x 2 x i32>], ptr %a, i64 1
%a3 = getelementptr inbounds i8, ptr %a2, i32 4
ret ptr %a3
}
define i1 @compare_geps_same_indices(ptr %a, ptr %b, i64 %idx) {
; CHECK-LABEL: @compare_geps_same_indices(
; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[A:%.*]], [[B:%.*]]

View File

@ -0,0 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -passes=instcombine -S < %s | FileCheck %s
define <vscale x 4 x i32> @load(ptr %x) {
; CHECK-LABEL: define <vscale x 4 x i32> @load
; CHECK-SAME: (ptr [[X:%.*]]) {
; CHECK-NEXT: [[A_ELT1:%.*]] = getelementptr inbounds [2 x <vscale x 4 x i32>], ptr [[X]], i64 0, i64 1
; CHECK-NEXT: [[A_UNPACK2:%.*]] = load <vscale x 4 x i32>, ptr [[A_ELT1]], align 16
; CHECK-NEXT: ret <vscale x 4 x i32> [[A_UNPACK2]]
;
%a = load [2 x <vscale x 4 x i32>], ptr %x
%b = extractvalue [2 x <vscale x 4 x i32>] %a, 1
ret <vscale x 4 x i32> %b
}
define void @store(ptr %x, <vscale x 4 x i32> %y, <vscale x 4 x i32> %z) {
; CHECK-LABEL: define void @store
; CHECK-SAME: (ptr [[X:%.*]], <vscale x 4 x i32> [[Y:%.*]], <vscale x 4 x i32> [[Z:%.*]]) {
; CHECK-NEXT: store <vscale x 4 x i32> [[Y]], ptr [[X]], align 16
; CHECK-NEXT: [[X_REPACK1:%.*]] = getelementptr inbounds [2 x <vscale x 4 x i32>], ptr [[X]], i64 0, i64 1
; CHECK-NEXT: store <vscale x 4 x i32> [[Z]], ptr [[X_REPACK1]], align 16
; CHECK-NEXT: ret void
;
%a = insertvalue [2 x <vscale x 4 x i32>] poison, <vscale x 4 x i32> %y, 0
%b = insertvalue [2 x <vscale x 4 x i32>] %a, <vscale x 4 x i32> %z, 1
store [2 x <vscale x 4 x i32>] %b, ptr %x
ret void
}

View File

@ -358,3 +358,12 @@ define <8 x ptr> @gep_vector_index_op3_poison_constant_index_afterwards(ptr %ptr
%res = getelementptr inbounds %t.3, ptr %ptr, i64 0, i32 1, <8 x i64> poison, i32 1
ret <8 x ptr> %res
}
define i64 @gep_array_of_scalable_vectors_ptrdiff(ptr %ptr) {
%c1 = getelementptr inbounds [8 x <vscale x 4 x i32>], ptr %ptr, i64 4
%c2 = getelementptr inbounds [8 x <vscale x 4 x i32>], ptr %ptr, i64 6
%c1.int = ptrtoint ptr %c1 to i64
%c2.int = ptrtoint ptr %c2 to i64
%diff = sub i64 %c2.int, %c1.int
ret i64 %diff
}

View File

@ -0,0 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s
; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s
; This test checks that SROA runs mem2reg on arrays of scalable vectors.
define [ 2 x <vscale x 4 x i32> ] @alloca(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: define [2 x <vscale x 4 x i32>] @alloca
; CHECK-SAME: (<vscale x 4 x i32> [[X:%.*]], <vscale x 4 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[AGG0:%.*]] = insertvalue [2 x <vscale x 4 x i32>] poison, <vscale x 4 x i32> [[X]], 0
; CHECK-NEXT: [[AGG1:%.*]] = insertvalue [2 x <vscale x 4 x i32>] [[AGG0]], <vscale x 4 x i32> [[Y]], 1
; CHECK-NEXT: ret [2 x <vscale x 4 x i32>] [[AGG1]]
;
%addr = alloca [ 2 x <vscale x 4 x i32> ], align 4
%agg0 = insertvalue [ 2 x <vscale x 4 x i32> ] poison, <vscale x 4 x i32> %x, 0
%agg1 = insertvalue [ 2 x <vscale x 4 x i32> ] %agg0, <vscale x 4 x i32> %y, 1
store [ 2 x <vscale x 4 x i32> ] %agg1, ptr %addr, align 4
%val = load [ 2 x <vscale x 4 x i32> ], ptr %addr, align 4
ret [ 2 x <vscale x 4 x i32> ] %val
}

View File

@ -28,4 +28,41 @@ define ptr @test2(ptr %base, i64 %idx) {
ret ptr %gep
}
; Index is implicitly multiplied by vscale and so not really constant.
define ptr @test3(ptr %base, i64 %idx) #0 {
; CHECK-LABEL: @test3(
; CHECK-NEXT: [[IDX_NEXT:%.*]] = add nuw nsw i64 [[IDX:%.*]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr [8 x <vscale x 4 x float>], ptr [[BASE:%.*]], i64 [[IDX_NEXT]]
; CHECK-NEXT: ret ptr [[GEP]]
;
%idx.next = add nuw nsw i64 %idx, 1
%gep = getelementptr [8 x <vscale x 4 x float>], ptr %base, i64 %idx.next
ret ptr %gep
}
; Indices are implicitly multiplied by vscale and so not really constant.
define ptr @test4(ptr %base, i64 %idx) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: [[IDX_NEXT:%.*]] = add nuw nsw i64 [[IDX:%.*]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr [8 x <vscale x 4 x float>], ptr [[BASE:%.*]], i64 3, i64 [[IDX_NEXT]]
; CHECK-NEXT: ret ptr [[GEP]]
;
%idx.next = add nuw nsw i64 %idx, 1
%gep = getelementptr [8 x <vscale x 4 x float>], ptr %base, i64 3, i64 %idx.next
ret ptr %gep
}
; Whilst the first two indices are constant, their offsets are scaled by vscale
; and cannot be extracted; the calculation of the third index does contain a
; constant that can be extracted.
define ptr @test5(ptr %base, i64 %idx) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [8 x <vscale x 4 x float>], ptr [[BASE:%.*]], i64 1, i64 3, i64 [[IDX:%.*]]
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, ptr [[TMP1]], i64 1
; CHECK-NEXT: ret ptr [[GEP2]]
;
%idx.next = add nuw nsw i64 %idx, 1
%gep = getelementptr [8 x <vscale x 4 x float>], ptr %base, i64 1, i64 3, i64 %idx.next
ret ptr %gep
}
attributes #0 = { "target-features"="+sve" }

View File

@ -3,14 +3,15 @@
;; Global variables cannot be scalable vectors, since we don't
;; know the size at compile time.
; CHECK: Globals cannot contain scalable vectors
; CHECK: Globals cannot contain scalable types
; CHECK-NEXT: ptr @ScalableVecGlobal
@ScalableVecGlobal = global <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: Globals cannot contain scalable vectors
; CHECK-NEXT: Globals cannot contain scalable types
; CHECK-NEXT: ptr @ScalableVecArrayGlobal
@ScalableVecArrayGlobal = global [ 8 x <vscale x 4 x i32> ] zeroinitializer
; CHECK-NEXT: Globals cannot contain scalable types
; CHECK-NEXT: ptr @ScalableVecStructGlobal
@ScalableVecStructGlobal = global { i32, <vscale x 4 x i32> } zeroinitializer
;; Global _pointers_ to scalable vectors are fine
; CHECK-NOT: Globals cannot contain scalable vectors
@ScalableVecPtr = global ptr zeroinitializer