[mlir][sparse] use loop emitter to generate loop in sparsification

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D136185
This commit is contained in:
Peiming Liu 2022-10-18 16:41:03 +00:00
parent b279f9bba9
commit b0f8057e4c
14 changed files with 945 additions and 837 deletions


@ -95,45 +95,49 @@ static Value genIndexAndValueForDense(OpBuilder &builder, Location loc,
//===----------------------------------------------------------------------===//
SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors,
bool isLastOutput)
: tensors(tensors.begin(), tensors.end()), dims(tensors.size()),
pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()),
sizes(tensors.size()), ptrBuffer(tensors.size()),
idxBuffer(tensors.size()), valBuffer(tensors.size()),
isLastOutput(isLastOutput), loopStack(), curLv(tensors.size(), 0) {
for (size_t i = 0, e = tensors.size(); i < e; i++) {
auto t = tensors[i];
auto rtp = t.getType().dyn_cast<RankedTensorType>();
if (!rtp) // a scalar (0-dimension tensors)
bool hasOutput,
bool isSparseOut)
: hasOutput(hasOutput), tensors(tensors.begin(), tensors.end()),
dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()),
highs(tensors.size()), ptrBuffer(tensors.size()),
idxBuffer(tensors.size()), valBuffer(tensors.size()), loopStack() {
for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
auto t = tensors[tid];
// A scalar or a 0-dimension tensor.
if (isZeroRankedTensorOrScalar(t.getType()))
continue;
auto rtp = t.getType().cast<RankedTensorType>();
auto rank = static_cast<size_t>(rtp.getRank());
auto enc = getSparseTensorEncoding(rtp);
if (enc)
// We always treat the sparse output tensor as dense so that we always
// iterate over it based on its dim sizes.
if (enc && !(isOutputTensor(tid) && isSparseOut))
for (auto dimTp : enc.getDimLevelType())
dims[i].push_back(dimTp);
dimTypes[tid].push_back(dimTp);
else
dims[i].assign(rank, DimLevelType::Dense);
dimTypes[tid].assign(rank, DimLevelType::Dense);
// Initialize using empty value.
pidxs[i].assign(rank, Value());
coord[i].assign(rank, Value());
highs[i].assign(rank, Value());
sizes[i].assign(rank, Value());
ptrBuffer[i].assign(rank, Value());
idxBuffer[i].assign(rank, Value());
pidxs[tid].assign(rank, Value());
coord[tid].assign(rank, Value());
highs[tid].assign(rank, Value());
ptrBuffer[tid].assign(rank, Value());
idxBuffer[tid].assign(rank, Value());
}
}
void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder,
Location loc) {
void SparseTensorLoopEmitter::initializeLoopEmit(
OpBuilder &builder, Location loc,
SparseTensorLoopEmitter::OutputUpdater updater) {
// For every tensor, find lower and upper bound on dimensions, set the
// same bounds on loop indices, and obtain dense or sparse buffer(s).
// TODO: Provide the ability to generate loops on the output buffer (with undef
// dim level in Merger in GenericOp Sparsification).
for (size_t t = 0, e = tensors.size(); t < e; t++) {
auto tensor = tensors[t];
auto rtp = tensor.getType().cast<RankedTensorType>();
auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
if (!rtp)
// Skip only scalars; zero-ranked tensors still need to be bufferized and
// (probably) filled with zeros by users.
continue;
auto rank = rtp.getRank();
auto shape = rtp.getShape();
auto enc = getSparseTensorEncoding(rtp);
@ -141,10 +145,9 @@ void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder,
// Scan all dimensions of current tensor.
for (int64_t d = 0; d < rank; d++) {
// This should be called only once at beginning.
assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !sizes[t][d] &&
!highs[t][d]);
assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]);
// Handle sparse storage schemes.
if (isCompressedDLT(dims[t][d])) {
if (isCompressedDLT(dimTypes[t][d])) {
auto ptrTp =
MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
auto indTp =
@ -153,7 +156,7 @@ void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder,
// Generate sparse primitives to obtain pointers and indices.
ptrBuffer[t][d] = builder.create<ToPointersOp>(loc, ptrTp, tensor, dim);
idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
} else if (isSingletonDLT(dims[t][d])) {
} else if (isSingletonDLT(dimTypes[t][d])) {
// Singleton dimension, fetch indices.
auto indTp =
MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
@ -161,105 +164,225 @@ void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder,
idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
} else {
// Dense dimension, nothing to fetch.
assert(isDenseDLT(dims[t][d]));
assert(isDenseDLT(dimTypes[t][d]));
}
// Find upper bound in current dimension.
unsigned p = toOrigDim(enc, d);
Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p);
sizes[t][d] = highs[t][d] = up;
highs[t][d] = up;
}
// Perform the required bufferization. Dense inputs materialize
// from the input tensors. Dense outputs need special handling.
// Sparse inputs use sparse primitives to obtain the values.
Type elementType = rtp.getElementType();
// Perform the required bufferization. Dense inputs materialize
// from the input tensors. Sparse inputs use sparse primitives to obtain the
// values.
// Delegates extra output initialization to clients.
bool isOutput = isOutputTensor(t);
Type elementType = rtp.getElementType();
if (!enc) {
// Non-annotated dense tensors.
auto denseTp = MemRefType::get(shape, elementType);
if (isLastOutput && t == tensors.size() - 1)
llvm_unreachable("TODO: not yet handled");
else
valBuffer[t] =
builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
Value denseVal =
builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
// Dense outputs need special handling.
if (isOutput && updater)
denseVal = updater(builder, loc, denseVal, tensor);
valBuffer[t] = denseVal;
} else {
// Annotated sparse tensors.
// We also need the value buffer for an annotated all-dense `sparse` tensor.
auto dynShape = {ShapedType::kDynamicSize};
auto sparseTp = MemRefType::get(dynShape, elementType);
valBuffer[t] = builder.create<ToValuesOp>(loc, sparseTp, tensor);
}
// Prepare to enter the first dim for all (input) tensors
prepareLoopOverTensorAtDim(builder, loc, t, 0);
// NOTE: we can also prepare for dim 0 here in advance; this would hoist
// some loop preparation out of tensor iteration, but would also (undesirably)
// hoist the code outside of if conditions.
}
}
void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
ArrayRef<size_t> tids,
ArrayRef<size_t> dims) {
// Universal Index start from 0
assert(loopSeqStack.size() == loopStack.size());
// Universal index starts from 0
loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
// Prepares for all the tensors used in the current loop sequence.
for (auto [tid, dim] : llvm::zip(tids, dims))
prepareLoopOverTensorAtDim(builder, loc, tid, dim);
}
Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
OpBuilder &builder, Location loc, size_t tid, size_t dim,
ArrayRef<Value> reduc) {
assert(dims[tid].size() > dim);
MutableArrayRef<Value> reduc, bool isParallel, ArrayRef<size_t> extraTids,
ArrayRef<size_t> extraDims) {
assert(dimTypes[tid].size() > dim);
// We can not re-enter the same level.
assert(!coord[tid][dim]);
Value step = constantIndex(builder, loc, 1);
auto dimType = dims[tid][dim];
bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
auto dimType = dimTypes[tid][dim];
bool isSparseInput = isCompressedDLT(dimType) || isSingletonDLT(dimType);
assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
isSingletonDLT(dimType));
Value lo = isSparse ? pidxs[tid][dim] : constantIndex(builder, loc, 0);
Value lo = isSparseInput ? pidxs[tid][dim] // current offset
: loopSeqStack.back(); // universal index
Value hi = highs[tid][dim];
// TODO: support reduction.
if (!reduc.empty())
llvm_unreachable("TODO: not implemented yet");
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(forOp.getBody());
Value iv = forOp.getInductionVar();
Operation *loop = forOp;
assert(iv);
if (isSparse) {
if (isSparseInput) {
pidxs[tid][dim] = iv;
// Generating a load on the indices array yields the coordinate.
Value ptr = idxBuffer[tid][dim];
// TODO: generate a load for vector values.
coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
} else {
// Dense tensor, the coordinate is the induction variable.
coord[tid][dim] = iv;
// generate pidx for dense dim (pidx = i * sz + j)
// TODO: handle vector loop.
Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
Value mul = builder.create<arith::MulIOp>(loc, sizes[tid][dim], p);
Value add = builder.create<arith::AddIOp>(loc, mul, iv);
pidxs[tid][dim] = add;
auto enc = getSparseTensorEncoding(tensors[tid].getType());
if (enc)
pidxs[tid][dim] = genAddress(builder, loc, tid, dim, iv);
}
// Prepares for next dim if this is not currently the innermost dimension.
if (dim != dims[tid].size() - 1)
prepareLoopOverTensorAtDim(builder, loc, tid, dim + 1);
// NOTE: we can also prepare for the next dim here in advance
// Push the loop into stack
loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
coord[tid][dim]);
// Emit extra locals.
emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
loopStack.push_back(LoopLevelInfo({tid}, {dim}, coord[tid][dim]));
return loop;
// In-place update on the reduction variable vector.
assert(forOp.getNumRegionIterArgs() == reduc.size());
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = forOp.getRegionIterArg(i);
return forOp;
}
void SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims(
OpBuilder &builder, Location loc, ArrayRef<size_t> ts,
ArrayRef<size_t> ds) {
llvm_unreachable("TODO: unimplemented");
Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc,
ArrayRef<size_t> extraTids, ArrayRef<size_t> extraDims) {
assert(tids.size() == dims.size());
SmallVector<Type, 4> types;
SmallVector<Value, 4> operands;
// Construct the while-loop with a parameter for each index.
Type indexType = builder.getIndexType();
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
assert(pidxs[tid][dim]);
types.push_back(indexType);
operands.push_back(pidxs[tid][dim]);
}
}
// The position where user-supplied reduction variable starts.
for (Value rec : reduc) {
types.push_back(rec.getType());
operands.push_back(rec);
}
if (needsUniv) {
types.push_back(indexType);
// Update universal index.
operands.push_back(loopSeqStack.back());
}
assert(types.size() == operands.size());
scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
SmallVector<Location> locs(types.size(), loc);
Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
// Build the "before" region, which effectively consists
// of a conjunction of "i < upper" tests on all induction.
builder.setInsertionPointToStart(&whileOp.getBefore().front());
Value cond;
unsigned o = 0;
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value op1 = before->getArgument(o);
Value op2 = highs[tid][dim];
Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
op1, op2);
cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
// Update
pidxs[tid][dim] = after->getArgument(o++);
}
}
builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
// Generates while body.
builder.setInsertionPointToStart(&whileOp.getAfter().front());
Value min;
for (auto [tid, dim] : llvm::zip(tids, dims)) {
// Prepares for next level.
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value ptr = idxBuffer[tid][dim];
Value s = pidxs[tid][dim];
Value load = genIndexLoad(builder, loc, ptr, s);
coord[tid][dim] = load;
if (!needsUniv) {
if (min) {
Value cmp = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, load, min);
min = builder.create<arith::SelectOp>(loc, cmp, load, min);
} else {
min = load;
}
}
}
}
if (needsUniv) {
assert(!min);
// Otherwise, universal index is the minimal pidx.
min = after->getArguments().back();
}
for (auto [tid, dim] : llvm::zip(tids, dims)) {
// All dense dims (as well as the sparse output tensor) share the same pidx in
// the while loop.
if (isDenseDLT(dimTypes[tid][dim])) {
pidxs[tid][dim] = min;
// generate pidx for dense dim (pidx = i * sz + j)
auto enc = getSparseTensorEncoding(tensors[tid].getType());
if (enc)
pidxs[tid][dim] = genAddress(builder, loc, tid, dim, min);
}
// NOTE: we can also prepare for the next dim here in advance
}
// Sets up the loop stack.
loopStack.emplace_back(tids, dims, whileOp, min);
assert(loopStack.size() == loopSeqStack.size());
// Emits extra locals
emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
// Updates reduction variables
assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
// In-place update on reduction variable.
for (unsigned i = 0, e = reduc.size(); i < e; i++)
reduc[i] = after->getArgument(o + i);
return whileOp;
}
bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
Location loc,
size_t tid,
size_t dim) {
// TODO: generate loop iteration on output tensor based on the shape
// instead of pointer/indices arrays.
assert(dims[tid].size() > dim);
auto dimType = dims[tid][dim];
assert(dimTypes[tid].size() > dim);
auto dimType = dimTypes[tid][dim];
if (isDenseDLT(dimType))
return false;
return;
// Either the first dimension, or the previous dimension has been set.
assert(dim == 0 || pidxs[tid][dim - 1]);
@ -269,11 +392,11 @@ bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
Value ptr = ptrBuffer[tid][dim];
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);
Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
return true;
return;
}
if (isSingletonDLT(dimType)) {
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
@ -281,34 +404,138 @@ bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
pidxs[tid][dim] = pLo;
highs[tid][dim] = pHi;
return true;
return;
}
llvm_unreachable("Unrecognizable dimesion type!");
}
Value SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDims(
OpBuilder &builder, Location loc, size_t tid, size_t dim) {
llvm_unreachable("TODO: not implemented yet");
void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims(
OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims) {
// Initialize dense positions. Note that we generate dense indices of the
// output tensor unconditionally, since they may not appear in the lattice,
// but may be needed for linearized codegen.
for (auto [tid, dim] : llvm::zip(tids, dims)) {
assert(isDenseDLT(dimTypes[tid][dim]));
auto enc = getSparseTensorEncoding(tensors[tid].getType());
if (enc) {
bool validPidx = dim == 0 || pidxs[tid][dim - 1];
if (!validPidx) {
// We might not find the pidx for the sparse output tensor as it is
// unconditionally required by the sparsification.
assert(isOutputTensor(tid));
continue;
}
pidxs[tid][dim] = genAddress(builder, loc, tid, dim, loopStack.back().iv);
// NOTE: we can also prepare for the next dim here in advance
}
}
}
void SparseTensorLoopEmitter::exitCurrentLoop() {
SmallVector<Value, 2>
SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc) {
LoopLevelInfo &loopInfo = loopStack.back();
auto &dims = loopStack.back().dims;
auto &tids = loopStack.back().tids;
auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
if (!reduc.empty()) {
assert(reduc.size() == forOp.getNumResults());
builder.setInsertionPointToEnd(forOp.getBody());
builder.create<scf::YieldOp>(loc, reduc);
}
// Finished iterating a tensor; clean up.
// We only do the clean-up on for loops, as while loops do not necessarily
// finish the iteration over a sparse tensor.
for (auto [tid, dim] : llvm::zip(tids, dims)) {
// Reset to null.
coord[tid][dim] = Value();
pidxs[tid][dim] = Value();
// A dense dimension's high is fixed; only reset high for sparse dims.
if (!isDenseDLT(dimTypes[tid][dim]))
highs[tid][dim] = Value();
}
// exit the loop
builder.setInsertionPointAfter(forOp);
return forOp.getResults();
}
SmallVector<Value, 2>
SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc) {
auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
auto &dims = loopStack.back().dims;
auto &tids = loopStack.back().tids;
Value iv = loopStack.back().iv;
// Generate the while-loop induction at the end.
builder.setInsertionPointToEnd(&whileOp.getAfter().front());
// Finalize the induction. Note that the induction could be performed
// in the individual if-branches to avoid re-evaluating the conditions.
// However, that would result in a rather elaborate forest of yield
// instructions during code generation. Moreover, performing the induction
// after the if-statements more closely resembles code generated by TACO.
unsigned o = 0;
SmallVector<Value, 4> operands;
Value one = constantIndex(builder, loc, 1);
for (auto [tid, dim] : llvm::zip(tids, dims)) {
if (isCompressedDLT(dimTypes[tid][dim]) ||
isSingletonDLT(dimTypes[tid][dim])) {
Value op1 = coord[tid][dim];
Value op3 = pidxs[tid][dim];
Value cmp =
builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, op1, iv);
Value add = builder.create<arith::AddIOp>(loc, op3, one);
operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
// Following loops continue iteration from the break point of the
// current while loop.
pidxs[tid][dim] = whileOp->getResult(o++);
// The coordinates are invalid now.
coord[tid][dim] = nullptr;
// highs remains unchanged.
}
}
// Reduction value from users.
SmallVector<Value, 2> ret;
for (auto red : reduc) {
operands.push_back(red);
ret.push_back(whileOp->getResult(o++));
}
// An (optional) universal index.
if (operands.size() < whileOp.getNumResults()) {
assert(operands.size() + 1 == whileOp.getNumResults());
// The last one is the universal index.
operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
// Update the loop starting point of the current loop sequence.
loopSeqStack.back() = whileOp->getResult(o++);
}
assert(o == operands.size());
builder.create<scf::YieldOp>(loc, operands);
builder.setInsertionPointAfter(whileOp);
return ret;
}
SmallVector<Value, 2>
SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc) {
// Clean up the values; this would help us discover potential bugs at an
// earlier stage (instead of silently using a wrong value).
LoopLevelInfo &loopInfo = loopStack.back();
assert(loopInfo.tensors.size() == loopInfo.dims.size());
for (auto info : llvm::zip(loopInfo.tensors, loopInfo.dims)) {
auto tid = std::get<0>(info);
auto dim = std::get<1>(info);
assert(pidxs[tid][dim] && coord[tid][dim] && highs[tid][dim]);
// Reset to null.
pidxs[tid][dim] = Value();
coord[tid][dim] = Value();
if (!isDenseDLT(dims[tid][dim]))
// Dense dimension, high is fixed.
highs[tid][dim] = Value();
assert(loopInfo.tids.size() == loopInfo.dims.size());
SmallVector<Value, 2> red;
if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
red = exitCoiterationLoop(builder, loc, reduc);
} else {
red = exitForLoop(builder, loc, reduc);
}
assert(loopStack.size() == loopSeqStack.size());
loopStack.pop_back();
return red;
}
//===----------------------------------------------------------------------===//
@ -500,9 +727,10 @@ void mlir::sparse_tensor::genReshapeDstShape(
auto srcDim = srcShape[i];
// Iterate through dimensions expanded from the i-th dimension.
for (unsigned j = start; j < start + map.size(); j++) {
// There can be only one dynamic sized dimension among dimensions expanded
// from the i-th dimension in srcShape. For example, if srcDim = 8, then
// the expanded shape could be <2x?x2>, but not <2x?x?>.
// There can be only one dynamic sized dimension among dimensions
// expanded from the i-th dimension in srcShape.
// For example, if srcDim = 8, then the expanded shape could be <2x?x2>,
// but not <2x?x?>.
if (staticDstShape[j] == ShapedType::kDynamicSize) {
// The expanded dimension has dynamic size. We compute the dimension
// by dividing srcDim by the product of the static dimensions.
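// E.g. (illustrative): with srcDim = 8 expanded to <2x?x2>, the product of
// the static sizes is 2 * 2 = 4, so the dynamic dimension is 8 / 4 = 2.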


@ -34,127 +34,6 @@ namespace sparse_tensor {
/// `createFuncCall()`, and `replaceOpWithFuncCall()`.
enum class EmitCInterface : bool { Off = false, On = true };
//===----------------------------------------------------------------------===//
// SparseTensorLoopEmitter class, manages sparse tensors and helps to generate
// loop structure to (co-iterate) sparse tensors.
//
// An example usage:
// To generate following loops over T1<?x?> and T2<?x?>
//
// for i in T1[0] {
// for j : T2[0] {
// for k : T1[1] {}
// for k : T2[1] {}
// }
// }
//
// One can use
//
// SparseTensorLoopEmitter loopEmiter({T1, T2});
// loopEmiter.initializeLoopEmit();
// loopEmiter.enterLoopOverTensorAtDim(T1, 0);
// loopEmiter.enterLoopOverTensorAtDim(T2, 0);
// loopEmiter.enterLoopOverTensorAtDim(T1, 1);
// loopEmiter.exitCurrentLoop();
// loopEmiter.enterLoopOverTensorAtDim(T2, 1);
// for 0 -> 3:
// loopEmiter.exitCurrentLoop();
//===----------------------------------------------------------------------===//
// TODO: Sparsification should also rely on this class to generate loops.
class SparseTensorLoopEmitter {
public:
/// Constructor: take an array of tensors inputs, on which the generated loops
/// will iterate on. The index of the tensor in the array is also the
/// tensor id (tid) used in related functions.
explicit SparseTensorLoopEmitter(ValueRange tensors,
bool isLastOutput = false);
///
/// Core functions.
///
/// Starts a loop emitting session:
/// 1. Generates all the buffers needed to iterate tensors.
/// 2. Generates the lo/hi bounds to iterate tensors[0].
void initializeLoopEmit(OpBuilder &builder, Location loc);
// TODO: Gets rid of `dim` in the argument list? Track the dimension we
// are currently at internally. Then it would be enterNextDimForTensor.
/// Emits loop over tensor[dim], it assumes that loops between
/// tensor[0...dim - 1] have already been generated.
/// It also prepares to enter tensor[dim + 1].
Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
size_t tid, size_t dim,
ArrayRef<Value> reduc = {});
/// Emits a coiteration loop over a set of tensors.
// TODO: not yet implemented
void enterCoiterationOverTensorsAtDims(OpBuilder &builder, Location loc,
ArrayRef<size_t> ts,
ArrayRef<size_t> ds);
/// Emits extra locals, since the locals might not be in simplified lattices
/// point used to generate the loops, but are still required to generates
/// expressions.
Value emitExtraLocalsForTensorsAtDims(OpBuilder &builder, Location loc,
size_t tid, size_t dim);
void exitCurrentLoop();
/// Return the array of coordinate for all the loop generated till now.
void getCoordinateArray(SmallVectorImpl<Value> &coords) {
for (auto &l : loopStack)
coords.push_back(l.idx);
}
///
/// Getters.
///
Value getTensorValueBuffer(size_t tid) { return valBuffer[tid]; }
Value getLastLevelTensorPointerIndex(size_t tid) {
return pidxs[tid].back();
};
private:
struct LoopLevelInfo {
LoopLevelInfo(ArrayRef<size_t> ts, ArrayRef<size_t> ds, Value idx)
: tensors(ts), dims(ds), idx(idx) {}
llvm::SmallVector<size_t, 4> tensors;
llvm::SmallVector<size_t, 4> dims;
Value idx;
};
/// Return false if tid[dim] is a dense dimension that does not need to be
/// prepared (to be used by sparsification for needUniv).
bool prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid,
size_t dim);
/// Input (TODO: and output) tensors.
std::vector<Value> tensors;
/// The dim type array for each tensor.
std::vector<std::vector<DimLevelType>> dims;
/// Sparse iteration information (by tensor and dim). These arrays
/// are updated to remain current within the current loop.
std::vector<std::vector<Value>> pidxs;
std::vector<std::vector<Value>> coord;
std::vector<std::vector<Value>> highs;
/// Universal dense indices and upper bounds (by index). The sizes array is
/// set once with the inferred dimension sizes.
std::vector<std::vector<Value>> sizes;
std::vector<std::vector<Value>> ptrBuffer; // to_pointers
std::vector<std::vector<Value>> idxBuffer; // to_indices
std::vector<Value> valBuffer; // to_value
bool isLastOutput; // Is the last tensor output tensor
std::vector<LoopLevelInfo> loopStack;
// TODO: not yet used, it should track the current level for each tensor
// to help eliminate `dim` paramters from above APIs.
std::vector<size_t> curLv;
};
//===----------------------------------------------------------------------===//
// ExecutionEngine/SparseTensorUtils helper functions.
//===----------------------------------------------------------------------===//
@ -400,6 +279,214 @@ inline Value constantDimLevelTypeEncoding(OpBuilder &builder, Location loc,
return constantI8(builder, loc, static_cast<uint8_t>(dlt));
}
inline bool isZeroRankedTensorOrScalar(Type type) {
auto rtp = type.dyn_cast<RankedTensorType>();
return !rtp || rtp.getRank() == 0;
}
//===----------------------------------------------------------------------===//
// SparseTensorLoopEmitter class, manages sparse tensors and helps to generate
// loop structure to (co-)iterate sparse tensors.
//
// An example usage:
// To generate the following loops over T1<?x?> and T2<?x?>
//
// for i in TENSOR_1_0 {
// for j : TENSOR_2_0 {
// for k : TENSOR_1_1 {}
// for k : TENSOR_2_1 {}
// }
// }
//
// One can use
//
// SparseTensorLoopEmitter loopEmitter({T1, T2});
// loopEmitter.initializeLoopEmit();
// loopEmitter.enterLoopOverTensorAtDim(T1, 0);
// loopEmitter.enterLoopOverTensorAtDim(T2, 0);
// loopEmitter.enterLoopOverTensorAtDim(T1, 1);
// loopEmitter.exitCurrentLoop();
// loopEmitter.enterLoopOverTensorAtDim(T2, 1);
// loopEmitter.exitCurrentLoop(); // exit k
// loopEmitter.exitCurrentLoop(); // exit j
// loopEmitter.exitCurrentLoop(); // exit i
//===----------------------------------------------------------------------===//
// TODO: Sparsification should also rely on this class to generate loops.
class SparseTensorLoopEmitter {
public:
/// Optional callback function to set up dense output tensors when
/// initializing the loop emitter (e.g., to fill a dense output with zeros).
using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc,
Value memref, Value tensor)>;
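// A minimal usage sketch (illustrative only, not part of this commit): a
// client could pass an updater that zero-fills the dense output buffer.
// Here `builder`, `loc`, `elemTp` (the output element type), `loopEmitter`,
// and the `constantZero` helper are assumed to be in scope.
//
//   auto zeroInit = [&](OpBuilder &b, Location l, Value memref,
//                       Value tensor) -> Value {
//     Value zero = constantZero(b, l, elemTp);
//     b.create<linalg::FillOp>(l, ValueRange{zero}, ValueRange{memref});
//     return memref;
//   };
//   loopEmitter.initializeLoopEmit(builder, loc, zeroInit);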
/// Constructor: takes an array of input tensors on which the generated loops
/// will iterate. The index of the tensor in the array is also the
/// tensor id (tid) used in related functions.
/// If isSparseOut is set, the loop emitter assumes that the sparse output
/// tensor is empty, and will always generate loops on it based on the dim
/// sizes.
explicit SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput = false,
bool isSparseOut = false);
/// Starts a loop emitting session by generating all the buffers needed to
/// iterate tensors.
void initializeLoopEmit(OpBuilder &builder, Location loc,
OutputUpdater updater = nullptr);
/// Enters a new loop sequence; the loops within the same sequence start from
/// the break points of the previous loop instead of starting over from 0.
/// e.g.,
/// {
/// // loop sequence start.
/// p0 = while(xxx)
/// ...
/// break p0
///
/// // Starts loop from p0
/// for (i = p0; i < end; i++)
/// ...
/// // loop sequence end.
/// }
void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims);
// Exits the current loop sequence; this will reset the universal index to 0.
void exitCurrentLoopSeq() {
assert(loopSeqStack.size() == loopStack.size() + 1);
loopSeqStack.pop_back();
}
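// A minimal pairing sketch (illustrative only; `builder`, `loc`, and the
// tensor ids 0/1 are assumptions): loop sequences and loops must be entered
// and exited in a strictly nested fashion.
//
//   loopEmitter.enterNewLoopSeq(builder, loc, /*tids=*/{0, 1}, /*dims=*/{0, 0});
//   loopEmitter.enterCoIterationOverTensorsAtDims(builder, loc, {0, 1}, {0, 0},
//                                                 /*needsUniv=*/true);
//   // ... emit the loop body ...
//   loopEmitter.exitCurrentLoop(builder, loc);
//   loopEmitter.exitCurrentLoopSeq();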
// TODO: Gets rid of `dim` in the argument list? Track the dimension we
// are currently at internally. Then it would be enterNextDimForTensor.
// Still need a way to specify the dim for non-annotated dense tensors though,
// as they can be accessed out of order.
/// Emits a loop over tensor_tid_dim; it assumes that loops between
/// tensor_tid_[0, dim - 1] have already been generated.
/// The function also performs an in-place update on the `reduc` vector to
/// return the reduction variables used inside the generated loop.
Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
size_t tid, size_t dim,
MutableArrayRef<Value> reduc = {},
bool isParallel = false,
ArrayRef<size_t> extraTids = {},
ArrayRef<size_t> extraDims = {});
/// Emits a co-iteration loop over a set of tensors.
Operation *enterCoIterationOverTensorsAtDims(
OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc = {},
ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
SmallVector<Value, 2> exitCurrentLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc = {});
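// A reduction sketch (illustrative only; `builder`, `loc`, and `sum` are
// assumptions): the incoming reduction value is passed in `reduc`, which is
// updated in place to the loop-carried value for use inside the body; the
// value to be yielded is handed back to exitCurrentLoop, whose results are
// the final reductions.
//
//   // Inside an already entered loop sequence (cf. enterNewLoopSeq above).
//   SmallVector<Value, 2> reduc = {sum};
//   loopEmitter.enterLoopOverTensorAtDim(builder, loc, /*tid=*/0, /*dim=*/0,
//                                        reduc);
//   // ... compute the updated partial sum and store it in reduc[0] ...
//   sum = loopEmitter.exitCurrentLoop(builder, loc, reduc)[0];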
/// Returns the array of coordinates for all the loops generated so far.
void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
for (auto &l : loopStack)
coords.push_back(l.iv);
}
/// Gets loop induction variable at the given level.
Value getLoopIV(size_t level) const {
if (level < loopStack.size())
return loopStack[level].iv;
return nullptr;
}
///
/// Getters.
///
const std::vector<std::vector<Value>> &getPidxs() const { return pidxs; };
const std::vector<std::vector<Value>> &getCoord() const { return coord; };
const std::vector<std::vector<Value>> &getHighs() const { return highs; };
const std::vector<std::vector<Value>> &getPtrBuffer() const {
return ptrBuffer;
};
const std::vector<std::vector<Value>> &getIdxBuffer() const {
return idxBuffer;
};
const std::vector<Value> &getValBuffer() const { return valBuffer; };
private:
struct LoopLevelInfo {
LoopLevelInfo(ArrayRef<size_t> tids, ArrayRef<size_t> dims, Operation *loop,
Value iv)
: tids(tids), dims(dims), loop(loop), iv(iv) {}
// TODO: maybe use a vector<pair> for tid and dim?
// The set of tensors that the loop is operating on
const llvm::SmallVector<size_t, 4> tids;
// The corresponding dims for the tensors
const llvm::SmallVector<size_t, 4> dims;
const Operation *loop; // the loop operation
const Value iv; // the induction variable for the loop
};
/// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
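/// E.g. (illustrative), for a 2-D all-dense tensor of shape s0 x s1: dim 0
/// yields pidx = s0 * 0 + i = i, and dim 1 then yields pidx = i * s1 + j,
/// i.e., a row-major linearization.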
Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim,
Value iv) {
Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
Value mul = builder.create<arith::MulIOp>(loc, highs[tid][dim], p);
Value add = builder.create<arith::AddIOp>(loc, mul, iv);
return add;
}
bool isOutputTensor(size_t tid) {
return hasOutput && tid == tensors.size() - 1;
}
/// Sets up [lo, hi] for iterating tensor[dim]; it assumes that tensor[0,
/// ..., dim - 1] has already been set up.
void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid,
size_t dim);
/// Emits extra locals, since the locals might not be in the simplified lattice
/// points used to generate the loops, but are still required to generate
/// expressions.
void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc,
ArrayRef<size_t> tids,
ArrayRef<size_t> dims);
/// Exits a for loop, returns the reduction results, e.g.,
/// %ret = for () {
/// ...
/// yield %val
/// }
/// Returns %ret to the user, while %val is provided by the user (`reduc`).
SmallVector<Value, 2> exitForLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc);
/// Exits a while loop, returns the reduction results.
SmallVector<Value, 2> exitCoiterationLoop(OpBuilder &builder, Location loc,
ArrayRef<Value> reduc);
// Whether the loop emitter needs to treat the last tensor as the output
// tensor.
bool hasOutput;
/// Input and (optional) output tensors.
std::vector<Value> tensors;
/// The dim type array for each tensor.
std::vector<std::vector<DimLevelType>> dimTypes;
/// Sparse iteration information (by tensor and dim). These arrays
/// are updated to remain current within the current loop.
std::vector<std::vector<Value>> pidxs;
std::vector<std::vector<Value>> coord;
std::vector<std::vector<Value>> highs;
std::vector<std::vector<Value>> ptrBuffer; // to_pointers
std::vector<std::vector<Value>> idxBuffer; // to_indices
std::vector<Value> valBuffer; // to_value
// Loop Stack, stores the information of all the nested loops that are alive.
std::vector<LoopLevelInfo> loopStack;
// Loop Sequence Stack, stores the universal index for the current loop
// sequence.
std::vector<Value> loopSeqStack;
// TODO: not yet used, it should track the current level for each tensor
// to help eliminate `dim` parameters from the above APIs.
// std::vector<size_t> curLv;
};
} // namespace sparse_tensor
} // namespace mlir


@ -477,29 +477,35 @@ public:
// 1. Generates loop for the sparse input.
SparseTensorLoopEmitter loopEmitter(ValueRange{input});
loopEmitter.initializeLoopEmit(rewriter, loc);
for (int64_t i = 0; i < rank; i++)
for (int64_t i = 0; i < rank; i++) {
// TODO: provide a utility function for loop sequences that only contain
// one for loop?
loopEmitter.enterNewLoopSeq(rewriter, loc, 0, static_cast<size_t>(i));
loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i);
}
SmallVector<Value, 4> coords;
coords.reserve(rank);
loopEmitter.getCoordinateArray(coords);
Value vals = loopEmitter.getTensorValueBuffer(0);
Value pidx = loopEmitter.getLastLevelTensorPointerIndex(0);
Value vals = loopEmitter.getValBuffer()[0];
Value pidx = loopEmitter.getPidxs()[0].back();
// Loads the value from sparse tensor using pointer index;
// loads the value from dense tensor using coordinate array.
Value val = enc ? rewriter.create<memref::LoadOp>(loc, vals, pidx)
: rewriter.create<memref::LoadOp>(loc, vals, coords);
for (int64_t i = 0; i < rank; i++)
loopEmitter.exitCurrentLoop();
// 2. Inline the block in the foreach operator.
Block::iterator inlinePos = rewriter.getInsertionPoint();
Block *srcBlock = op.getBody();
// Remove sparse_tensor.yield.
rewriter.eraseOp(srcBlock->getTerminator());
for (int64_t i = 0; i < rank; i++) {
loopEmitter.exitCurrentLoop(rewriter, loc);
loopEmitter.exitCurrentLoopSeq();
}
SmallVector<Value, 4> args;
// Remap coordinates.
for (int64_t i = 0; i < rank; i++) {

File diff suppressed because it is too large.


@ -74,10 +74,10 @@ func.func @sparse_scale(%argx: tensor<?x?xf32, #SortedCOO>) -> tensor<?x?xf32, #
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xf64>
// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_14]]] : memref<32xf64>
@ -120,12 +120,12 @@ func.func @matvec(%arga: tensor<32x64xf64, #SortedCOO>,
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xf64>
// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64>
// CHECK: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>)
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64>
// CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>)
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_15]], %[[VAL_21:.*]] = %[[VAL_17]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_16]] : index
// CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_21]], %[[VAL_18]] : index


@ -113,10 +113,10 @@ func.func @mul_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>)
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>)
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>)
// CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index
// CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index
@ -166,9 +166,9 @@ func.func @add_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>)
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>)
// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>)
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] {
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref<?xindex>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref<?xf32>
@ -206,9 +206,9 @@ func.func @repeated_add_s(%arga: tensor<32xf32, #SV>, %argx: tensor<32xf32>) ->
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] {
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref<?xf32>
@ -314,9 +314,9 @@ func.func @mul_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tens
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
// CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
@ -371,9 +371,9 @@ func.func @add_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_14]]] : memref<32xf32>
@ -408,9 +408,9 @@ func.func @mul_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
// CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
@ -465,9 +465,9 @@ func.func @add_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tens
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xf32>
@ -502,11 +502,11 @@ func.func @mul_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tens
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index
@ -585,11 +585,11 @@ func.func @add_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx:
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index
@ -647,11 +647,11 @@ func.func @mul_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx:
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
@ -740,11 +740,11 @@ func.func @two_way_inv(%arga: tensor<16xf32, #SV>, %argb: tensor<16xf32, #SV>, %
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]]
// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
@ -881,11 +881,11 @@ func.func @sum_reduction(%arga: tensor<?xf32, #SV>, %argx: tensor<f32>) -> tenso
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref<f32>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref<f32>
// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:3 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_13]]) : (index, index, f32) -> (index, index, f32) {
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
@ -989,12 +989,12 @@ func.func @sum_reduction_ss(%arga: tensor<16xf32, #SV>,
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f32>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref<f32>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref<f32>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref<f32>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref<f32>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]]:3 = scf.while (%[[VAL_22:.*]] = %[[VAL_17]], %[[VAL_23:.*]] = %[[VAL_19]], %[[VAL_24:.*]] = %[[VAL_15]]) : (index, index, f32) -> (index, index, f32) {
// CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_18]] : index
// CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_20]] : index
@@ -1106,13 +1106,13 @@ func.func @sum_reduction_inv(%arga: tensor<16xf32, #SV>,
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf64>
// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor<?xf64>
// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?xf64>
// CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_4]]
// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref<?xf64>)
// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref<?xf64>)
// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK: %[[VAL_23:.*]]:3 = scf.while (%[[VAL_24:.*]] = %[[VAL_19]], %[[VAL_25:.*]] = %[[VAL_21]], %[[VAL_26:.*]] = %[[VAL_5]]) : (index, index, index) -> (index, index, index) {
// CHECK: %[[VAL_27:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_20]] : index
// CHECK: %[[VAL_28:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_22]] : index
@@ -1284,13 +1284,13 @@ func.func @four_tensors_op(%arga: tensor<?xf64>,
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf64>
// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f64>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref<f64>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref<f64>
// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_24:.*]]:4 = scf.while (%[[VAL_25:.*]] = %[[VAL_18]], %[[VAL_26:.*]] = %[[VAL_20]], %[[VAL_27:.*]] = %[[VAL_22]], %[[VAL_28:.*]] = %[[VAL_17]]) : (index, index, index, f64) -> (index, index, index, f64) {
// CHECK: %[[VAL_29:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_19]] : index
// CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_26]], %[[VAL_21]] : index
@@ -962,7 +962,7 @@ func.func @sum_reduction(%arga: tensor<10x20xf32, #Tds>, %argx: tensor<f32>) ->
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xf64>
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?xf64>
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref<?x?xf64>)
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
@@ -1015,7 +1015,7 @@ func.func @scale(%arga: tensor<?x?xf64, #Tds>, %argx: tensor<?x?xf64>) -> tensor
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -1095,7 +1095,7 @@ func.func @sampled_dense_dense(%args: tensor<?x?xf32, #Tss>,
// CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?xf32>
// CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref<f32>
// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor<?xf32>
// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32,
// CHECK-DAG: %[[VAL_24:.*]] = bufferization.to_memref %[[VAL_5]] : memref<?xf32>
// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref<f32>
// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
@@ -1126,11 +1126,11 @@ func.func @mul_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 2 : index} : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 2 : index} : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_1]], %[[VAL_6]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?xf32>
// CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] {
// CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] {
@@ -1247,7 +1247,7 @@ func.func @sum_reduction(%arga: tensor<10x20x30xf32, #Tsss>, %argx: tensor<f32>)
// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?x?xf32>
// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref<f32>
@@ -20,8 +20,8 @@
// CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
@@ -38,8 +38,8 @@
// CHECK: %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
@@ -57,8 +57,8 @@
// CHECK: %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK-DAG: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
@@ -25,14 +25,15 @@
// CHECK-DAG: %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_5:.*]] = bufferization.alloc_tensor(%[[VAL_3]], %[[VAL_4]]) : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_5]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_5]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_24:.*]] = tensor.dim %[[VAL_5]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_5]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_1]] to %[[VAL_7]] step %[[VAL_2]] {
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_8]] step %[[VAL_2]] {
// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_8]], %[[VAL_10]] : index
// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_11]] : index
// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_8]], %[[VAL_10]] : index
// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_24]], %[[VAL_10]] : index
// CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_11]] : index
// CHECK: %[[VAL_16:.*]] = arith.index_cast %[[VAL_11]] : index to i64
// CHECK: %[[VAL_17:.*]] = arith.index_cast %[[VAL_10]] : index to i64
@@ -36,14 +36,14 @@
// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
// CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
// CHECK-HIR: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-HIR: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK-HIR: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
// CHECK-HIR: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
// CHECK-HIR-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK-HIR-DAG: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
// CHECK-HIR-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK-HIR: scf.for %[[VAL_17:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_5]] {
// CHECK-HIR: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
// CHECK-HIR: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
// CHECK-HIR: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xf64>
// CHECK-HIR-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
// CHECK-HIR-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
// CHECK-HIR-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xf64>
// CHECK-HIR: %[[VAL_21:.*]] = arith.mulf %[[VAL_20]], %[[VAL_13]] : f64
// CHECK-HIR: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] : f64
// CHECK-HIR: memref.store %[[VAL_22]], %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
@@ -60,17 +60,17 @@ func.func @sparse_static_dims(%arga: tensor<10x20x30xf32, #X>,
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?x?xf32>
// CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref<?x?x?xf32>)
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] {
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_8]], %[[VAL_11]] : index
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] {
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] {
// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_7]], %[[VAL_11]] : index
// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] {
// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_6]], %[[VAL_14]] : index
// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_8]], %[[VAL_14]] : index
// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_15]] : index
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_17]]] : memref<?xf32>
// CHECK: memref.store %[[VAL_18]], %[[VAL_10]]{{\[}}%[[VAL_15]], %[[VAL_11]], %[[VAL_12]]] : memref<?x?x?xf32>
@@ -117,8 +117,8 @@ func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10x
// CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
// CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT-DAG: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT-DAG: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
// CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
@@ -266,8 +266,8 @@ func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<
// CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
// CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT-DAG: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
// CHECK-RWT-DAG: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
// CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
@@ -27,17 +27,17 @@
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.200000e+00 : f32
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
// CHECK: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32
// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xf32>
// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32>
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref<f32>
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xf32>
// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32>
// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref<f32>
// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_7]] {
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_19]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<?xindex>