[mlir][sparse] merger extension to support sparsifying arith::CmpI/CmpF operation
Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D152761
commit faf7cd97d0
parent e92a27bcb7
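To illustrate the kind of kernel this enables, here is a sketch modeled on the new test cases further below (it is not part of the commit itself; the #Tdd and #trait2 definitions are written out to keep the example self-contained and are assumed to match the ones in the surrounding test file, which this diff does not show):

  #Tdd = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>

  #trait2 = {
    indexing_maps = [
      affine_map<(i,j) -> (i,j)>,  // A
      affine_map<(i,j) -> (i,j)>,  // B
      affine_map<(i,j) -> (i,j)>   // X (out)
    ],
    iterator_types = ["parallel", "parallel"]
  }

  // Element-wise comparison with one sparse operand: X(i,j) = A(i,j) < B(i,j).
  func.func @cmp_dd(%arga: tensor<32x16xf32, #Tdd>,
                    %argb: tensor<32x16xf32>,
                    %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
    %0 = linalg.generic #trait2
        ins(%arga, %argb: tensor<32x16xf32, #Tdd>, tensor<32x16xf32>)
        outs(%argx: tensor<32x16xi1>) {
      ^bb(%a: f32, %b: f32, %x: i1):
        %c = arith.cmpf ult, %a, %b : f32
        linalg.yield %c : i1
    } -> tensor<32x16xi1>
    return %0 : tensor<32x16xi1>
  }

Where one operand has no stored entry at a coordinate, the merger now compares against a synthetic zero (the new kSynZero leaf), so the `x < 0` and `0 < y` regions of the disjoint iteration space are still generated (see disjSetWithZero in the changes below).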
@@ -45,7 +45,7 @@ struct TensorExp final {
// The `y`, `v`, and `op` parameters either must or must not be
// `kInvalidId`/`nullptr`, depending on the value of the `k` parameter;
// however, they have uniform C++ types regardless of the value of `k`.
TensorExp(Kind k, unsigned x, ExprId y, Value v, Operation *op);
TensorExp(Kind k, unsigned x, ExprId y, Value v, Operation *op, Attribute a);

/// Tensor expression kind.
Kind kind;
@@ -71,6 +71,10 @@ struct TensorExp final {
/// kBinaryBranch, this holds the YieldOp for the left or right half
/// to be merged into a nested scf loop.
Operation *op;

/// An optional attribute that is required to determine the semantics of the
/// operations. E.g., CmpPredicateAttr for CmpI/CmpF operations.
Attribute attr;
};

/// Tensor expression kind.
@@ -79,6 +83,10 @@ struct TensorExp final {
/// That is, its argument is a `LoopId` identifying the loop-variable
/// in question, and its value will be the current iteration's value
/// of that loop-variable. See the `LoopId` documentation for more details.
///
/// The `kSynZero` leaf kind is for representing a synthetic zero value, which
/// can be introduced when sparsifying operations like `arith::cmp` to generate
/// `arith::cmp %lhs, %syn_zero` when the rhs operand is absent.
//
// TODO: Modify this definition so that the numeric values already encode
// the `ExpArity` (while extending the notion of "arity" to include not
@@ -89,6 +97,7 @@ struct TensorExp final {
enum class TensorExp::Kind {
// Leaf.
kTensor = 0,
kSynZero,
kInvariant,
kLoopVar,
// Unary operations.
@@ -143,6 +152,8 @@ enum class TensorExp::Kind {
kAndI,
kOrI,
kXorI,
kCmpI,
kCmpF,
kShrS, // signed
kShrU, // unsigned
kShlI,
@@ -246,13 +257,16 @@ public:
ExprId addLoopVarExp(LoopId i);
/// Constructs a new invariant expression, and returns its identifier.
ExprId addInvariantExp(Value v);
/// Constructs a new synthetic zero expression.
ExprId addSynZeroExp();
/// Constructs a new unary or binary expression, and returns its identifier.
ExprId addExp(TensorExp::Kind k, ExprId e0, ExprId e1 = detail::kInvalidId,
Operation *op = nullptr);
Operation *op = nullptr, Attribute attr = nullptr);
/// Constructs a new sesquinary expression, and returns its identifier.
/// Currently no sesquinary `Kind` allows specifying the `op`, but we
/// allow it anyways because `mapSet` is designed to allow it.
ExprId addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op = nullptr);
ExprId addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op = nullptr,
Attribute attr = nullptr);

/// Constructs a new iteration lattice point, and returns its identifier.
LatPointId addLat(TensorId t, LoopId i, ExprId e);
@@ -265,26 +279,29 @@ public:
/// of `LoopId` (effectively constructing a larger "intersection" of those
/// loops) with a newly constructed tensor (sub)expression of given kind.
/// Returns the identifier of the new lattice point.
LatPointId conjLat(TensorExp::Kind kind, LatPointId p0, LatPointId p1,
LatPointId conjLat(ExprId e, LatPointId p0, LatPointId p1,
Operation *op = nullptr);

/// Conjunctive merge of two lattice sets: `(s0 /\_op s1)`.
/// Returns the identifier of the new set.
LatSetId conjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *op = nullptr);
LatSetId conjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op = nullptr);

/// Disjunctive merge of two lattice sets: `(s0 /\_op s1, s0, s1)`.
/// Returns the identifier of the new set.
LatSetId disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *op = nullptr);
LatSetId disjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op = nullptr);

/// Disjunctive merge of two lattice sets and also set one of the operand to
/// zero: `(s0 /\_op s1 (e0 op e1), s0 (0 op e0), s1 (e1 op 0))`.
/// Returns the identifier of the new set.
LatSetId disjSetWithZero(ExprId e, LatSetId s0, LatSetId s1);

/// Disjunctive merge of two lattice sets with custom handling of the
/// overlap, left, and right regions. Any region may be left missing
/// in the output. Returns the identifier of the new set.
LatSetId combiSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *orig, bool includeLeft, TensorExp::Kind ltrans,
Operation *opleft, bool includeRight,
TensorExp::Kind rtrans, Operation *opright);
LatSetId combiSet(ExprId e, LatSetId s0, LatSetId s1, Operation *orig,
bool includeLeft, TensorExp::Kind ltrans, Operation *opleft,
bool includeRight, TensorExp::Kind rtrans,
Operation *opright);

/// Maps the unary operator over the lattice set of the operand, i.e. each
/// lattice point on an expression E is simply copied over, but with OP E
@@ -292,6 +309,12 @@ public:
LatSetId mapSet(TensorExp::Kind kind, LatSetId s, Value v = Value(),
Operation *op = nullptr);

/// Maps the binary operator to the same operation but with one of its operand
/// set to zero, i.e. each lattice point on an expression E is simply copied
/// over, but with `OP 0 E` (if lhsZero == true) or `OP E 0` (if lhsZero ==
/// false) as new expression. Returns the identifier of the new set.
LatSetId mapBinWithSynZeroSet(ExprId e, LatSetId s, bool lhsZero);

/// Optimizes the iteration lattice points in the given set. This
/// method should be called right before code generation to avoid
/// generating redundant loops and conditions.

@@ -1154,11 +1154,11 @@ static Value relinkBranch(CodegenEnv &env, RewriterBase &rewriter, Block *block,
/// Recursively generates tensor expression.
static Value genExp(CodegenEnv &env, RewriterBase &rewriter, ExprId e,
LoopId ldx) {
linalg::GenericOp op = env.op();
Location loc = op.getLoc();

if (e == ::mlir::sparse_tensor::detail::kInvalidId)
return Value();

linalg::GenericOp op = env.op();
Location loc = op.getLoc();
const TensorExp &exp = env.exp(e);
const auto kind = exp.kind;
if (kind == TensorExp::Kind::kTensor)
@@ -1171,8 +1171,22 @@ static Value genExp(CodegenEnv &env, RewriterBase &rewriter, ExprId e,
if (kind == TensorExp::Kind::kReduce)
env.startCustomReduc(e); // enter custom

Value v0 = genExp(env, rewriter, exp.children.e0, ldx);
Value v1 = genExp(env, rewriter, exp.children.e1, ldx);
Value v0, v1;
// If either lhs/rhs is a synthetic zero, we infer the type for the zero value
// based on the type of the other operand.
if (exp.children.e0 != ::mlir::sparse_tensor::detail::kInvalidId &&
env.exp(exp.children.e0).kind == TensorExp::Kind::kSynZero) {
v1 = genExp(env, rewriter, exp.children.e1, ldx);
v0 = constantZero(rewriter, loc, v1.getType());
} else if (exp.children.e1 != ::mlir::sparse_tensor::detail::kInvalidId &&
env.exp(exp.children.e1).kind == TensorExp::Kind::kSynZero) {
v0 = genExp(env, rewriter, exp.children.e0, ldx);
v1 = constantZero(rewriter, loc, v0.getType());
} else {
v0 = genExp(env, rewriter, exp.children.e0, ldx);
v1 = genExp(env, rewriter, exp.children.e1, ldx);
}

Value ee;
if (kind == TensorExp::Kind::kReduce && (!v0 || !v1)) {
// custom reduce did not receive a value
@@ -1248,7 +1262,8 @@ static void genInvariants(CodegenEnv &env, OpBuilder &builder, ExprId exp,
env.merger().clearExprValue(exp);
}
} else if (env.exp(exp).kind != TensorExp::Kind::kInvariant &&
env.exp(exp).kind != TensorExp::Kind::kLoopVar) {
env.exp(exp).kind != TensorExp::Kind::kLoopVar &&
env.exp(exp).kind != TensorExp::Kind::kSynZero) {
// Traverse into the binary operations. Note that we only hoist
// tensor loads, since subsequent MLIR/LLVM passes know how to
// deal with all other kinds of derived loop invariants.

@@ -31,6 +31,7 @@ static ExpArity getExpArity(TensorExp::Kind k) {
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
case TensorExp::Kind::kSynZero:
return ExpArity::kNullary;
case TensorExp::Kind::kAbsF:
case TensorExp::Kind::kAbsC:
@@ -89,6 +90,8 @@ static ExpArity getExpArity(TensorExp::Kind k) {
case TensorExp::Kind::kSubF:
case TensorExp::Kind::kSubC:
case TensorExp::Kind::kSubI:
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
return ExpArity::kBinary;
}
llvm_unreachable("unexpected kind");
@@ -99,7 +102,7 @@ static ExpArity getExpArity(TensorExp::Kind k) {
//===----------------------------------------------------------------------===//

TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
Operation *o)
Operation *o, Attribute a)
: kind(k), val(v), op(o) {
switch (kind) {
// Leaf.
@@ -107,6 +110,9 @@ TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
assert(x != detail::kInvalidId && y == detail::kInvalidId && !v && !o);
tensor = x;
return;
case TensorExp::Kind::kSynZero:
assert(x == detail::kInvalidId && y == detail::kInvalidId && !v && !o);
return;
case TensorExp::Kind::kInvariant:
assert(x == detail::kInvalidId && y == detail::kInvalidId && v && !o);
return;
@@ -191,6 +197,13 @@ TensorExp::TensorExp(TensorExp::Kind k, unsigned x, ExprId y, Value v,
children.e0 = x;
children.e1 = y;
return;
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
assert(x != detail::kInvalidId && y != detail::kInvalidId && !v && !o);
attr = a;
children.e0 = x;
children.e1 = y;
return;
case TensorExp::Kind::kBinary:
case TensorExp::Kind::kReduce:
assert(x != detail::kInvalidId && y != detail::kInvalidId && !v && o);
@@ -228,7 +241,7 @@ ExprId Merger::addTensorExp(TensorId t) {
assert(isValidTensorId(t));
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kTensor, t, detail::kInvalidId,
Value(), nullptr);
Value(), nullptr, nullptr);
return eNew;
}

@@ -236,28 +249,37 @@ ExprId Merger::addLoopVarExp(LoopId i) {
assert(isValidLoopId(i));
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kLoopVar, i, detail::kInvalidId,
Value(), nullptr);
Value(), nullptr, nullptr);
return eNew;
}

ExprId Merger::addInvariantExp(Value v) {
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(TensorExp::Kind::kInvariant, detail::kInvalidId,
detail::kInvalidId, v, nullptr);
detail::kInvalidId, v, nullptr, nullptr);
return eNew;
}

ExprId Merger::addExp(TensorExp::Kind k, ExprId e0, ExprId e1, Operation *op) {
assert(k > TensorExp::Kind::kLoopVar);
ExprId Merger::addSynZeroExp() {
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(k, e0, e1, Value(), op);
tensorExps.emplace_back(TensorExp::Kind::kSynZero, detail::kInvalidId,
detail::kInvalidId, Value(), nullptr, nullptr);
return eNew;
}

ExprId Merger::addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op) {
ExprId Merger::addExp(TensorExp::Kind k, ExprId e0, ExprId e1, Operation *op,
Attribute attr) {
assert(k > TensorExp::Kind::kLoopVar);
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(k, e, detail::kInvalidId, v, op);
tensorExps.emplace_back(k, e0, e1, Value(), op, attr);
return eNew;
}

ExprId Merger::addExp(TensorExp::Kind k, ExprId e, Value v, Operation *op,
Attribute attr) {
assert(k > TensorExp::Kind::kLoopVar);
const ExprId eNew(tensorExps.size());
tensorExps.emplace_back(k, e, detail::kInvalidId, v, op, attr);
return eNew;
}

@@ -283,31 +305,33 @@ LatSetId Merger::addSet() {
return sNew;
}

LatPointId Merger::conjLat(TensorExp::Kind kind, LatPointId p0, LatPointId p1,
LatPointId Merger::conjLat(ExprId e, LatPointId p0, LatPointId p1,
Operation *op) {
TensorExp::Kind kind = exp(e).kind;
Attribute attr = exp(e).attr;
const LatPointId pNew(latPoints.size());
const auto &point0 = lat(p0);
const auto &point1 = lat(p1);
BitVector bits(point0.bits);
bits |= point1.bits;
const ExprId e = addExp(kind, point0.exp, point1.exp, op);
latPoints.emplace_back(bits, e);
const ExprId ne = addExp(kind, point0.exp, point1.exp, op, attr);
latPoints.emplace_back(bits, ne);
return pNew;
}

LatSetId Merger::conjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *op) {
LatSetId Merger::conjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op) {
const LatSetId sNew = addSet();
auto &setNew = latSets[sNew];
for (const LatPointId p0 : set(s0))
for (const LatPointId p1 : set(s1))
setNew.push_back(conjLat(kind, p0, p1, op));
setNew.push_back(conjLat(e, p0, p1, op));
return sNew;
}

LatSetId Merger::disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *op) {
const LatSetId sNew = conjSet(kind, s0, s1, op);
LatSetId Merger::disjSet(ExprId e, LatSetId s0, LatSetId s1, Operation *op) {
const LatSetId sNew = conjSet(e, s0, s1, op);
TensorExp::Kind kind = exp(e).kind;

// Followed by all in s0.
latSets[sNew].append(latSets[s0]);
// Map binary 0-y to unary -y.
@@ -323,12 +347,35 @@ LatSetId Merger::disjSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
return sNew;
}

LatSetId Merger::combiSet(TensorExp::Kind kind, LatSetId s0, LatSetId s1,
Operation *orig, bool includeLeft,
TensorExp::Kind ltrans, Operation *opleft,
bool includeRight, TensorExp::Kind rtrans,
Operation *opright) {
const LatSetId sNew = conjSet(kind, s0, s1, orig);
LatSetId Merger::disjSetWithZero(ExprId e, LatSetId s0, LatSetId s1) {
assert(exp(e).kind == TensorExp::Kind::kCmpI ||
exp(e).kind == TensorExp::Kind::kCmpF);
const LatSetId sNew = conjSet(e, s0, s1, nullptr);

ExprId e0 = exp(e).children.e0;
ExprId e1 = exp(e).children.e1;
if (exp(e0).kind == TensorExp::Kind::kSynZero ||
exp(e1).kind == TensorExp::Kind::kSynZero) {
// lhs and rhs can't be synthetic zero at the same time.
assert(exp(e0).kind != exp(e1).kind);
// If one of the operands has already been assigned to zero (the
// element is absent in the corresponding operand), then we do not
// need to build disjunctive set for it.
return sNew;
}

auto lhsSet = mapBinWithSynZeroSet(e, s0, false);
auto rhsSet = mapBinWithSynZeroSet(e, s1, true);
latSets[sNew].append(latSets[lhsSet]);
latSets[sNew].append(latSets[rhsSet]);
return sNew;
}

LatSetId Merger::combiSet(ExprId e, LatSetId s0, LatSetId s1, Operation *orig,
bool includeLeft, TensorExp::Kind ltrans,
Operation *opleft, bool includeRight,
TensorExp::Kind rtrans, Operation *opright) {
const LatSetId sNew = conjSet(e, s0, s1, orig);
// Left Region.
if (includeLeft) {
if (opleft)
@@ -356,6 +403,23 @@ LatSetId Merger::mapSet(TensorExp::Kind kind, LatSetId s0, Value v,
return sNew;
}

LatSetId Merger::mapBinWithSynZeroSet(ExprId e, LatSetId s0, bool lhsZero) {
TensorExp::Kind kind = exp(e).kind;
Attribute a = exp(e).attr;
assert(TensorExp::Kind::kMulF <= kind && kind <= TensorExp::Kind::kShlI);
// Must be a binary operation.
const LatSetId sNew = addSet();
auto &setNew = latSets[sNew];
const ExprId zeroExp = addSynZeroExp();
for (const LatPointId p : set(s0)) {
const auto &point = latPoints[p];
ExprId newExp = lhsZero ? addExp(kind, zeroExp, point.exp, nullptr, a)
: addExp(kind, point.exp, zeroExp, nullptr, a);
setNew.push_back(addLat(point.bits, newExp));
}
return sNew;
}

LatSetId Merger::optimizeSet(LatSetId s0) {
const LatSetId sNew = addSet();
auto &setNew = latSets[sNew];
@@ -418,7 +482,8 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
// Slice on dense level has `locate` property as well, and can be optimized.
if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
const auto dlt = getLvlType(b);
if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt) && !isCompressedWithHiDLT(dlt)) {
if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt) &&
!isCompressedWithHiDLT(dlt)) {
if (reset)
simple.reset(b);
reset = true;
@@ -505,6 +570,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
return expr.tensor == t;
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
case TensorExp::Kind::kSynZero:
return false;
// Unary operations.
case TensorExp::Kind::kAbsF:
@@ -576,6 +642,8 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
case TensorExp::Kind::kSubI:
case TensorExp::Kind::kOrI:
case TensorExp::Kind::kXorI:
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kBinary:
return false;
}
@@ -585,7 +653,8 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
bool Merger::hasAnySparse(const BitVector &bits) const {
for (TensorLoopId b : bits.set_bits()) {
const auto dlt = getLvlType(b);
if (isCompressedDLT(dlt) || isSingletonDLT(dlt) || isCompressedWithHiDLT(dlt))
if (isCompressedDLT(dlt) || isSingletonDLT(dlt) ||
isCompressedWithHiDLT(dlt))
return true;
}
return hasSparseIdxReduction(bits);
@@ -613,6 +682,8 @@ static const char *kindToOpSymbol(TensorExp::Kind kind) {
return "invariant";
case TensorExp::Kind::kLoopVar:
return "index";
case TensorExp::Kind::kSynZero:
return "0";
// Unary operations.
case TensorExp::Kind::kAbsF:
case TensorExp::Kind::kAbsC:
@@ -693,6 +764,9 @@ static const char *kindToOpSymbol(TensorExp::Kind kind) {
return ">>";
case TensorExp::Kind::kShlI:
return "<<";
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
return "cmp";
case TensorExp::Kind::kBinary:
return "binary";
case TensorExp::Kind::kReduce:
@@ -715,6 +789,9 @@ void Merger::dumpExp(ExprId e) const {
case TensorExp::Kind::kInvariant:
llvm::dbgs() << "invariant";
break;
case TensorExp::Kind::kSynZero:
llvm::dbgs() << "0";
break;
case TensorExp::Kind::kLoopVar:
llvm::dbgs() << "loopvar_" << expr.loop;
break;
@@ -776,11 +853,16 @@ void Merger::dumpExp(ExprId e) const {
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
case TensorExp::Kind::kShlI:
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kBinary:
case TensorExp::Kind::kReduce:
llvm::dbgs() << "(";
dumpExp(expr.children.e0);
llvm::dbgs() << " " << kindToOpSymbol(expr.kind) << " ";
llvm::dbgs() << " " << kindToOpSymbol(expr.kind);
if (expr.attr)
llvm::dbgs() << "{" << expr.attr << "}";
llvm::dbgs() << " ";
dumpExp(expr.children.e1);
llvm::dbgs() << ")";
break;
@@ -839,6 +921,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
// Leaf.
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kSynZero:
case TensorExp::Kind::kLoopVar: {
// Either the loop-var is really used in the tensor expression, or it is
// set to the undefined loop-var in that level. An invariant expression,
@@ -928,7 +1011,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
YieldOp absentYield = cast<YieldOp>(absentBlock.getTerminator());
const Value absentVal = absentYield.getResult();
const ExprId rhs = addInvariantExp(absentVal);
return disjSet(kind, child0, buildLattices(rhs, i), unop);
return disjSet(e, child0, buildLattices(rhs, i), unop);
}
// Binary operations.
case TensorExp::Kind::kMulF:
@@ -947,7 +1030,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
{
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kDivF:
case TensorExp::Kind::kDivC:
@@ -970,7 +1053,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
assert(!maybeZero(e1));
return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kAddF:
case TensorExp::Kind::kAddC:
@@ -990,7 +1073,21 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
{
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
return disjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
return disjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
// An comparison operation needs to be performed
// for the disjunction of sparse iteration spaces.
//
// x < y |  !y   |   y   |
// ------+-------+-------+
//   !x  |   0   | 0 < y |
//    x  | x < 0 | x < y |
{
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
return disjSetWithZero(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
@@ -1002,7 +1099,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
assert(isInvariant(e1));
return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i));
return conjSet(e, buildLattices(e0, i), buildLattices(e1, i));
}
case TensorExp::Kind::kBinary:
// A custom binary operation.
@@ -1033,9 +1130,9 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
}
bool includeLeft = binop.getLeftIdentity() || !leftRegion.empty();
bool includeRight = binop.getRightIdentity() || !rightRegion.empty();
return combiSet(TensorExp::Kind::kBinary, child0, child1, binop,
includeLeft, TensorExp::Kind::kBinaryBranch, leftYield,
includeRight, TensorExp::Kind::kBinaryBranch, rightYield);
return combiSet(e, child0, child1, binop, includeLeft,
TensorExp::Kind::kBinaryBranch, leftYield, includeRight,
TensorExp::Kind::kBinaryBranch, rightYield);
}
case TensorExp::Kind::kReduce:
// A custom reduce operation.
@@ -1043,7 +1140,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
const ExprId e0 = expr.children.e0;
const ExprId e1 = expr.children.e1;
Operation *const op = expr.op;
return conjSet(kind, buildLattices(e0, i), buildLattices(e1, i), op);
return conjSet(e, buildLattices(e0, i), buildLattices(e1, i), op);
}
}
llvm_unreachable("unexpected expression kind");
@@ -1261,6 +1358,37 @@ std::optional<ExprId> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
return addExp(TensorExp::Kind::kShrU, e0, e1);
if (isa<arith::ShLIOp>(def) && isInvariant(e1))
return addExp(TensorExp::Kind::kShlI, e0, e1);
if (auto ci = dyn_cast<arith::CmpIOp>(def)) {
if (ci.getPredicate() == arith::CmpIPredicate::eq &&
ci.getPredicate() == arith::CmpIPredicate::sle &&
ci.getPredicate() == arith::CmpIPredicate::sge &&
ci.getPredicate() == arith::CmpIPredicate::ule &&
ci.getPredicate() == arith::CmpIPredicate::uge) {
// We can not sparsify comparison with equal, this is because 0 <= 0
// yields true, and thus densifies the result.
return std::nullopt;
}

return addExp(TensorExp::Kind::kCmpI, e0, e1, nullptr,
ci.getPredicateAttr());
}
if (auto cf = dyn_cast<arith::CmpFOp>(def)) {
if (cf.getPredicate() == arith::CmpFPredicate::OEQ &&
cf.getPredicate() == arith::CmpFPredicate::OGE &&
cf.getPredicate() == arith::CmpFPredicate::OLE &&
cf.getPredicate() == arith::CmpFPredicate::ONE &&
cf.getPredicate() == arith::CmpFPredicate::UEQ &&
cf.getPredicate() == arith::CmpFPredicate::UGE &&
cf.getPredicate() == arith::CmpFPredicate::ULE &&
cf.getPredicate() == arith::CmpFPredicate::ORD &&
cf.getPredicate() == arith::CmpFPredicate::UNO) {
// We can not sparsify comparison with equal, this is because 0 <= 0
// yields true, and thus densifies the result.
return std::nullopt;
}
return addExp(TensorExp::Kind::kCmpF, e0, e1, nullptr,
cf.getPredicateAttr());
}
if (auto binop = dyn_cast<sparse_tensor::BinaryOp>(def)) {
if (isAdmissibleBranch(binop, binop.getOverlapRegion()) &&
(binop.getLeftIdentity() ||
@@ -1342,6 +1470,7 @@ Value Merger::buildExp(RewriterBase &rewriter, Location loc, ExprId e, Value v0,
case TensorExp::Kind::kTensor:
case TensorExp::Kind::kInvariant:
case TensorExp::Kind::kLoopVar:
case TensorExp::Kind::kSynZero:
llvm_unreachable("unexpected non-op");
// Unary operations.
case TensorExp::Kind::kAbsF:
@@ -1458,6 +1587,14 @@ Value Merger::buildExp(RewriterBase &rewriter, Location loc, ExprId e, Value v0,
return rewriter.create<arith::ShRUIOp>(loc, v0, v1);
case TensorExp::Kind::kShlI:
return rewriter.create<arith::ShLIOp>(loc, v0, v1);
case TensorExp::Kind::kCmpI: {
auto predicate = llvm::cast<arith::CmpIPredicateAttr>(expr.attr);
return rewriter.create<arith::CmpIOp>(loc, predicate, v0, v1);
}
case TensorExp::Kind::kCmpF: {
auto predicate = llvm::cast<arith::CmpFPredicateAttr>(expr.attr);
return rewriter.create<arith::CmpFOp>(loc, predicate, v0, v1);
}
case TensorExp::Kind::kBinaryBranch: // semi-ring ops with custom logic.
return insertYieldOp(rewriter, loc, *expr.op->getBlock()->getParent(),
{v0});

@@ -52,6 +52,43 @@ func.func @add_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg
|
||||
return %0 : tensor<32x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @cmp_dd(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
|
||||
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
|
||||
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
|
||||
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>> to memref<?xf32>
|
||||
// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
|
||||
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
|
||||
// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_10]] : memref<32x16xi1>)
|
||||
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index
|
||||
// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
|
||||
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xf32>
|
||||
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_11]], %[[VAL_12]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_17:.*]] = arith.cmpf ult, %[[VAL_15]], %[[VAL_16]] : f32
|
||||
// CHECK: memref.store %[[VAL_17]], %[[VAL_10]]{{\[}}%[[VAL_11]], %[[VAL_12]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_18:.*]] = bufferization.to_tensor %[[VAL_10]] : memref<32x16xi1>
|
||||
// CHECK: return %[[VAL_18]] : tensor<32x16xi1>
|
||||
// CHECK: }
|
||||
func.func @cmp_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
%0 = linalg.generic #trait2
|
||||
ins(%arga, %argb: tensor<32x16xf32, #Tdd>, tensor<32x16xf32>)
|
||||
outs(%argx: tensor<32x16xi1>) {
|
||||
^bb(%a: f32, %b: f32, %x: i1):
|
||||
%0 = arith.cmpf ult, %a, %b : f32
|
||||
linalg.yield %0 : i1
|
||||
} -> tensor<32x16xi1>
|
||||
return %0 : tensor<32x16xi1>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @mul_dd(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
@@ -151,6 +188,73 @@ func.func @add_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg
|
||||
return %0 : tensor<32x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @cmp_ds(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
|
||||
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
|
||||
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
|
||||
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
|
||||
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
|
||||
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf32>
|
||||
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
|
||||
// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
|
||||
// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
|
||||
// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_17]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
|
||||
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_18]] : index
|
||||
// CHECK: scf.condition(%[[VAL_22]]) %[[VAL_20]], %[[VAL_21]] : index, index
|
||||
// CHECK: } do {
|
||||
// CHECK: ^bb0(%[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index):
|
||||
// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
|
||||
// CHECK: scf.if %[[VAL_26]] {
|
||||
// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xf32>
|
||||
// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_29:.*]] = arith.cmpf ult, %[[VAL_27]], %[[VAL_28]] : f32
|
||||
// CHECK: memref.store %[[VAL_29]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xi1>
|
||||
// CHECK: } else {
|
||||
// CHECK: scf.if %[[VAL_8]] {
|
||||
// CHECK: %[[VAL_30:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_31:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_30]] : f32
|
||||
// CHECK: memref.store %[[VAL_31]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_24]]] : memref<32x16xi1>
|
||||
// CHECK: } else {
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_32:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
|
||||
// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_32]], %[[VAL_33]], %[[VAL_23]] : index
|
||||
// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_24]], %[[VAL_7]] : index
|
||||
// CHECK: scf.yield %[[VAL_34]], %[[VAL_35]] : index, index
|
||||
// CHECK: } attributes
|
||||
// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_37:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_36]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_39:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_38]] : f32
|
||||
// CHECK: memref.store %[[VAL_39]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_36]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_40:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<32x16xi1>
|
||||
// CHECK: return %[[VAL_40]] : tensor<32x16xi1>
|
||||
// CHECK: }
|
||||
func.func @cmp_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
%0 = linalg.generic #trait2
|
||||
ins(%arga, %argb: tensor<32x16xf32, #Tds>, tensor<32x16xf32>)
|
||||
outs(%argx: tensor<32x16xi1>) {
|
||||
^bb(%a: f32, %b: f32, %x: i1):
|
||||
%0 = arith.cmpf ult, %a, %b : f32
|
||||
linalg.yield %0 : i1
|
||||
} -> tensor<32x16xi1>
|
||||
return %0 : tensor<32x16xi1>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @mul_ds(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
@@ -258,6 +362,78 @@ func.func @add_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg
|
||||
return %0 : tensor<32x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @cmp_sd(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index
|
||||
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index
|
||||
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
|
||||
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
|
||||
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
|
||||
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>> to memref<?xf32>
|
||||
// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
|
||||
// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
|
||||
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_15]], %[[VAL_19:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
|
||||
// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_16]] : index
|
||||
// CHECK: scf.condition(%[[VAL_20]]) %[[VAL_18]], %[[VAL_19]] : index, index
|
||||
// CHECK: } do {
|
||||
// CHECK: ^bb0(%[[VAL_21:.*]]: index, %[[VAL_22:.*]]: index):
|
||||
// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_24:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_22]] : index
|
||||
// CHECK: scf.if %[[VAL_24]] {
|
||||
// CHECK: scf.for %[[VAL_25:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_26:.*]] = arith.muli %[[VAL_21]], %[[VAL_3]] : index
|
||||
// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_25]] : index
|
||||
// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_27]]] : memref<?xf32>
|
||||
// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_25]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_30:.*]] = arith.cmpf ult, %[[VAL_28]], %[[VAL_29]] : f32
|
||||
// CHECK: memref.store %[[VAL_30]], %[[VAL_14]]{{\[}}%[[VAL_22]], %[[VAL_25]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: } else {
|
||||
// CHECK: scf.if %[[VAL_8]] {
|
||||
// CHECK: scf.for %[[VAL_31:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_33:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_32]] : f32
|
||||
// CHECK: memref.store %[[VAL_33]], %[[VAL_14]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: } else {
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_34:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_22]] : index
|
||||
// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_21]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_36:.*]] = arith.select %[[VAL_34]], %[[VAL_35]], %[[VAL_21]] : index
|
||||
// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_22]], %[[VAL_7]] : index
|
||||
// CHECK: scf.yield %[[VAL_36]], %[[VAL_37]] : index, index
|
||||
// CHECK: } attributes
|
||||
// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_39:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: scf.for %[[VAL_40:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_41:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_38]], %[[VAL_40]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_42:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_41]] : f32
|
||||
// CHECK: memref.store %[[VAL_42]], %[[VAL_14]]{{\[}}%[[VAL_38]], %[[VAL_40]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_43:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<32x16xi1>
|
||||
// CHECK: return %[[VAL_43]] : tensor<32x16xi1>
|
||||
// CHECK: }
|
||||
func.func @cmp_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
%0 = linalg.generic #trait2
|
||||
ins(%arga, %argb: tensor<32x16xf32, #Tsd>, tensor<32x16xf32>)
|
||||
outs(%argx: tensor<32x16xi1>) {
|
||||
^bb(%a: f32, %b: f32, %x: i1):
|
||||
%0 = arith.cmpf ult, %a, %b : f32
|
||||
linalg.yield %0 : i1
|
||||
} -> tensor<32x16xi1>
|
||||
return %0 : tensor<32x16xi1>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @mul_sd(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
@@ -392,6 +568,106 @@ func.func @add_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg
|
||||
return %0 : tensor<32x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @cmp_ss(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
|
||||
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
|
||||
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
|
||||
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
|
||||
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
|
||||
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
|
||||
// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
|
||||
// CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_16]] : memref<32x16xi1>)
|
||||
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_17]], %[[VAL_21:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
|
||||
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_18]] : index
|
||||
// CHECK: scf.condition(%[[VAL_22]]) %[[VAL_20]], %[[VAL_21]] : index, index
|
||||
// CHECK: } do {
|
||||
// CHECK: ^bb0(%[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index):
|
||||
// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_23]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_26:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
|
||||
// CHECK: scf.if %[[VAL_26]] {
|
||||
// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_28]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_30:.*]]:2 = scf.while (%[[VAL_31:.*]] = %[[VAL_27]], %[[VAL_32:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
|
||||
// CHECK: %[[VAL_33:.*]] = arith.cmpi ult, %[[VAL_31]], %[[VAL_29]] : index
|
||||
// CHECK: scf.condition(%[[VAL_33]]) %[[VAL_31]], %[[VAL_32]] : index, index
|
||||
// CHECK: } do {
|
||||
// CHECK: ^bb0(%[[VAL_34:.*]]: index, %[[VAL_35:.*]]: index):
|
||||
// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_34]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_37:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_35]] : index
|
||||
// CHECK: scf.if %[[VAL_37]] {
|
||||
// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_34]]] : memref<?xf32>
|
||||
// CHECK: %[[VAL_39:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_40:.*]] = arith.cmpf ult, %[[VAL_38]], %[[VAL_39]] : f32
|
||||
// CHECK: memref.store %[[VAL_40]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xi1>
|
||||
// CHECK: } else {
|
||||
// CHECK: scf.if %[[VAL_8]] {
|
||||
// CHECK: %[[VAL_41:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_42:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_41]] : f32
|
||||
// CHECK: memref.store %[[VAL_42]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_35]]] : memref<32x16xi1>
|
||||
// CHECK: } else {
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_43:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_35]] : index
|
||||
// CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_34]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_43]], %[[VAL_44]], %[[VAL_34]] : index
|
||||
// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_35]], %[[VAL_7]] : index
|
||||
// CHECK: scf.yield %[[VAL_45]], %[[VAL_46]] : index, index
|
||||
// CHECK: } attributes
|
||||
// CHECK: scf.for %[[VAL_47:.*]] = %[[VAL_48:.*]]#1 to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_49:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_47]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_50:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_49]] : f32
|
||||
// CHECK: memref.store %[[VAL_50]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_47]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: } else {
|
||||
// CHECK: scf.if %[[VAL_8]] {
|
||||
// CHECK: scf.for %[[VAL_51:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_52:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_24]], %[[VAL_51]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_53:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_52]] : f32
|
||||
// CHECK: memref.store %[[VAL_53]], %[[VAL_16]]{{\[}}%[[VAL_24]], %[[VAL_51]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: } else {
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_54:.*]] = arith.cmpi eq, %[[VAL_25]], %[[VAL_24]] : index
|
||||
// CHECK: %[[VAL_55:.*]] = arith.addi %[[VAL_23]], %[[VAL_7]] : index
|
||||
// CHECK: %[[VAL_56:.*]] = arith.select %[[VAL_54]], %[[VAL_55]], %[[VAL_23]] : index
|
||||
// CHECK: %[[VAL_57:.*]] = arith.addi %[[VAL_24]], %[[VAL_7]] : index
|
||||
// CHECK: scf.yield %[[VAL_56]], %[[VAL_57]] : index, index
|
||||
// CHECK: } attributes
|
||||
// CHECK: scf.for %[[VAL_58:.*]] = %[[VAL_59:.*]]#1 to %[[VAL_3]] step %[[VAL_7]] {
|
||||
// CHECK: scf.for %[[VAL_60:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
|
||||
// CHECK: %[[VAL_61:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_58]], %[[VAL_60]]] : memref<32x16xf32>
|
||||
// CHECK: %[[VAL_62:.*]] = arith.cmpf ult, %[[VAL_9]], %[[VAL_61]] : f32
|
||||
// CHECK: memref.store %[[VAL_62]], %[[VAL_16]]{{\[}}%[[VAL_58]], %[[VAL_60]]] : memref<32x16xi1>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: %[[VAL_63:.*]] = bufferization.to_tensor %[[VAL_16]] : memref<32x16xi1>
|
||||
// CHECK: return %[[VAL_63]] : tensor<32x16xi1>
|
||||
// CHECK: }
|
||||
func.func @cmp_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
%0 = linalg.generic #trait2
|
||||
ins(%arga, %argb: tensor<32x16xf32, #Tss>, tensor<32x16xf32>)
|
||||
outs(%argx: tensor<32x16xi1>) {
|
||||
^bb(%a: f32, %b: f32, %x: i1):
|
||||
%0 = arith.cmpf ult, %a, %b : f32
|
||||
linalg.yield %0 : i1
|
||||
} -> tensor<32x16xi1>
|
||||
return %0 : tensor<32x16xi1>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @mul_ss(
|
||||
// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>,
|
||||
@@ -599,6 +875,180 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
|
||||
return %0 : tensor<32x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @cmp_ss_ss(
|
||||
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>,
|
||||
// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xi1>) -> tensor<32x16xi1> {
|
||||
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant false
|
||||
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
|
||||
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
|
||||
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xindex>
|
||||
// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref<?xf32>
|
||||
// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
|
||||
// CHECK: linalg.fill ins(%[[VAL_3]] : i1) outs(%[[VAL_17]] : memref<32x16xi1>)
|
||||
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
|
||||
// CHECK: %[[VAL_22:.*]]:2 = scf.while (%[[VAL_23:.*]] = %[[VAL_18]], %[[VAL_24:.*]] = %[[VAL_20]]) : (index, index) -> (index, index) {
|
||||
// CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_19]] : index
|
||||
// CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_21]] : index
|
||||
// CHECK: %[[VAL_27:.*]] = arith.andi %[[VAL_25]], %[[VAL_26]] : i1
|
||||
// CHECK: scf.condition(%[[VAL_27]]) %[[VAL_23]], %[[VAL_24]] : index, index
|
||||
// CHECK: } do {
// CHECK: ^bb0(%[[VAL_28:.*]]: index, %[[VAL_29:.*]]: index):
// CHECK: %[[VAL_30:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_32:.*]] = arith.cmpi ult, %[[VAL_31]], %[[VAL_30]] : index
// CHECK: %[[VAL_33:.*]] = arith.select %[[VAL_32]], %[[VAL_31]], %[[VAL_30]] : index
// CHECK: %[[VAL_34:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
// CHECK: %[[VAL_35:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
// CHECK: %[[VAL_36:.*]] = arith.andi %[[VAL_34]], %[[VAL_35]] : i1
// CHECK: scf.if %[[VAL_36]] {
// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
// CHECK: %[[VAL_39:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_38]]] : memref<?xindex>
// CHECK: %[[VAL_40:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
// CHECK: %[[VAL_42:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_41]]] : memref<?xindex>
// CHECK: %[[VAL_43:.*]]:2 = scf.while (%[[VAL_44:.*]] = %[[VAL_37]], %[[VAL_45:.*]] = %[[VAL_40]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_46:.*]] = arith.cmpi ult, %[[VAL_44]], %[[VAL_39]] : index
// CHECK: %[[VAL_47:.*]] = arith.cmpi ult, %[[VAL_45]], %[[VAL_42]] : index
// CHECK: %[[VAL_48:.*]] = arith.andi %[[VAL_46]], %[[VAL_47]] : i1
// CHECK: scf.condition(%[[VAL_48]]) %[[VAL_44]], %[[VAL_45]] : index, index
// CHECK: } do {
// CHECK: ^bb0(%[[VAL_49:.*]]: index, %[[VAL_50:.*]]: index):
// CHECK: %[[VAL_51:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_49]]] : memref<?xindex>
// CHECK: %[[VAL_52:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_50]]] : memref<?xindex>
// CHECK: %[[VAL_53:.*]] = arith.cmpi ult, %[[VAL_52]], %[[VAL_51]] : index
// CHECK: %[[VAL_54:.*]] = arith.select %[[VAL_53]], %[[VAL_52]], %[[VAL_51]] : index
// CHECK: %[[VAL_55:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
// CHECK: %[[VAL_56:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
// CHECK: %[[VAL_57:.*]] = arith.andi %[[VAL_55]], %[[VAL_56]] : i1
// CHECK: scf.if %[[VAL_57]] {
// CHECK: %[[VAL_58:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_49]]] : memref<?xf32>
// CHECK: %[[VAL_59:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_50]]] : memref<?xf32>
// CHECK: %[[VAL_60:.*]] = arith.cmpf ult, %[[VAL_58]], %[[VAL_59]] : f32
// CHECK: memref.store %[[VAL_60]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
// CHECK: } else {
// CHECK: %[[VAL_61:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
// CHECK: scf.if %[[VAL_61]] {
// CHECK: %[[VAL_62:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_49]]] : memref<?xf32>
// CHECK: %[[VAL_63:.*]] = arith.cmpf ult, %[[VAL_62]], %[[VAL_6]] : f32
// CHECK: memref.store %[[VAL_63]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
// CHECK: } else {
// CHECK: %[[VAL_64:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
// CHECK: scf.if %[[VAL_64]] {
// CHECK: %[[VAL_65:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_50]]] : memref<?xf32>
// CHECK: %[[VAL_66:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_65]] : f32
// CHECK: memref.store %[[VAL_66]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_54]]] : memref<32x16xi1>
// CHECK: } else {
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_67:.*]] = arith.cmpi eq, %[[VAL_51]], %[[VAL_54]] : index
// CHECK: %[[VAL_68:.*]] = arith.addi %[[VAL_49]], %[[VAL_5]] : index
// CHECK: %[[VAL_69:.*]] = arith.select %[[VAL_67]], %[[VAL_68]], %[[VAL_49]] : index
// CHECK: %[[VAL_70:.*]] = arith.cmpi eq, %[[VAL_52]], %[[VAL_54]] : index
// CHECK: %[[VAL_71:.*]] = arith.addi %[[VAL_50]], %[[VAL_5]] : index
// CHECK: %[[VAL_72:.*]] = arith.select %[[VAL_70]], %[[VAL_71]], %[[VAL_50]] : index
// CHECK: scf.yield %[[VAL_69]], %[[VAL_72]] : index, index
// CHECK: } attributes
// CHECK: scf.for %[[VAL_73:.*]] = %[[VAL_74:.*]]#0 to %[[VAL_39]] step %[[VAL_5]] {
// CHECK: %[[VAL_75:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_73]]] : memref<?xindex>
// CHECK: %[[VAL_76:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_73]]] : memref<?xf32>
// CHECK: %[[VAL_77:.*]] = arith.cmpf ult, %[[VAL_76]], %[[VAL_6]] : f32
// CHECK: memref.store %[[VAL_77]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_75]]] : memref<32x16xi1>
// CHECK: }
// CHECK: scf.for %[[VAL_78:.*]] = %[[VAL_79:.*]]#1 to %[[VAL_42]] step %[[VAL_5]] {
// CHECK: %[[VAL_80:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_78]]] : memref<?xindex>
// CHECK: %[[VAL_81:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_78]]] : memref<?xf32>
// CHECK: %[[VAL_82:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_81]] : f32
// CHECK: memref.store %[[VAL_82]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_80]]] : memref<32x16xi1>
// CHECK: }
// CHECK: } else {
// CHECK: %[[VAL_83:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
// CHECK: scf.if %[[VAL_83]] {
// CHECK: %[[VAL_84:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_85:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
// CHECK: %[[VAL_86:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_85]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_87:.*]] = %[[VAL_84]] to %[[VAL_86]] step %[[VAL_5]] {
// CHECK: %[[VAL_88:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_87]]] : memref<?xindex>
// CHECK: %[[VAL_89:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_87]]] : memref<?xf32>
// CHECK: %[[VAL_90:.*]] = arith.cmpf ult, %[[VAL_89]], %[[VAL_6]] : f32
// CHECK: memref.store %[[VAL_90]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_88]]] : memref<32x16xi1>
// CHECK: }
// CHECK: } else {
// CHECK: %[[VAL_91:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
// CHECK: scf.if %[[VAL_91]] {
// CHECK: %[[VAL_92:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_93:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
// CHECK: %[[VAL_94:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_93]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_95:.*]] = %[[VAL_92]] to %[[VAL_94]] step %[[VAL_5]] {
// CHECK: %[[VAL_96:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_95]]] : memref<?xindex>
// CHECK: %[[VAL_97:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_95]]] : memref<?xf32>
// CHECK: %[[VAL_98:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_97]] : f32
// CHECK: memref.store %[[VAL_98]], %[[VAL_17]]{{\[}}%[[VAL_33]], %[[VAL_96]]] : memref<32x16xi1>
// CHECK: }
// CHECK: } else {
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_99:.*]] = arith.cmpi eq, %[[VAL_30]], %[[VAL_33]] : index
// CHECK: %[[VAL_100:.*]] = arith.addi %[[VAL_28]], %[[VAL_5]] : index
// CHECK: %[[VAL_101:.*]] = arith.select %[[VAL_99]], %[[VAL_100]], %[[VAL_28]] : index
// CHECK: %[[VAL_102:.*]] = arith.cmpi eq, %[[VAL_31]], %[[VAL_33]] : index
// CHECK: %[[VAL_103:.*]] = arith.addi %[[VAL_29]], %[[VAL_5]] : index
// CHECK: %[[VAL_104:.*]] = arith.select %[[VAL_102]], %[[VAL_103]], %[[VAL_29]] : index
// CHECK: scf.yield %[[VAL_101]], %[[VAL_104]] : index, index
// CHECK: } attributes
// CHECK: scf.for %[[VAL_105:.*]] = %[[VAL_106:.*]]#0 to %[[VAL_19]] step %[[VAL_5]] {
// CHECK: %[[VAL_107:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_105]]] : memref<?xindex>
// CHECK: %[[VAL_108:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_105]]] : memref<?xindex>
// CHECK: %[[VAL_109:.*]] = arith.addi %[[VAL_105]], %[[VAL_5]] : index
// CHECK: %[[VAL_110:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_109]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_111:.*]] = %[[VAL_108]] to %[[VAL_110]] step %[[VAL_5]] {
// CHECK: %[[VAL_112:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_111]]] : memref<?xindex>
// CHECK: %[[VAL_113:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_111]]] : memref<?xf32>
// CHECK: %[[VAL_114:.*]] = arith.cmpf ult, %[[VAL_113]], %[[VAL_6]] : f32
// CHECK: memref.store %[[VAL_114]], %[[VAL_17]]{{\[}}%[[VAL_107]], %[[VAL_112]]] : memref<32x16xi1>
// CHECK: }
// CHECK: }
// CHECK: scf.for %[[VAL_115:.*]] = %[[VAL_116:.*]]#1 to %[[VAL_21]] step %[[VAL_5]] {
// CHECK: %[[VAL_117:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_115]]] : memref<?xindex>
// CHECK: %[[VAL_118:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_115]]] : memref<?xindex>
// CHECK: %[[VAL_119:.*]] = arith.addi %[[VAL_115]], %[[VAL_5]] : index
// CHECK: %[[VAL_120:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_119]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_121:.*]] = %[[VAL_118]] to %[[VAL_120]] step %[[VAL_5]] {
// CHECK: %[[VAL_122:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_121]]] : memref<?xindex>
// CHECK: %[[VAL_123:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_121]]] : memref<?xf32>
// CHECK: %[[VAL_124:.*]] = arith.cmpf ult, %[[VAL_6]], %[[VAL_123]] : f32
// CHECK: memref.store %[[VAL_124]], %[[VAL_17]]{{\[}}%[[VAL_117]], %[[VAL_122]]] : memref<32x16xi1>
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_125:.*]] = bufferization.to_tensor %[[VAL_17]] : memref<32x16xi1>
// CHECK: return %[[VAL_125]] : tensor<32x16xi1>
// CHECK: }
func.func @cmp_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #Tss>, %argx: tensor<32x16xi1>) -> tensor<32x16xi1> {
%0 = linalg.generic #trait2
ins(%arga, %argb: tensor<32x16xf32, #Tss>, tensor<32x16xf32, #Tss>)
outs(%argx: tensor<32x16xi1>) {
^bb(%a: f32, %b: f32, %x: i1):
%0 = arith.cmpf ult, %a, %b : f32
linalg.yield %0 : i1
} -> tensor<32x16xi1>
return %0 : tensor<32x16xi1>
}
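
The CHECK lines above cover the three regions the sparsifier generates for a comparison of two sparse operands. A minimal sketch of the scalar comparison emitted in each region, assuming %a and %b stand for the loaded stored values and %zero names the f32 zero constant matched as %[[VAL_6]] above (the names are illustrative, not taken from the generated IR):

// Both operands stored at (i, j): compare the two stored values.
%both = arith.cmpf ult, %a, %b : f32
// Only the left operand stored: the right-hand side is a synthetic zero.
%left = arith.cmpf ult, %a, %zero : f32
// Only the right operand stored: the left-hand side is a synthetic zero.
%right = arith.cmpf ult, %zero, %b : f32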

#BatchedVector = #sparse_tensor.encoding<{
lvlTypes = [ "dense", "compressed-hi" ],
}>
@ -671,22 +1121,22 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
// CHECK: %[[VAL_60:.*]] = arith.addi %[[VAL_31]], %[[VAL_4]] : index
// CHECK: %[[VAL_61:.*]] = arith.select %[[VAL_59]], %[[VAL_60]], %[[VAL_31]] : index
// CHECK: scf.yield %[[VAL_58]], %[[VAL_61]], %[[VAL_62:.*]] : index, index, tensor<2x3xf64, #{{.*}}>>
// CHECK: } attributes {"Emitted from" = "linalg.generic"}
// CHECK: } attributes
// CHECK: %[[VAL_63:.*]] = scf.for %[[VAL_64:.*]] = %[[VAL_65:.*]]#0 to %[[VAL_18]] step %[[VAL_4]] iter_args(%[[VAL_66:.*]] = %[[VAL_65]]#2)
// CHECK: %[[VAL_67:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_64]]] : memref<?xindex>
// CHECK: %[[VAL_68:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_64]]] : memref<?xf64>
// CHECK: %[[VAL_69:.*]] = sparse_tensor.insert %[[VAL_68]] into %[[VAL_66]]{{\[}}%[[VAL_13]], %[[VAL_67]]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: scf.yield %[[VAL_69]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: }
// CHECK: %[[VAL_70:.*]] = scf.for %[[VAL_71:.*]] = %[[VAL_72:.*]]#1 to %[[VAL_22]] step %[[VAL_4]] iter_args(%[[VAL_73:.*]] = %[[VAL_74:.*]])
// CHECK: %[[VAL_75:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_71]]] : memref<?xindex>
// CHECK: %[[VAL_76:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_71]]] : memref<?xf64>
// CHECK: %[[VAL_77:.*]] = arith.negf %[[VAL_76]] : f64
// CHECK: %[[VAL_78:.*]] = sparse_tensor.insert %[[VAL_77]] into %[[VAL_73]]{{\[}}%[[VAL_13]], %[[VAL_75]]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: scf.yield %[[VAL_78]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: }
// CHECK: scf.yield %[[VAL_79:.*]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: }
// CHECK: %[[VAL_80:.*]] = sparse_tensor.load %[[VAL_81:.*]] hasInserts : tensor<2x3xf64, #{{.*}}>>
// CHECK: return %[[VAL_80]] : tensor<2x3xf64, #{{.*}}>>
// CHECK: }
@ -1140,9 +1590,9 @@ func.func @scale(%arga: tensor<?x?xf64, #Tds>, %argx: tensor<?x?xf64>) -> tensor
// CHECK: %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] : f32
// CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_26]], %[[VAL_30]] : f32
// CHECK: memref.store %[[VAL_31]], %[[VAL_14]]{{\[}}%[[VAL_18]], %[[VAL_25]]] : memref<?x?xf32>
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<?x?xf32>
// CHECK: return %[[VAL_32]] : tensor<?x?xf32>
// CHECK: }
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir (new file, 146 lines)
@ -0,0 +1,146 @@
// DEFINE: %{option} = "enable-runtime-library=false"
// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
// DEFINE: %{run} = mlir-cpu-runner \
// DEFINE: -e entry -entry-point-result=void \
// DEFINE: -shared-libs=%mlir_c_runner_utils | \
// DEFINE: FileCheck %s
//
// RUN: %{compile} | %{run}
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
// RUN: %{compile} | %{run}

// Do the same run, but now with direct IR generation and, if available, VLA
// vectorization.
// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \
// REDEFINE: --entry-function=entry_lli \
// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \
// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \
// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
// REDEFINE: FileCheck %s
// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run}

#DCSR = #sparse_tensor.encoding<{
lvlTypes = [ "compressed", "compressed" ]
}>

#trait = {
indexing_maps = [
affine_map<(i,j) -> (i,j)>, // A
affine_map<(i,j) -> (i,j)>, // B
affine_map<(i,j) -> (i,j)> // x (out)
],
iterator_types = ["parallel", "parallel"],
doc = "X(i, j) = cmp A(i,j) B(i, j)"
}

//
// Integration test that lowers a kernel annotated as sparse to
// actual sparse code, initializes matching sparse storage schemes
// from dense constants, and runs the resulting code with the JIT compiler.
//
module {
func.func @cmp_all_dense(%arga: tensor<4x4xf64>,
%argb: tensor<4x4xf64>,
%argx: tensor<4x4xi8>) -> tensor<4x4xi8> {
%0 = linalg.generic #trait
ins(%arga, %argb: tensor<4x4xf64>, tensor<4x4xf64>)
outs(%argx: tensor<4x4xi8>) {
^bb(%a: f64, %b: f64, %x: i8):
%0 = arith.cmpf ult, %a, %b : f64
%1 = arith.extui %0 : i1 to i8
linalg.yield %1 : i8
} -> tensor<4x4xi8>
return %0 : tensor<4x4xi8>
}

func.func @cmp_lhs_sparse(%arga: tensor<4x4xf64, #DCSR>,
%argb: tensor<4x4xf64>) -> tensor<4x4xi8, #DCSR> {
%argx = bufferization.alloc_tensor() : tensor<4x4xi8, #DCSR>
%0 = linalg.generic #trait
ins(%arga, %argb: tensor<4x4xf64, #DCSR>, tensor<4x4xf64>)
outs(%argx: tensor<4x4xi8, #DCSR>) {
^bb(%a: f64, %b: f64, %x: i8):
%0 = arith.cmpf ult, %a, %b : f64
%1 = arith.extui %0 : i1 to i8
linalg.yield %1 : i8
} -> tensor<4x4xi8, #DCSR>
return %0 : tensor<4x4xi8, #DCSR>
}

func.func @cmp_all_sparse(%arga: tensor<4x4xf64, #DCSR>,
%argb: tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR> {
%argx = bufferization.alloc_tensor() : tensor<4x4xi8, #DCSR>
%0 = linalg.generic #trait
ins(%arga, %argb: tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>)
outs(%argx: tensor<4x4xi8, #DCSR>) {
^bb(%a: f64, %b: f64, %x: i8):
%0 = arith.cmpf ult, %a, %b : f64
%1 = arith.extui %0 : i1 to i8
linalg.yield %1 : i8
} -> tensor<4x4xi8, #DCSR>
return %0 : tensor<4x4xi8, #DCSR>
}

//
// Main driver that constructs the input matrices and calls the sparse kernels
// to perform element-wise comparisons.
//
func.func @entry() {
%d0 = arith.constant 0 : i8
%c0 = arith.constant 0 : index

%lhs_dn = arith.constant dense<
[ [ 0.0, 0.0, 1.5, 1.0],
[ 0.0, 3.5, 0.0, 0.0],
[ 1.0, 5.0, 2.0, 0.0],
[ 1.0, 0.5, 0.0, 0.0] ]> : tensor<4x4xf64>

%rhs_dn = arith.constant dense<
[ [ 0.0, 1.5, 1.0, 1.5],
[ 3.5, 0.0, 0.0, 0.0],
[ 5.0, 2.0, 0.0, 2.0],
[ 0.5, 0.0, 0.0, 0.0] ]> : tensor<4x4xf64>

%lhs_sp = sparse_tensor.convert %lhs_dn : tensor<4x4xf64> to tensor<4x4xf64, #DCSR>
%rhs_sp = sparse_tensor.convert %rhs_dn : tensor<4x4xf64> to tensor<4x4xf64, #DCSR>

%output = arith.constant dense<0> : tensor<4x4xi8>
%all_dn_out = call @cmp_all_dense(%lhs_dn, %rhs_dn, %output)
: (tensor<4x4xf64>, tensor<4x4xf64>, tensor<4x4xi8>) -> tensor<4x4xi8>
%lhs_sp_out = call @cmp_lhs_sparse(%lhs_sp, %rhs_dn)
: (tensor<4x4xf64, #DCSR>, tensor<4x4xf64>) -> tensor<4x4xi8, #DCSR>
%all_sp_out = call @cmp_all_sparse(%lhs_sp, %rhs_sp)
: (tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR>

//
// All should have the same result.
//
// CHECK-COUNT-3: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
%v = vector.transfer_read %all_dn_out[%c0, %c0], %d0
: tensor<4x4xi8>, vector<4x4xi8>
vector.print %v : vector<4x4xi8>

%lhs_sp_ret = sparse_tensor.convert %lhs_sp_out
: tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
%v1 = vector.transfer_read %lhs_sp_ret[%c0, %c0], %d0
: tensor<4x4xi8>, vector<4x4xi8>
vector.print %v1 : vector<4x4xi8>

%rhs_sp_ret = sparse_tensor.convert %all_sp_out
: tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
%v2 = vector.transfer_read %rhs_sp_ret[%c0, %c0], %d0
: tensor<4x4xi8>, vector<4x4xi8>
vector.print %v2 : vector<4x4xi8>

bufferization.dealloc_tensor %lhs_sp : tensor<4x4xf64, #DCSR>
bufferization.dealloc_tensor %rhs_sp : tensor<4x4xf64, #DCSR>
bufferization.dealloc_tensor %lhs_sp_out : tensor<4x4xi8, #DCSR>
bufferization.dealloc_tensor %all_sp_out : tensor<4x4xi8, #DCSR>

return
}
}
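
As a quick hand check of the CHECK-COUNT-3 expectation above, the expected bitmap follows from an element-wise ult comparison of the %lhs_dn and %rhs_dn constants defined in @entry (worked out row by row from those values):

// Row 0: 0.0 < 0.0 -> 0, 0.0 < 1.5 -> 1, 1.5 < 1.0 -> 0, 1.0 < 1.5 -> 1  ==> ( 0, 1, 0, 1 )
// Row 1: 0.0 < 3.5 -> 1, 3.5 < 0.0 -> 0, 0.0 < 0.0 -> 0, 0.0 < 0.0 -> 0  ==> ( 1, 0, 0, 0 )
// Row 2: 1.0 < 5.0 -> 1, 5.0 < 2.0 -> 0, 2.0 < 0.0 -> 0, 0.0 < 2.0 -> 1  ==> ( 1, 0, 0, 1 )
// Row 3: 1.0 < 0.5 -> 0, 0.5 < 0.0 -> 0, 0.0 < 0.0 -> 0, 0.0 < 0.0 -> 0  ==> ( 0, 0, 0, 0 )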

@ -34,7 +34,9 @@ namespace {
DO(subi, TensorExp::Kind::kSubI) \
DO(andi, TensorExp::Kind::kAndI) \
DO(xori, TensorExp::Kind::kXorI) \
DO(ori, TensorExp::Kind::kOrI)
DO(ori, TensorExp::Kind::kOrI) \
DO(cmpf, TensorExp::Kind::kCmpF) \
DO(cmpi, TensorExp::Kind::kCmpI)

// TODO: Disjunctive binary operations that need special handling are not
// included; e.g., division is not tested (for now) as it needs a constant
@ -109,6 +111,7 @@ struct Pattern {
/// Constructors.
/// Rather than using these, please use the readable helper constructor
/// functions below to make tests more readable.
Pattern() : kind(TensorExp::Kind::kSynZero) {}
Pattern(TensorId tid) : kind(TensorExp::Kind::kTensor), tid(tid) {}
Pattern(TensorExp::Kind kind, PatternRef e0, PatternRef e1)
: kind(kind), children(e0, e1) {
@ -122,6 +125,7 @@ struct Pattern {
///

static Pattern tensorPattern(TensorId tid) { return Pattern(tid); }
static Pattern synZeroPattern() { return Pattern(); }

#define IMPL_BINOP_PATTERN(OP, KIND) \
LLVM_ATTRIBUTE_UNUSED static Pattern OP##Pattern(PatternRef e0, \
@ -232,6 +236,9 @@ protected:
// Leaf.
case TensorExp::Kind::kTensor:
return tensorExp.tensor == pattern.tid;
case TensorExp::Kind::kSynZero:
// Already checked kind equivalence @L233
return true;
case TensorExp::Kind::kInvariant:
llvm_unreachable("invariant not handled yet");
case TensorExp::Kind::kLoopVar:
@ -289,6 +296,8 @@ protected:
case TensorExp::Kind::kAndI:
case TensorExp::Kind::kOrI:
case TensorExp::Kind::kXorI:
case TensorExp::Kind::kCmpF:
case TensorExp::Kind::kCmpI:
case TensorExp::Kind::kShrS:
case TensorExp::Kind::kShrU:
case TensorExp::Kind::kShlI:
@ -752,6 +761,79 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ)

FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ)

/// Vector element-wise comparison (disjunction) of 2 vectors, i.e.,
/// a(i) = b(i) cmp c(i)
/// which should form the 3 lattice points
/// {
/// lat( i_00 i_01 / (tensor_0 cmp tensor_1) )
/// lat( i_00 / tensor_0 cmp 0 )
/// lat( i_01 / 0 cmp tensor_1 )
/// }
/// and after optimization, the lattice points do not change (as there is no
/// duplicated point and all input vectors are sparse vectors).
/// {
/// lat( i_00 i_01 / (tensor_0 cmp tensor_1) )
/// lat( i_00 / tensor_0 cmp 0 )
/// lat( i_01 / 0 cmp tensor_1 )
/// }
TEST_F(MergerTest3T1L, vector_cmp) {
const auto e = cmpiExpr(tensor(0), tensor(1));
const auto l0 = lid(0);
const auto t0 = tid(0);
const auto t1 = tid(1);
PatternRef zero = synZeroPattern();
PatternRef p0 = tensorPattern(t0);
PatternRef p1 = tensorPattern(t1);
auto s = merger.buildLattices(e, l0);
expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
loopsToBits({{l0, t0}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
loopsToBits({{l0, t1}}));
s = merger.optimizeSet(s);
expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
loopsToBits({{l0, t0}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
loopsToBits({{l0, t1}}));
}

/// Vector element-wise comparison (disjunction) of 2 vectors, i.e.,
/// a(i) = b(i) cmp c(i)
/// which should form the 3 lattice points
/// {
/// lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) )
/// lat( i_00 / sparse_tensor_0 cmp 0)
/// lat( i_01 / 0 cmp dense_tensor_1 )
/// }
/// which should be optimized to
/// {
/// lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ) (not singleton)
/// lat( i_01 / 0 cmp dense_tensor_1 ) ()
/// }
///
/// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff
/// with lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ).
TEST_F(MergerTest3T1LD, vector_cmp) {
const auto e = cmpiExpr(tensor(0), tensor(1));
const auto l0 = lid(0);
const auto t0 = tid(0);
const auto t1 = tid(1);
PatternRef zero = synZeroPattern();
PatternRef p0 = tensorPattern(t0);
PatternRef p1 = tensorPattern(t1);
auto s = merger.buildLattices(e, l0);
expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero),
loopsToBits({{l0, t0}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
loopsToBits({{l0, t1}}));
s = merger.optimizeSet(s);
expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}));
expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1),
loopsToBits({{l0, t1}}));
}
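
For concreteness, the kernel shape these two unit tests model, a(i) = b(i) cmp c(i) over sparse inputs, could be written as the hypothetical linalg kernel below. The names #SV, #trait_cmp, and @vector_cmp are made up for illustration only; for the cmpi expression in such a kernel the merger would derive the lattice points listed in the comments above.

#SV = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>

#trait_cmp = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // b
    affine_map<(i) -> (i)>,  // c
    affine_map<(i) -> (i)>   // a (out)
  ],
  iterator_types = ["parallel"]
}

// Hypothetical kernel: element-wise unsigned "less than" of two vectors.
func.func @vector_cmp(%b: tensor<?xi32, #SV>, %c: tensor<?xi32, #SV>,
                      %a: tensor<?xi1>) -> tensor<?xi1> {
  %0 = linalg.generic #trait_cmp
    ins(%b, %c: tensor<?xi32, #SV>, tensor<?xi32, #SV>)
    outs(%a: tensor<?xi1>) {
    ^bb(%x: i32, %y: i32, %z: i1):
      %cmp = arith.cmpi ult, %x, %y : i32
      linalg.yield %cmp : i1
  } -> tensor<?xi1>
  return %0 : tensor<?xi1>
}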

#undef IMPL_MERGER_TEST_OPTIMIZED_CONJ

// TODO: mult-dim tests