[sparse][mlir] give all sparse kernels an explicit "output" tensor

Rationale:
Providing an output tensor, even if one is not used as an input to
the kernel, provides the right pattern for using linalg sparse
kernels (in contrast with reusing an input tensor just to provide the shape).
This prepares for the proper bufferization that will follow.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D95587
Aart Bik 2021-01-28 10:24:33 -08:00
parent eae50bb210
commit 8af0ccf5a4
8 changed files with 2273 additions and 2224 deletions
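
To make the intended pattern concrete, here is a minimal sketch in the style of the updated tests below; the #trait_scale name and its exact contents are illustrative rather than copied verbatim from the patch. The kernel takes a dedicated output tensor %argx and passes it to outs, instead of reusing the input %arga just to provide the shape:

#trait_scale = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a (in)
    affine_map<(i) -> (i)>   // x (out)
  ],
  sparse = [
    [ "S" ],  // a: sparse vector
    [ "D" ]   // x: dense output
  ],
  iterator_types = ["parallel"],
  doc = "x(i) = a(i) * scale"
}

// The output tensor %argx is never read by the kernel body; it only supplies
// the shape and a proper outs operand, which prepares for the bufferization
// mentioned in the rationale above.
func @scale(%arga: tensor<1024xf32>, %scale: f32,
            %argx: tensor<1024xf32>) -> tensor<1024xf32> {
  %0 = linalg.generic #trait_scale
    ins(%arga: tensor<1024xf32>)
    outs(%argx: tensor<1024xf32>) {
      ^bb(%a: f32, %x: f32):
        %0 = mulf %a, %scale : f32
        linalg.yield %0 : f32
  } -> tensor<1024xf32>
  return %0 : tensor<1024xf32>
}

With a shape-only output operand like this, none of the kernels touched below need to alias an input tensor in their outs clause anymore.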

File diffs suppressed because they are too large (3 of the 8 changed files).

@@ -9,18 +9,17 @@
[ "S" ], // a
[ "D" ] // x
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %shape: tensor<32xf32>)
-> tensor<32xf32>
{
func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotations on tensors only}}
%0 = linalg.generic #trait_memref
ins(%arga: memref<32xf32>)
outs(%shape: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -38,16 +37,17 @@ func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %shape: tensor<32xf32>)
[ "S" ], // b
[ "D" ] // x
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected one sparse annotation for each tensor}}
%0 = linalg.generic #trait_too_many
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -61,16 +61,17 @@ func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
affine_map<(i) -> (i)> // x (out)
],
sparse = [ 1, 2 ],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation array for tensor 0}}
%0 = linalg.generic #trait_no_array
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -87,16 +88,17 @@ func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
[ "S" ],
[ "D", "D" ]
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation with rank 1 for tensor 1}}
%0 = linalg.generic #trait_wrong_rank
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -113,16 +115,17 @@ func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
[ "S", 1 ],
[ "D", "D" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation at position 1 for tensor 0}}
%0 = linalg.generic #trait_no_string
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
@@ -139,16 +142,17 @@ func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf3
[ "S", "S" ],
[ "D", "X" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation at position 1 for tensor 1}}
%0 = linalg.generic #trait_wrong_symbol
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
@@ -165,16 +169,17 @@ func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16
[ "S", "S" ],
[ "D", "S" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_no_sparse_output(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_no_sparse_output(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op sparse output tensors not supported (yet)}}
%0 = linalg.generic #trait_no_sparse_output
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>


@@ -20,54 +20,55 @@
}
// CHECK-LABEL: func @mul(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK: %[[VAL_2:.*]] = constant 999 : index
// CHECK: %[[VAL_3:.*]] = constant 100 : index
// CHECK: %[[VAL_4:.*]] = constant 200 : index
// CHECK: %[[VAL_5:.*]] = constant 300 : index
// CHECK: %[[VAL_6:.*]] = constant 600 : index
// CHECK: %[[VAL_7:.*]] = constant 700 : index
// CHECK: %[[VAL_8:.*]] = constant 800 : index
// CHECK: %[[VAL_9:.*]] = constant 0 : index
// CHECK: %[[VAL_10:.*]] = constant 1 : index
// CHECK: %[[VAL_11:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_12:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
// CHECK: %[[VAL_17:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_9]] to %[[VAL_8]] step %[[VAL_10]] {
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_10]] {
// CHECK: %[[VAL_20:.*]] = muli %[[VAL_18]], %[[VAL_7]] : index
// CHECK: %[[VAL_21:.*]] = addi %[[VAL_20]], %[[VAL_19]] : index
// CHECK: scf.for %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_6]] step %[[VAL_10]] {
// CHECK: %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_6]] : index
// CHECK: %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
// CHECK: %[[VAL_25:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
// CHECK: %[[VAL_26:.*]] = addi %[[VAL_24]], %[[VAL_10]] : index
// CHECK: %[[VAL_27:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_28:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_10]] {
// CHECK: %[[VAL_29:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = addi %[[VAL_28]], %[[VAL_10]] : index
// CHECK: %[[VAL_32:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_33:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_10]] {
// CHECK: %[[VAL_34:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_33]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_35:.*]] = %[[VAL_9]] to %[[VAL_5]] step %[[VAL_10]] {
// CHECK: %[[VAL_36:.*]] = muli %[[VAL_33]], %[[VAL_5]] : index
// CHECK: %[[VAL_37:.*]] = addi %[[VAL_36]], %[[VAL_35]] : index
// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_9]] to %[[VAL_4]] step %[[VAL_10]] {
// CHECK: %[[VAL_39:.*]] = muli %[[VAL_37]], %[[VAL_4]] : index
// CHECK: %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
// CHECK: scf.for %[[VAL_41:.*]] = %[[VAL_9]] to %[[VAL_3]] step %[[VAL_10]] {
// CHECK: %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_3]] : index
// CHECK: %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
// CHECK: %[[VAL_44:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_45:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_43]]] : memref<?xf32>
// CHECK: %[[VAL_46:.*]] = mulf %[[VAL_44]], %[[VAL_45]] : f32
// CHECK: store %[[VAL_46]], %[[VAL_17]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_2:.*2]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK: %[[VAL_3:.*]] = constant 999 : index
// CHECK: %[[VAL_4:.*]] = constant 100 : index
// CHECK: %[[VAL_5:.*]] = constant 200 : index
// CHECK: %[[VAL_6:.*]] = constant 300 : index
// CHECK: %[[VAL_7:.*]] = constant 600 : index
// CHECK: %[[VAL_8:.*]] = constant 700 : index
// CHECK: %[[VAL_9:.*]] = constant 800 : index
// CHECK: %[[VAL_10:.*]] = constant 0 : index
// CHECK: %[[VAL_11:.*]] = constant 1 : index
// CHECK: %[[VAL_12:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
// CHECK: %[[VAL_18:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_10]] to %[[VAL_9]] step %[[VAL_11]] {
// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_10]] to %[[VAL_8]] step %[[VAL_11]] {
// CHECK: %[[VAL_21:.*]] = muli %[[VAL_19]], %[[VAL_8]] : index
// CHECK: %[[VAL_22:.*]] = addi %[[VAL_21]], %[[VAL_20]] : index
// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_10]] to %[[VAL_7]] step %[[VAL_11]] {
// CHECK: %[[VAL_24:.*]] = muli %[[VAL_22]], %[[VAL_7]] : index
// CHECK: %[[VAL_25:.*]] = addi %[[VAL_24]], %[[VAL_23]] : index
// CHECK: %[[VAL_26:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_25]]] : memref<?xindex>
// CHECK: %[[VAL_27:.*]] = addi %[[VAL_25]], %[[VAL_11]] : index
// CHECK: %[[VAL_28:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_27]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_26]] to %[[VAL_28]] step %[[VAL_11]] {
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_32:.*]] = addi %[[VAL_29]], %[[VAL_11]] : index
// CHECK: %[[VAL_33:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_32]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_34:.*]] = %[[VAL_31]] to %[[VAL_33]] step %[[VAL_11]] {
// CHECK: %[[VAL_35:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_34]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_10]] to %[[VAL_6]] step %[[VAL_11]] {
// CHECK: %[[VAL_37:.*]] = muli %[[VAL_34]], %[[VAL_6]] : index
// CHECK: %[[VAL_38:.*]] = addi %[[VAL_37]], %[[VAL_36]] : index
// CHECK: scf.for %[[VAL_39:.*]] = %[[VAL_10]] to %[[VAL_5]] step %[[VAL_11]] {
// CHECK: %[[VAL_40:.*]] = muli %[[VAL_38]], %[[VAL_5]] : index
// CHECK: %[[VAL_41:.*]] = addi %[[VAL_40]], %[[VAL_39]] : index
// CHECK: scf.for %[[VAL_42:.*]] = %[[VAL_10]] to %[[VAL_4]] step %[[VAL_11]] {
// CHECK: %[[VAL_43:.*]] = muli %[[VAL_41]], %[[VAL_4]] : index
// CHECK: %[[VAL_44:.*]] = addi %[[VAL_43]], %[[VAL_42]] : index
// CHECK: %[[VAL_45:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_46:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_44]]] : memref<?xf32>
// CHECK: %[[VAL_47:.*]] = mulf %[[VAL_45]], %[[VAL_46]] : f32
// CHECK: store %[[VAL_47]], %[[VAL_18]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: }
// CHECK: }
// CHECK: }
@@ -76,17 +77,18 @@
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_47:.*]] = tensor_load %[[VAL_17]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_47]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_48:.*]] = tensor_load %[[VAL_18]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_48]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: }
func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
%argb: tensor<100x200x300x400x500x600x700x800xf32>)
%argb: tensor<100x200x300x400x500x600x700x800xf32>,
%argx: tensor<100x200x300x400x500x600x700x800xf32>)
-> tensor<100x200x300x400x500x600x700x800xf32> {
%0 = linalg.generic #trait_mul
ins(%arga, %argb: tensor<100x200x300x400x500x600x700x800xf32>,
tensor<100x200x300x400x500x600x700x800xf32>)
outs(%arga: tensor<100x200x300x400x500x600x700x800xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<100x200x300x400x500x600x700x800xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<100x200x300x400x500x600x700x800xf32>


@@ -48,12 +48,12 @@
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>, %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_dd
ins(%arga: tensor<?x?xf32>)
outs(%arga: tensor<?x?xf32>) {
^bb(%a: f32, %s: f32):
%0 = mulf %a, %scale : f32
outs(%argx: tensor<?x?xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
@@ -98,12 +98,12 @@ func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>, %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_ss
ins(%arga: tensor<?x?xf32>)
outs(%arga: tensor<?x?xf32>) {
^bb(%a: f32, %s: f32):
%0 = mulf %a, %scale : f32
outs(%argx: tensor<?x?xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
@@ -153,7 +153,7 @@ func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @matvec(%argA: tensor<16x32xf32>, %argb: tensor<32xf32>, %argx: tensor<16xf32>) -> tensor<16xf32> {
%0 = linalg.generic #trait_matvec
ins(%argA, %argb : tensor<16x32xf32>, tensor<32xf32>)
outs(%argx : tensor<16xf32>) {
outs(%argx: tensor<16xf32>) {
^bb(%A: f32, %b: f32, %x: f32):
%0 = mulf %A, %b : f32
%1 = addf %0, %x : f32


@@ -122,14 +122,13 @@
// CHECK-TYPE5: store %[[MUL]], %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE5: }
func @mul_dd(%arga: tensor<32xf64>, %argb: tensor<32xf64>) -> tensor<32xf64> {
func @mul_dd(%arga: tensor<32xf64>, %argb: tensor<32xf64>, %argx: tensor<32xf64>) -> tensor<32xf64> {
%0 = linalg.generic #trait_mul_1d
ins(%arga, %argb: tensor<32xf64>, tensor<32xf64>)
outs(%arga : tensor<32xf64>) {
^bb(%a: f64, %b: f64, %s: f64):
%0 = mulf %a, %b : f64
outs(%argx: tensor<32xf64>) {
^bb(%a: f64, %b: f64, %x: f64):
%0 = mulf %a, %b : f64
linalg.yield %0 : f64
} -> tensor<32xf64>
return %0 : tensor<32xf64>
}


@@ -54,11 +54,11 @@
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @scale_d(%arga: tensor<1024xf32>, %scale: f32) -> tensor<1024xf32> {
func @scale_d(%arga: tensor<1024xf32>, %scale: f32, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_scale_d
ins(%arga: tensor<1024xf32>)
outs(%arga: tensor<1024xf32>) {
^bb(%a: f32, %s : f32):
outs(%argx: tensor<1024xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<1024xf32>
@@ -134,11 +134,11 @@ func @scale_d(%arga: tensor<1024xf32>, %scale: f32) -> tensor<1024xf32> {
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>) -> tensor<1024xf32> {
func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_mul_s
ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
outs(%arga: tensor<1024xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<1024xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<1024xf32>
@@ -208,7 +208,7 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
%0 = linalg.generic #trait_reduction_d
ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
outs(%argx: tensor<f32>) {
^bb(%a: f32, %b : f32, %x : f32):
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
%1 = addf %x, %0 : f32
linalg.yield %1 : f32
@@ -288,8 +288,8 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
// CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
// CHECK-VEC2: vector.scatter %{{.*}}[%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
@@ -297,11 +297,11 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @mul_ds(%arga: tensor<512x1024xf32>, %argb: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
func @mul_ds(%arga: tensor<512x1024xf32>, %argb: tensor<512x1024xf32>, %argx: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
%0 = linalg.generic #trait_mul_ds
ins(%arga, %argb: tensor<512x1024xf32>, tensor<512x1024xf32>)
outs(%arga: tensor<512x1024xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<512x1024xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<512x1024xf32>