[sparse][mlir] give all sparse kernels an explicit "output" tensor

Rationale:
Providing an output tensor, even if one is not used as an input to
the kernel, provides the right pattern for using linalg sparse
kernels (in contrast with reusing an input tensor just to provide the shape).
This prepares for the proper bufferization that will follow.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D95587
Aart Bik 2021-01-28 10:24:33 -08:00
parent eae50bb210
commit 8af0ccf5a4
8 changed files with 2273 additions and 2224 deletions
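
To make the intended pattern concrete, here is a minimal sketch in the style of the updated tests below; the #trait_scale name and its exact contents are illustrative rather than copied verbatim from the patch. The kernel takes a dedicated output tensor %argx and passes it to outs, instead of reusing the input %arga just to provide the shape:

#trait_scale = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a (in)
    affine_map<(i) -> (i)>   // x (out)
  ],
  sparse = [
    [ "S" ],  // a: sparse vector
    [ "D" ]   // x: dense output
  ],
  iterator_types = ["parallel"],
  doc = "x(i) = a(i) * scale"
}

// The output tensor %argx is never read by the kernel body; it only supplies
// the shape and a proper outs operand, which prepares for the bufferization
// mentioned in the rationale above.
func @scale(%arga: tensor<1024xf32>, %scale: f32,
            %argx: tensor<1024xf32>) -> tensor<1024xf32> {
  %0 = linalg.generic #trait_scale
    ins(%arga: tensor<1024xf32>)
    outs(%argx: tensor<1024xf32>) {
      ^bb(%a: f32, %x: f32):
        %0 = mulf %a, %scale : f32
        linalg.yield %0 : f32
  } -> tensor<1024xf32>
  return %0 : tensor<1024xf32>
}

With a shape-only output operand like this, none of the kernels touched below need to alias an input tensor in their outs clause anymore.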

File diffs suppressed because they are too large (3 of the 8 changed files).

@@ -9,18 +9,17 @@
[ "S" ], // a
[ "D" ] // x
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %shape: tensor<32xf32>)
-> tensor<32xf32>
{
func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotations on tensors only}}
%0 = linalg.generic #trait_memref
ins(%arga: memref<32xf32>)
outs(%shape: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -38,16 +37,17 @@ func @invalid_memref(%arga: memref<32xf32>, %argb: f32, %shape: tensor<32xf32>)
[ "S" ], // b
[ "D" ] // x
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected one sparse annotation for each tensor}}
%0 = linalg.generic #trait_too_many
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -61,16 +61,17 @@ func @invalid_too_many(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
affine_map<(i) -> (i)> // x (out)
],
sparse = [ 1, 2 ],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation array for tensor 0}}
%0 = linalg.generic #trait_no_array
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -87,16 +88,17 @@ func @invalid_no_array(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
[ "S" ],
[ "D", "D" ]
],
iterator_types = ["parallel"]
iterator_types = ["parallel"],
doc = "x(i) = a(i) + b"
}
func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32, %argx: tensor<32xf32>) -> tensor<32xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation with rank 1 for tensor 1}}
%0 = linalg.generic #trait_wrong_rank
ins(%arga: tensor<32xf32>)
outs(%arga: tensor<32xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32xf32>
return %0 : tensor<32xf32>
@@ -113,16 +115,17 @@ func @invalid_wrong_rank(%arga: tensor<32xf32>, %argb: f32) -> tensor<32xf32> {
[ "S", 1 ],
[ "D", "D" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation at position 1 for tensor 0}}
%0 = linalg.generic #trait_no_string
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
@@ -139,16 +142,17 @@ func @invalid_no_string(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf3
[ "S", "S" ],
[ "D", "X" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op expected sparse annotation at position 1 for tensor 1}}
%0 = linalg.generic #trait_wrong_symbol
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
@@ -165,16 +169,17 @@ func @invalid_wrong_symbol(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16
[ "S", "S" ],
[ "D", "S" ]
],
iterator_types = ["parallel","parallel"]
iterator_types = ["parallel","parallel"],
doc = "x(i,j) = a(i,j) + b"
}
func @invalid_no_sparse_output(%arga: tensor<32x16xf32>, %argb: f32) -> tensor<32x16xf32> {
func @invalid_no_sparse_output(%arga: tensor<32x16xf32>, %argb: f32, %argx: tensor<32x16xf32>) -> tensor<32x16xf32> {
// expected-error@+1 {{'linalg.generic' op sparse output tensors not supported (yet)}}
%0 = linalg.generic #trait_no_sparse_output
ins(%arga: tensor<32x16xf32>)
outs(%arga: tensor<32x16xf32>) {
^bb(%a: f32, %s: f32):
%0 = addf %a, %argb : f32
outs(%argx: tensor<32x16xf32>) {
^bb(%a: f32, %x: f32):
%0 = addf %a, %argb : f32
linalg.yield %0 : f32
} -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>


@@ -20,54 +20,55 @@
}
// CHECK-LABEL: func @mul(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK: %[[VAL_2:.*]] = constant 999 : index
// CHECK: %[[VAL_3:.*]] = constant 100 : index
// CHECK: %[[VAL_4:.*]] = constant 200 : index
// CHECK: %[[VAL_5:.*]] = constant 300 : index
// CHECK: %[[VAL_6:.*]] = constant 600 : index
// CHECK: %[[VAL_7:.*]] = constant 700 : index
// CHECK: %[[VAL_8:.*]] = constant 800 : index
// CHECK: %[[VAL_9:.*]] = constant 0 : index
// CHECK: %[[VAL_10:.*]] = constant 1 : index
// CHECK: %[[VAL_11:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_12:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
// CHECK: %[[VAL_17:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_9]] to %[[VAL_8]] step %[[VAL_10]] {
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_10]] {
// CHECK: %[[VAL_20:.*]] = muli %[[VAL_18]], %[[VAL_7]] : index
// CHECK: %[[VAL_21:.*]] = addi %[[VAL_20]], %[[VAL_19]] : index
// CHECK: scf.for %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_6]] step %[[VAL_10]] {
// CHECK: %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_6]] : index
// CHECK: %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
// CHECK: %[[VAL_25:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
// CHECK: %[[VAL_26:.*]] = addi %[[VAL_24]], %[[VAL_10]] : index
// CHECK: %[[VAL_27:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_28:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_10]] {
// CHECK: %[[VAL_29:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = addi %[[VAL_28]], %[[VAL_10]] : index
// CHECK: %[[VAL_32:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_33:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_10]] {
// CHECK: %[[VAL_34:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_33]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_35:.*]] = %[[VAL_9]] to %[[VAL_5]] step %[[VAL_10]] {
// CHECK: %[[VAL_36:.*]] = muli %[[VAL_33]], %[[VAL_5]] : index
// CHECK: %[[VAL_37:.*]] = addi %[[VAL_36]], %[[VAL_35]] : index
// CHECK: scf.for %[[VAL_38:.*]] = %[[VAL_9]] to %[[VAL_4]] step %[[VAL_10]] {
// CHECK: %[[VAL_39:.*]] = muli %[[VAL_37]], %[[VAL_4]] : index
// CHECK: %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
// CHECK: scf.for %[[VAL_41:.*]] = %[[VAL_9]] to %[[VAL_3]] step %[[VAL_10]] {
// CHECK: %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_3]] : index
// CHECK: %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
// CHECK: %[[VAL_44:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_45:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_43]]] : memref<?xf32>
// CHECK: %[[VAL_46:.*]] = mulf %[[VAL_44]], %[[VAL_45]] : f32
// CHECK: store %[[VAL_46]], %[[VAL_17]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_1:.*1]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME: %[[VAL_2:.*2]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK: %[[VAL_3:.*]] = constant 999 : index
// CHECK: %[[VAL_4:.*]] = constant 100 : index
// CHECK: %[[VAL_5:.*]] = constant 200 : index
// CHECK: %[[VAL_6:.*]] = constant 300 : index
// CHECK: %[[VAL_7:.*]] = constant 600 : index
// CHECK: %[[VAL_8:.*]] = constant 700 : index
// CHECK: %[[VAL_9:.*]] = constant 800 : index
// CHECK: %[[VAL_10:.*]] = constant 0 : index
// CHECK: %[[VAL_11:.*]] = constant 1 : index
// CHECK: %[[VAL_12:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_13:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_16:.*]] = alloca(%[[VAL_3]]) : memref<?xindex>
// CHECK: %[[VAL_17:.*]] = alloca(%[[VAL_3]]) : memref<?xf32>
// CHECK: %[[VAL_18:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_10]] to %[[VAL_9]] step %[[VAL_11]] {
// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_10]] to %[[VAL_8]] step %[[VAL_11]] {
// CHECK: %[[VAL_21:.*]] = muli %[[VAL_19]], %[[VAL_8]] : index
// CHECK: %[[VAL_22:.*]] = addi %[[VAL_21]], %[[VAL_20]] : index
// CHECK: scf.for %[[VAL_23:.*]] = %[[VAL_10]] to %[[VAL_7]] step %[[VAL_11]] {
// CHECK: %[[VAL_24:.*]] = muli %[[VAL_22]], %[[VAL_7]] : index
// CHECK: %[[VAL_25:.*]] = addi %[[VAL_24]], %[[VAL_23]] : index
// CHECK: %[[VAL_26:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_25]]] : memref<?xindex>
// CHECK: %[[VAL_27:.*]] = addi %[[VAL_25]], %[[VAL_11]] : index
// CHECK: %[[VAL_28:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_27]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_26]] to %[[VAL_28]] step %[[VAL_11]] {
// CHECK: %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_31:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_29]]] : memref<?xindex>
// CHECK: %[[VAL_32:.*]] = addi %[[VAL_29]], %[[VAL_11]] : index
// CHECK: %[[VAL_33:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_32]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_34:.*]] = %[[VAL_31]] to %[[VAL_33]] step %[[VAL_11]] {
// CHECK: %[[VAL_35:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_34]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_10]] to %[[VAL_6]] step %[[VAL_11]] {
// CHECK: %[[VAL_37:.*]] = muli %[[VAL_34]], %[[VAL_6]] : index
// CHECK: %[[VAL_38:.*]] = addi %[[VAL_37]], %[[VAL_36]] : index
// CHECK: scf.for %[[VAL_39:.*]] = %[[VAL_10]] to %[[VAL_5]] step %[[VAL_11]] {
// CHECK: %[[VAL_40:.*]] = muli %[[VAL_38]], %[[VAL_5]] : index
// CHECK: %[[VAL_41:.*]] = addi %[[VAL_40]], %[[VAL_39]] : index
// CHECK: scf.for %[[VAL_42:.*]] = %[[VAL_10]] to %[[VAL_4]] step %[[VAL_11]] {
// CHECK: %[[VAL_43:.*]] = muli %[[VAL_41]], %[[VAL_4]] : index
// CHECK: %[[VAL_44:.*]] = addi %[[VAL_43]], %[[VAL_42]] : index
// CHECK: %[[VAL_45:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_46:.*]] = load %[[VAL_17]]{{\[}}%[[VAL_44]]] : memref<?xf32>
// CHECK: %[[VAL_47:.*]] = mulf %[[VAL_45]], %[[VAL_46]] : f32
// CHECK: store %[[VAL_47]], %[[VAL_18]]{{\[}}%[[VAL_42]], %[[VAL_39]], %[[VAL_36]], %[[VAL_35]], %[[VAL_30]], %[[VAL_23]], %[[VAL_20]], %[[VAL_19]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: }
// CHECK: }
// CHECK: }
@@ -76,17 +77,18 @@
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: %[[VAL_47:.*]] = tensor_load %[[VAL_17]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_47]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: %[[VAL_48:.*]] = tensor_load %[[VAL_18]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK: return %[[VAL_48]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: }
func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
%argb: tensor<100x200x300x400x500x600x700x800xf32>)
%argb: tensor<100x200x300x400x500x600x700x800xf32>,
%argx: tensor<100x200x300x400x500x600x700x800xf32>)
-> tensor<100x200x300x400x500x600x700x800xf32> {
%0 = linalg.generic #trait_mul
ins(%arga, %argb: tensor<100x200x300x400x500x600x700x800xf32>,
tensor<100x200x300x400x500x600x700x800xf32>)
outs(%arga: tensor<100x200x300x400x500x600x700x800xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<100x200x300x400x500x600x700x800xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<100x200x300x400x500x600x700x800xf32>


@@ -48,12 +48,12 @@
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>, %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_dd
ins(%arga: tensor<?x?xf32>)
outs(%arga: tensor<?x?xf32>) {
^bb(%a: f32, %s: f32):
%0 = mulf %a, %scale : f32
outs(%argx: tensor<?x?xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
@@ -98,12 +98,12 @@ func @scale_dd(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: return
//
func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>, %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.generic #trait_ss
ins(%arga: tensor<?x?xf32>)
outs(%arga: tensor<?x?xf32>) {
^bb(%a: f32, %s: f32):
%0 = mulf %a, %scale : f32
outs(%argx: tensor<?x?xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
@@ -153,7 +153,7 @@ func @scale_ss(%scale: f32, %arga: tensor<?x?xf32>) -> tensor<?x?xf32> {
func @matvec(%argA: tensor<16x32xf32>, %argb: tensor<32xf32>, %argx: tensor<16xf32>) -> tensor<16xf32> {
%0 = linalg.generic #trait_matvec
ins(%argA, %argb : tensor<16x32xf32>, tensor<32xf32>)
outs(%argx : tensor<16xf32>) {
outs(%argx: tensor<16xf32>) {
^bb(%A: f32, %b: f32, %x: f32):
%0 = mulf %A, %b : f32
%1 = addf %0, %x : f32


@@ -122,14 +122,13 @@
// CHECK-TYPE5: store %[[MUL]], %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE5: }
func @mul_dd(%arga: tensor<32xf64>, %argb: tensor<32xf64>) -> tensor<32xf64> {
func @mul_dd(%arga: tensor<32xf64>, %argb: tensor<32xf64>, %argx: tensor<32xf64>) -> tensor<32xf64> {
%0 = linalg.generic #trait_mul_1d
ins(%arga, %argb: tensor<32xf64>, tensor<32xf64>)
outs(%arga : tensor<32xf64>) {
^bb(%a: f64, %b: f64, %s: f64):
%0 = mulf %a, %b : f64
outs(%argx: tensor<32xf64>) {
^bb(%a: f64, %b: f64, %x: f64):
%0 = mulf %a, %b : f64
linalg.yield %0 : f64
} -> tensor<32xf64>
return %0 : tensor<32xf64>
}


@@ -54,11 +54,11 @@
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @scale_d(%arga: tensor<1024xf32>, %scale: f32) -> tensor<1024xf32> {
func @scale_d(%arga: tensor<1024xf32>, %scale: f32, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_scale_d
ins(%arga: tensor<1024xf32>)
outs(%arga: tensor<1024xf32>) {
^bb(%a: f32, %s : f32):
outs(%argx: tensor<1024xf32>) {
^bb(%a: f32, %x: f32):
%0 = mulf %a, %scale : f32
linalg.yield %0 : f32
} -> tensor<1024xf32>
@@ -134,11 +134,11 @@ func @scale_d(%arga: tensor<1024xf32>, %scale: f32) -> tensor<1024xf32> {
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>) -> tensor<1024xf32> {
func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_mul_s
ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
outs(%arga: tensor<1024xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<1024xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<1024xf32>
@@ -208,7 +208,7 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
%0 = linalg.generic #trait_reduction_d
ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
outs(%argx: tensor<f32>) {
^bb(%a: f32, %b : f32, %x : f32):
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
%1 = addf %x, %0 : f32
linalg.yield %1 : f32
@@ -288,8 +288,8 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
// CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%arg3], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
// CHECK-VEC2: vector.scatter %{{.*}}[%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
@@ -297,11 +297,11 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
// CHECK-VEC2: }
// CHECK-VEC2: return
//
func @mul_ds(%arga: tensor<512x1024xf32>, %argb: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
func @mul_ds(%arga: tensor<512x1024xf32>, %argb: tensor<512x1024xf32>, %argx: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
%0 = linalg.generic #trait_mul_ds
ins(%arga, %argb: tensor<512x1024xf32>, tensor<512x1024xf32>)
outs(%arga: tensor<512x1024xf32>) {
^bb(%a: f32, %b: f32, %s : f32):
outs(%argx: tensor<512x1024xf32>) {
^bb(%a: f32, %b: f32, %x: f32):
%0 = mulf %a, %b : f32
linalg.yield %0 : f32
} -> tensor<512x1024xf32>