Fixing copypasta in codegen/flow tests that included ABI ops. #16140

Merged · 1 commit · Jan 17, 2024
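Every hunk below follows the same pattern: a test that was copy-pasted from full compiler output dragged in the HAL ABI wrapper ops (!hal.buffer_view arguments, hal.tensor.import/export, the iree.abi.stub attribute), none of which matter to the codegen/flow pass under test. The fix rewrites each test against a plain tensor-typed signature. A minimal sketch of the before/after shape (the @example function and its arith.negf payload are placeholders, not code from this PR):

// Before: copied from end-to-end compiler output; the ABI plumbing is
// noise for the pass under test.
func.func @example(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
  %1 = arith.negf %0 : tensor<4xf32>
  %2 = hal.tensor.export %1 : tensor<4xf32> -> !hal.buffer_view
  return %2 : !hal.buffer_view
}

// After: the test passes tensors directly and exercises only the
// computation of interest.
func.func @example(%arg0: tensor<4xf32>) -> tensor<4xf32> {
  %0 = arith.negf %arg0 : tensor<4xf32>
  return %0 : tensor<4xf32>
}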
@@ -51,33 +51,6 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
     }
   }
 }
-func.func @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
-  %c1310720 = arith.constant 1310720 : index
-  %c5242880 = arith.constant 5242880 : index
-  %c13107200 = arith.constant 13107200 : index
-  %c0 = arith.constant 0 : index
-  %c320 = arith.constant 320 : index
-  %c553648160_i32 = arith.constant 553648160 : i32
-  %c1_i32 = arith.constant 1 : i32
-  %c128 = arith.constant 128 : index
-  %c80 = arith.constant 80 : index
-  %c32 = arith.constant 32 : index
-  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input 0") shape([%c128, %c80, %c32]) type(%c553648160_i32) encoding(%c1_i32)
-  %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<128x80x32xf32> in !stream.resource<external>{%c1310720}
-  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input 1") shape([%c128, %c32, %c320]) type(%c553648160_i32) encoding(%c1_i32)
-  %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<128x32x320xf32> in !stream.resource<external>{%c5242880}
-  %2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c13107200}
-  %3 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c1310720}, %1 as %arg4: !stream.resource<external>{%c5242880}, %2 as %arg5: !stream.resource<external>{%c13107200}) {
-    stream.cmd.dispatch @batch_matmul_dispatch_0::@cuda_nvptx_fb::@batch_matmul_dispatch_0_generic_128x80x320x32_f32 {
-      ro %arg3[%c0 for %c1310720] : !stream.resource<external>{%c1310720},
-      ro %arg4[%c0 for %c5242880] : !stream.resource<external>{%c5242880},
-      wo %arg5[%c0 for %c13107200] : !stream.resource<external>{%c13107200}
-    } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]}
-  } => !stream.timepoint
-  %4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c13107200}
-  %5 = stream.tensor.export %4 : tensor<128x80x320xf32> in !stream.resource<external>{%c13107200} -> !hal.buffer_view
-  return %5 : !hal.buffer_view
-}
 }

@@ -96,17 +96,14 @@ func.func @complex_create(%real : f32, %imag : f32, %input: tensor<4x2xcomplex<f
 
 // -----
 
-#map = affine_map<() -> ()>
-func.func @use_in_dispatch_count(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view {
+func.func @use_in_dispatch_count(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<i32> {
   %c1 = arith.constant 1 : index
   %c2_i32 = arith.constant 2 : i32
   %c0 = arith.constant 0 : index
-  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<1xi32>
-  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<1xi32>
   %2 = tensor.empty() : tensor<i32>
-  %extracted = tensor.extract %0[%c1] : tensor<1xi32>
+  %extracted = tensor.extract %arg0[%c1] : tensor<1xi32>
   %4 = flow.dispatch.region -> (tensor<i32>) {
-    %6 = linalg.generic {indexing_maps = [#map], iterator_types = []} outs(%2 : tensor<i32>) {
+    %6 = linalg.generic {indexing_maps = [affine_map<() -> ()>], iterator_types = []} outs(%2 : tensor<i32>) {
     ^bb0(%out: i32):
       %7 = arith.addi %extracted, %c2_i32 : i32
       linalg.yield %7 : i32
@@ -115,8 +112,7 @@
   } count() -> (index, index, index) {
     flow.return %c1, %c1, %c1 : index, index, index
   }
-  %5 = hal.tensor.export %4 "output 0" : tensor<i32> -> !hal.buffer_view
-  return %5 : !hal.buffer_view
+  return %4 : tensor<i32>
 }
 
@@ -194,16 +190,16 @@ module {
 func.func @clone_dequantization_like(%arg0: tensor<32x1x16x1x8xi16>, %arg1: tensor<32x344x16x32x8xi4>) -> tensor<32x1x344x1x32xi32> {
   %c0_i32 = arith.constant 0 : i32
   %0 = tensor.empty() : tensor<32x1x16x1x8xi32>
-  %1 = linalg.generic {indexing_maps = [#map, #map],
-      iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
+  %1 = linalg.generic {indexing_maps = [#map, #map],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
   ins(%arg0 : tensor<32x1x16x1x8xi16>) outs(%0 : tensor<32x1x16x1x8xi32>) {
   ^bb0(%in: i16, %out: i32):
     %7 = arith.extsi %in : i16 to i32
     linalg.yield %7 : i32
   } -> tensor<32x1x16x1x8xi32>
   %2 = tensor.empty() : tensor<32x344x16x32x8xi32>
-  %3 = linalg.generic {indexing_maps = [#map, #map],
-      iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
+  %3 = linalg.generic {indexing_maps = [#map, #map],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
   ins(%arg1 : tensor<32x344x16x32x8xi4>) outs(%2 : tensor<32x344x16x32x8xi32>) {
   ^bb0(%in: i4, %out: i32):
     %7 = arith.extui %in : i4 to i32

@@ -459,24 +459,22 @@ func.func @collapse12() -> (!type,!type,!type,!type) {
 
 // -----
 
-func.func @multi_reduce_dim(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+func.func @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
-  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x32x10x4096xf32>
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
-  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%arg0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
   ^bb0(%arg1: f32, %arg2: f32):
     %6 = arith.addf %arg1, %arg2 : f32
     linalg.yield %6 : f32
   } -> tensor<2x32xf32>
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  %5 = hal.tensor.export %4 : tensor<2x32x1x1xf32> -> !hal.buffer_view
-  return %5 : !hal.buffer_view
+  return %4 : tensor<2x32x1x1xf32>
 }
 
 // Check that we collapse dimensions.
-// CHECK-LABEL: @multi_reduce_dim(
-// CHECK-DAG: %[[ARG0:.+]] = hal.tensor.import
+// CHECK-LABEL: @multi_reduce_dim
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<2x32x10x4096xf32>)
 // CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}}
 // CHECK: %[[DISPATCH:.+]] = flow.dispatch.region
 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64xf32>

@@ -1,18 +1,16 @@
 // RUN: iree-opt --split-input-file -iree-flow-collapse-dims %s | FileCheck %s
 
-func.func @multi_reduce_dim(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+func.func @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
-  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x32x10x4096xf32>
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
-  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%arg0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
   ^bb0(%arg1: f32, %arg2: f32):
     %6 = arith.addf %arg1, %arg2 : f32
     linalg.yield %6 : f32
   } -> tensor<2x32xf32>
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  %5 = hal.tensor.export %4 : tensor<2x32x1x1xf32> -> !hal.buffer_view
-  return %5 : !hal.buffer_view
+  return %4 : tensor<2x32x1x1xf32>
 }
 
 // Check that we collapse dimensions.
@@ -42,22 +40,20 @@ func.func @input_broadcast(%arg0: tensor<4x8xf32>, %arg1: tensor<4xf32>) -> tens
 
 // Collapsing should not happen to ops in flow.dispatch.region or flow.dispatch.workgroups
 
-func.func @multi_reduce_dim_dispatch(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+func.func @multi_reduce_dim_dispatch(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
-  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x32x10x4096xf32>
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
   %3 = flow.dispatch.region -> (tensor<2x32xf32>) {
-    %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
+    %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%arg0 : tensor<2x32x10x4096xf32>) outs(%2 : tensor<2x32xf32>) {
     ^bb0(%arg1: f32, %arg2: f32):
       %7 = arith.addf %arg1, %arg2 : f32
       linalg.yield %7 : f32
     } -> tensor<2x32xf32>
     flow.return %6 : tensor<2x32xf32>
   }
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  %5 = hal.tensor.export %4 : tensor<2x32x1x1xf32> -> !hal.buffer_view
-  return %5 : !hal.buffer_view
+  return %4 : tensor<2x32x1x1xf32>
 }
 
 // CHECK: @multi_reduce_dim_dispatch

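For context, the RUN line at the top of the collapse-dims file above shows how these lit tests are executed. Reproducing one by hand looks roughly like this (the file name collapse_dims.mlir is hypothetical; the check patterns are read from the same file that provides the input):

$ iree-opt --split-input-file -iree-flow-collapse-dims collapse_dims.mlir | FileCheck collapse_dims.mlir
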
@@ -790,7 +790,7 @@ func.func @dynamic_slice(%arg0: tensor<?x?xi32>, %arg1: tensor<i32>, %arg2: tens
 
 // -----
 
-func.func @dynamic_dot() -> !hal.buffer_view attributes {iree.abi.stub} {
+func.func @dynamic_dot() -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -801,10 +801,7 @@ func.func @dynamic_dot() -> !hal.buffer_view attributes {iree.abi.stub} {
   %4 = tensor.empty(%2, %3) : tensor<?x?xf32>
   %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %6 = linalg.matmul ins(%0, %1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %7 = tensor.dim %6, %c0 : tensor<?x?xf32>
-  %8 = tensor.dim %6, %c1 : tensor<?x?xf32>
-  %9 = hal.tensor.export %6 : tensor<?x?xf32>{%7, %8} -> !hal.buffer_view
-  return %9 : !hal.buffer_view
+  return %6 : tensor<?x?xf32>
 }
 // CHECK-LABEL: func.func @dynamic_dot()
 // CHECK-NOT: linalg.fill