From 8bf383e6920764cfe02736a238eae3ba4a616c32 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Fri, 27 May 2022 19:45:12 -0700
Subject: [PATCH 01/39] [Runtime][PipelineExecutor]  Tutorial of using pipeline
 executor.

Add a tutorial on using the pipeline executor, including the BYOC use case.
---
 .../using_with_pipeline_executor.py           | 187 ++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 gallery/how_to/work_with_relay/using_with_pipeline_executor.py

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
new file mode 100644
index 000000000000..17f14fef72a3
--- /dev/null
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -0,0 +1,187 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use Pipeline Executor with Relay.
+
+Relay uses TVM internally to generate target specific code. For example, with cuda backend TVM generates cuda kernels for all layers in the user provided network.
+But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay.
+Luckily, TVM has a mechanism to transparently call into these libraries.
+For Relay users, all we need to do is just to set a target string appropriately.
+
+Before we can use external libraries from Relay, your TVM needs to be built with libraries you want to use.
+For example, to use cuDNN, USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and cuDNN include and library directories need to be specified if necessary.
+
+To begin with, we import Relay and TVM.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+import time
+
+######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# -----------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.conv2d(
+        data=simple_net,
+        weight=second_weight,
+        kernel_size=(3, 3),
+        channels=out_channels,
+        padding=(1, 1),
+    )
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
+    data_shape = (batch_size, 3, 224, 224)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+######################################################################
+# Apply a customer graph splitting function.
+# -------------------------------
+# We use an testing linear graph splitting function as a example. User also can create their
+# own splitting function logic.
+import os
+
+os.sys.path.append(os.path.abspath("../../../tests/python/relay"))
+from test_pipeline_executor import graph_split
+
+# Splitting the network into two subgraphs.
+split_config = [{"op_name": "nn.relu", "op_index": 0}]
+subgraphs = graph_split(net["main"], split_config, params)
+##############################################################
+# The generated subgraphs should look something like below.
+##subgraphs[0])
+#
+# def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
+#  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
+#  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+#  %2 = %1.0;
+#  nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
+# }
+#
+#
+##subgraphs[1]
+#
+# def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
+#  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */
+# }
+##############################################################
+
+##############################################################
+# Enable the pipeline executor, and doing the configuration.
+# -------------------------------------------------------------
+# In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor
+# import pipeline_executor, and pipeline_executor_build
+##############################################################
+from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+
+# Create subgraph pipeline configuration.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+####################################################################
+# Associate the subgraph module with a target.
+# Set the codegen of the second subgraph module as dnnl, and the target as the CPU
+# Enable dnnl by set USE_DNNL_CODEGEN as on in config.cmake and install MKL-DNN.
+# using BYOC to apply dnnl codegen
+mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
+mod0 = relay.transform.MergeCompilerRegions()(mod0)
+mod0 = relay.transform.PartitionGraph()(mod0)
+# Start setting the pipeline configure.
+pipe_config = pipeline_executor_build.PipelineConfig()
+# Set the compile target of the second subgraph module as CPU.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+# Set the cpu afinity for control flow, for example use cpu 0 for control flow.
+pipe_config[mod1].cpu_affinity = "0"
+# Set the compile target of the second subgraph module as CPU.
+pipe_config[mod1].target = "llvm"
+pipe_config[mod1].dev = tvm.cpu(0)
+# Set the cpu afinity for control flow, for example use cpu 1 for control flow.
+pipe_config[mod1].cpu_affinity = "1"
+pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+####################################################################
+# The pipeline configuration like below(print(pipe_config)).
+#
+# Inputs
+#  |data: mod0:data
+#
+# output
+#  |output(0) : mod1.output(0)
+#
+# connections
+#  |mod0.output(0)-> mod1.data_n_0
+# Build the pipeline executor
+with tvm.transform.PassContext(opt_level=3):
+    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
+# Export the parameter configuration to a file.
+directory_path = tvm.contrib.utils.tempdir().temp_dir
+# If the directory does not exist, create it.
+if not os.path.exists(directory_path):
+    os.makedirs(directory_path)
+config_file_name = pipeline_mod_factory.export_library(directory_path)
+# Use the load function to create and initialize PipelineModule.
+pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
+# Allocated a data.
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+# Run the pipeline executor
+pipeline_module.set_input("data", tvm.nd.array(data))
+pipeline_module.run()
+outputs = []
+while not (outputs := pipeline_module.get_output()):
+    time.sleep(0.001)
+# Run with graph_executor and verify the output of pipeline executor.
+target = "llvm"
+dev = tvm.device(target, 0)
+lib0 = relay.build_module.build(mod0, target, params=params)
+lib1 = relay.build_module.build(mod1, target, params=params)
+module0 = runtime.GraphModule(lib0["default"](dev))
+module1 = runtime.GraphModule(lib1["default"](dev))
+module0.set_input("data", data)
+module0.run()
+out_shape = (1, 16, 224, 224)
+out = module0.get_output(0, tvm.nd.empty(out_shape))
+module1.set_input("data_n_0", out)
+module1.run()
+out = module1.get_output(0, tvm.nd.empty(out_shape))
+# Verify the result.
+tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())

From 6332de0ce1f17e4dc846770f5f2125aad345e3fa Mon Sep 17 00:00:00 2001
From: Hua Jiang <huaj@xilinx.com>
Date: Thu, 2 Jun 2022 21:55:52 -0700
Subject: [PATCH 02/39] fix ci issue

---
 .../how_to/work_with_relay/using_with_pipeline_executor.py   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 17f14fef72a3..111071cb0f57 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -81,7 +81,7 @@ def get_network():
 # own splitting function logic.
 import os
 
-os.sys.path.append(os.path.abspath("../../../tests/python/relay"))
+os.sys.path.append(os.path.abspath(os.environ["TVM_HOME"] + "/tests/python/relay"))
 from test_pipeline_executor import graph_split
 
 # Splitting the network into two subgraphs.
@@ -167,7 +167,8 @@ def get_network():
 pipeline_module.set_input("data", tvm.nd.array(data))
 pipeline_module.run()
 outputs = []
-while not (outputs := pipeline_module.get_output()):
+while not outputs:
+    outputs = pipeline_module.get_output()
     time.sleep(0.001)
 # Run with graph_executor and verify the output of pipeline executor.
 target = "llvm"

From cb49f993f1f538ea7b54b17b8c078eb282683096 Mon Sep 17 00:00:00 2001
From: Hua Jiang <huaj@xilinx.com>
Date: Thu, 2 Jun 2022 21:59:38 -0700
Subject: [PATCH 03/39] document change.

---
 .../work_with_relay/using_with_pipeline_executor.py    | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 111071cb0f57..d3e52d7e70d3 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -20,16 +20,6 @@
 **Author**: `Hua Jiang <https://https://github.com/huajsj>`_
 
 This is a short tutorial on how to use Pipeline Executor with Relay.
-
-Relay uses TVM internally to generate target specific code. For example, with cuda backend TVM generates cuda kernels for all layers in the user provided network.
-But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay.
-Luckily, TVM has a mechanism to transparently call into these libraries.
-For Relay users, all we need to do is just to set a target string appropriately.
-
-Before we can use external libraries from Relay, your TVM needs to be built with libraries you want to use.
-For example, to use cuDNN, USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and cuDNN include and library directories need to be specified if necessary.
-
-To begin with, we import Relay and TVM.
 """
 import tvm
 from tvm import te

From 226fc58db82bc666a899aa399f476cbfea74308a Mon Sep 17 00:00:00 2001
From: Hua Jiang <huaj@xilinx.com>
Date: Fri, 3 Jun 2022 00:05:23 -0700
Subject: [PATCH 04/39] trigger build

---
 gallery/how_to/work_with_relay/using_with_pipeline_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index d3e52d7e70d3..2df82546f449 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -19,7 +19,7 @@
 =================================
 **Author**: `Hua Jiang <https://https://github.com/huajsj>`_
 
-This is a short tutorial on how to use Pipeline Executor with Relay.
+This is a short tutorial on how to use the Pipeline Executor with Relay.
 """
 import tvm
 from tvm import te

From 031b3ad13dd2a1bf679ab83cfb873b5dee836cbc Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sat, 4 Jun 2022 22:20:29 -0700
Subject: [PATCH 05/39] fix doc issue

---
 .../using_with_pipeline_executor.py           | 57 ++++++++++---------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 2df82546f449..17fa994c982f 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -30,9 +30,9 @@
 import tvm.testing
 import time
 
-######################################################################
+#######################################################################
 # Create a simple network, this network can be a pre-trained model too.
-# -----------------------
+# ---------------------------------------------------------------------
 # Let's create a very simple network for demonstration.
 # It consists of convolution, batch normalization, and ReLU activation.
 def get_network():
@@ -64,9 +64,9 @@ def get_network():
 
 
 net, params, data_shape = get_network()
-######################################################################
+#############################################
 # Apply a customer graph splitting function.
-# -------------------------------
+# ------------------------------------------
 # We use an testing linear graph splitting function as a example. User also can create their
 # own splitting function logic.
 import os
@@ -77,28 +77,30 @@ def get_network():
 # Splitting the network into two subgraphs.
 split_config = [{"op_name": "nn.relu", "op_index": 0}]
 subgraphs = graph_split(net["main"], split_config, params)
-##############################################################
+###########################################################
 # The generated subgraphs should look something like below.
-##subgraphs[0])
-#
-# def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
-#  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
-#  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
-#  %2 = %1.0;
-#  nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
-# }
-#
-#
-##subgraphs[1]
-#
-# def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
-#  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */
-# }
-##############################################################
+#----------------------------------------------------------
+```
+#subgraphs[0])
 
-##############################################################
+ def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+  %2 = %1.0;
+  nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
+ }
+
+
+#subgraphs[1]
+
+ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
+  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */
+ }
+```
+
+############################################################
 # Enable the pipeline executor, and doing the configuration.
-# -------------------------------------------------------------
+# ----------------------------------------------------------
 # In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor
 # import pipeline_executor, and pipeline_executor_build
 ##############################################################
@@ -129,9 +131,11 @@ def get_network():
 pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
 pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
 pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-####################################################################
-# The pipeline configuration like below(print(pipe_config)).
-#
+##########################################
+# The pipeline configuration like below().
+#-----------------------------------------
+```
+print(pipe_config)
 # Inputs
 #  |data: mod0:data
 #
@@ -140,6 +144,7 @@ def get_network():
 #
 # connections
 #  |mod0.output(0)-> mod1.data_n_0
+```
 # Build the pipeline executor
 with tvm.transform.PassContext(opt_level=3):
     pipeline_mod_factory = pipeline_executor_build.build(pipe_config)

From d046177a3a5945f6d82a6ee50823f48975595989 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sat, 4 Jun 2022 22:38:23 -0700
Subject: [PATCH 06/39] fix ci issue

---
 .../using_with_pipeline_executor.py           | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 17fa994c982f..b2ea6f7a85fc 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -79,8 +79,8 @@ def get_network():
 subgraphs = graph_split(net["main"], split_config, params)
 ###########################################################
 # The generated subgraphs should look something like below.
-#----------------------------------------------------------
-```
+# ----------------------------------------------------------
+"""
 #subgraphs[0])
 
  def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
@@ -96,7 +96,7 @@ def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
  def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
   nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */
  }
-```
+"""
 
 ############################################################
 # Enable the pipeline executor, and doing the configuration.
@@ -133,18 +133,18 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
 ##########################################
 # The pipeline configuration like below().
-#-----------------------------------------
-```
+# -----------------------------------------
+"""
 print(pipe_config)
-# Inputs
-#  |data: mod0:data
-#
-# output
-#  |output(0) : mod1.output(0)
-#
-# connections
-#  |mod0.output(0)-> mod1.data_n_0
-```
+ Inputs
+  |data: mod0:data
+
+ output
+  |output(0) : mod1.output(0)
+
+ connections
+  |mod0.output(0)-> mod1.data_n_0
+"""
 # Build the pipeline executor
 with tvm.transform.PassContext(opt_level=3):
     pipeline_mod_factory = pipeline_executor_build.build(pipe_config)

From 8d01a7f87e907a2dcd35ef46fcb46b1865291846 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sat, 4 Jun 2022 23:27:04 -0700
Subject: [PATCH 07/39] doc issue

---
 .../how_to/work_with_relay/using_with_pipeline_executor.py   | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index b2ea6f7a85fc..80dadb6ad0f0 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -101,9 +101,8 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 ############################################################
 # Enable the pipeline executor, and doing the configuration.
 # ----------------------------------------------------------
-# In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor
-# import pipeline_executor, and pipeline_executor_build
-##############################################################
+# In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
+
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 
 # Create subgraph pipeline configuration.

From 86cfbe4340810c0da5fdd51b5d7cdc9ca724c271 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 8 Jun 2022 00:40:39 -0700
Subject: [PATCH 08/39] fix ci issue

---
 .../using_with_pipeline_executor.py           | 80 ++++++++++++-------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 80dadb6ad0f0..e26a45f68689 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -46,9 +46,11 @@ def get_network():
     bn_mmean = relay.var("bn_mean")
     bn_mvar = relay.var("bn_var")
     simple_net = relay.nn.conv2d(
-        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
+        data=data, weight=weight, kernel_size=(3, 3),
+        channels=out_channels, padding=(1, 1)
     )
-    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta,
+                                     bn_mmean, bn_mvar)[0]
     simple_net = relay.nn.relu(simple_net)
     simple_net = relay.nn.conv2d(
         data=simple_net,
@@ -64,22 +66,22 @@ def get_network():
 
 
 net, params, data_shape = get_network()
-#############################################
-# Apply a customer graph splitting function.
-# ------------------------------------------
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
 # We use an testing linear graph splitting function as a example. User also can create their
 # own splitting function logic.
 import os
-
-os.sys.path.append(os.path.abspath(os.environ["TVM_HOME"] + "/tests/python/relay"))
+test_path = os.path.join(os.path.dirname(__file__), '../../../tests/python/relay')
+os.sys.path.append(test_path)
 from test_pipeline_executor import graph_split
-
+###########################################
 # Splitting the network into two subgraphs.
 split_config = [{"op_name": "nn.relu", "op_index": 0}]
 subgraphs = graph_split(net["main"], split_config, params)
 ###########################################################
 # The generated subgraphs should look something like below.
-# ----------------------------------------------------------
+
 """
 #subgraphs[0])
 
@@ -98,41 +100,42 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
  }
 """
 
-############################################################
-# Enable the pipeline executor, and doing the configuration.
-# ----------------------------------------------------------
+#########################################
+# Create subgraph pipeline configuration.
+# ---------------------------------------
 # In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
-
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-
+#########################################
 # Create subgraph pipeline configuration.
-mod0, mod1 = subgraphs[0], subgraphs[1]
-####################################################################
 # Associate the subgraph module with a target.
-# Set the codegen of the second subgraph module as dnnl, and the target as the CPU
-# Enable dnnl by set USE_DNNL_CODEGEN as on in config.cmake and install MKL-DNN.
-# using BYOC to apply dnnl codegen
+# Using BYOC to set the codegen of the second subgraph module.
+# To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+mod0, mod1 = subgraphs[0], subgraphs[1]
 mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
 mod0 = relay.transform.MergeCompilerRegions()(mod0)
 mod0 = relay.transform.PartitionGraph()(mod0)
-# Start setting the pipeline configure.
+#################################################
+# Get the pipeline executor configuration object.
 pipe_config = pipeline_executor_build.PipelineConfig()
-# Set the compile target of the second subgraph module as CPU.
+###########################################################################
+# Set the compile target of the second subgraph module for example as LLVM.
 pipe_config[mod0].target = "llvm"
 pipe_config[mod0].dev = tvm.cpu(0)
-# Set the cpu afinity for control flow, for example use cpu 0 for control flow.
+###############################################################################
+# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
 pipe_config[mod1].cpu_affinity = "0"
-# Set the compile target of the second subgraph module as CPU.
+##############################################################
+# Set the compile target of the second subgraph module as LLVM.
 pipe_config[mod1].target = "llvm"
 pipe_config[mod1].dev = tvm.cpu(0)
-# Set the cpu afinity for control flow, for example use cpu 1 for control flow.
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
 pipe_config[mod1].cpu_affinity = "1"
 pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
 pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
 pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-##########################################
-# The pipeline configuration like below().
-# -----------------------------------------
+######################################
+# The pipeline configuration as below.
 """
 print(pipe_config)
  Inputs
@@ -144,27 +147,41 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
  connections
   |mod0.output(0)-> mod1.data_n_0
 """
-# Build the pipeline executor
+##############################
+# Build the pipeline executor.
+# ----------------------------
 with tvm.transform.PassContext(opt_level=3):
     pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
 # Export the parameter configuration to a file.
 directory_path = tvm.contrib.utils.tempdir().temp_dir
+#############################################
 # If the directory does not exist, create it.
 if not os.path.exists(directory_path):
     os.makedirs(directory_path)
 config_file_name = pipeline_mod_factory.export_library(directory_path)
+################################################################
 # Use the load function to create and initialize PipelineModule.
+# --------------------------------------------------------------
 pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
-# Allocated a data.
+
+
+############################
+# Run the pipeline executor.
+# --------------------------
+# Allocated a input data.
 data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-# Run the pipeline executor
 pipeline_module.set_input("data", tvm.nd.array(data))
+##########################################################################
+# Run the two subgraph in pipeline mode and get the output asynchronously.
 pipeline_module.run()
 outputs = []
 while not outputs:
     outputs = pipeline_module.get_output()
     time.sleep(0.001)
-# Run with graph_executor and verify the output of pipeline executor.
+######################################
+# Use graph_executor for verification.
+# ------------------------------------
+# Run these two subgraphs in sequence with graph_executor to get the output.
 target = "llvm"
 dev = tvm.device(target, 0)
 lib0 = relay.build_module.build(mod0, target, params=params)
@@ -178,5 +195,6 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 module1.set_input("data_n_0", out)
 module1.run()
 out = module1.get_output(0, tvm.nd.empty(out_shape))
+####################
 # Verify the result.
 tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())

From 22788ba2565299665e9c5311fc535e456a3739c1 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 8 Jun 2022 00:46:22 -0700
Subject: [PATCH 09/39] fix ci issue.

---
 .../work_with_relay/using_with_pipeline_executor.py   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index e26a45f68689..43349a366946 100644
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -46,11 +46,9 @@ def get_network():
     bn_mmean = relay.var("bn_mean")
     bn_mvar = relay.var("bn_var")
     simple_net = relay.nn.conv2d(
-        data=data, weight=weight, kernel_size=(3, 3),
-        channels=out_channels, padding=(1, 1)
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
     )
-    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta,
-                                     bn_mmean, bn_mvar)[0]
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
     simple_net = relay.nn.relu(simple_net)
     simple_net = relay.nn.conv2d(
         data=simple_net,
@@ -72,9 +70,11 @@ def get_network():
 # We use an testing linear graph splitting function as a example. User also can create their
 # own splitting function logic.
 import os
-test_path = os.path.join(os.path.dirname(__file__), '../../../tests/python/relay')
+
+test_path = os.path.join(os.path.dirname(__file__), "../../../tests/python/relay")
 os.sys.path.append(test_path)
 from test_pipeline_executor import graph_split
+
 ###########################################
 # Splitting the network into two subgraphs.
 split_config = [{"op_name": "nn.relu", "op_index": 0}]
@@ -105,6 +105,7 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 # ---------------------------------------
 # In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+
 #########################################
 # Create subgraph pipeline configuration.
 # Associate the subgraph module with a target.

From 9a550fbfe2c4c6216d83a1961d41a22daa0e0c7f Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 8 Jun 2022 11:32:18 -0700
Subject: [PATCH 10/39] fix __file__ not found problem.

This is a known issue in sphinx-gallery (`__file__` is not defined while a
gallery script is being executed):
https://github.com/sphinx-gallery/sphinx-gallery/issues/211
---
 .../how_to/work_with_relay/using_with_pipeline_executor.py   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 gallery/how_to/work_with_relay/using_with_pipeline_executor.py

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
old mode 100644
new mode 100755
index 43349a366946..93d2d6aa7761
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -69,10 +69,11 @@ def get_network():
 # -----------------------------------------
 # We use an testing linear graph splitting function as a example. User also can create their
 # own splitting function logic.
+import inspect
 import os
 
-test_path = os.path.join(os.path.dirname(__file__), "../../../tests/python/relay")
-os.sys.path.append(test_path)
+test_path = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
 from test_pipeline_executor import graph_split
 
 ###########################################

From 1b532586493714e9647db00974be63863c69ad2f Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 8 Jun 2022 17:46:36 -0700
Subject: [PATCH 11/39] fix byoc with dnnl issue

---
 .../using_with_pipeline_executor.py           | 211 ++++++++++--------
 1 file changed, 112 insertions(+), 99 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 93d2d6aa7761..bb1467c8e0e7 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -93,6 +93,7 @@ def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
   nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
  }
 
+peline-tutorial
 
 #subgraphs[1]
 
@@ -101,102 +102,114 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
  }
 """
 
-#########################################
-# Create subgraph pipeline configuration.
-# ---------------------------------------
-# In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
-from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-
-#########################################
-# Create subgraph pipeline configuration.
-# Associate the subgraph module with a target.
-# Using BYOC to set the codegen of the second subgraph module.
-# To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
-mod0, mod1 = subgraphs[0], subgraphs[1]
-mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-mod0 = relay.transform.MergeCompilerRegions()(mod0)
-mod0 = relay.transform.PartitionGraph()(mod0)
-#################################################
-# Get the pipeline executor configuration object.
-pipe_config = pipeline_executor_build.PipelineConfig()
-###########################################################################
-# Set the compile target of the second subgraph module for example as LLVM.
-pipe_config[mod0].target = "llvm"
-pipe_config[mod0].dev = tvm.cpu(0)
-###############################################################################
-# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
-pipe_config[mod1].cpu_affinity = "0"
-##############################################################
-# Set the compile target of the second subgraph module as LLVM.
-pipe_config[mod1].target = "llvm"
-pipe_config[mod1].dev = tvm.cpu(0)
-#################################################################################
-# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
-pipe_config[mod1].cpu_affinity = "1"
-pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
-pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-######################################
-# The pipeline configuration as below.
-"""
-print(pipe_config)
- Inputs
-  |data: mod0:data
-
- output
-  |output(0) : mod1.output(0)
-
- connections
-  |mod0.output(0)-> mod1.data_n_0
-"""
-##############################
-# Build the pipeline executor.
-# ----------------------------
-with tvm.transform.PassContext(opt_level=3):
-    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-# Export the parameter configuration to a file.
-directory_path = tvm.contrib.utils.tempdir().temp_dir
-#############################################
-# If the directory does not exist, create it.
-if not os.path.exists(directory_path):
-    os.makedirs(directory_path)
-config_file_name = pipeline_mod_factory.export_library(directory_path)
-################################################################
-# Use the load function to create and initialize PipelineModule.
-# --------------------------------------------------------------
-pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
-
-
-############################
-# Run the pipeline executor.
-# --------------------------
-# Allocated a input data.
-data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-pipeline_module.set_input("data", tvm.nd.array(data))
-##########################################################################
-# Run the two subgraph in pipeline mode and get the output asynchronously.
-pipeline_module.run()
-outputs = []
-while not outputs:
-    outputs = pipeline_module.get_output()
-    time.sleep(0.001)
-######################################
-# Use graph_executor for verification.
-# ------------------------------------
-# Run these two subgraphs in sequence with graph_executor to get the output.
-target = "llvm"
-dev = tvm.device(target, 0)
-lib0 = relay.build_module.build(mod0, target, params=params)
-lib1 = relay.build_module.build(mod1, target, params=params)
-module0 = runtime.GraphModule(lib0["default"](dev))
-module1 = runtime.GraphModule(lib1["default"](dev))
-module0.set_input("data", data)
-module0.run()
-out_shape = (1, 16, 224, 224)
-out = module0.get_output(0, tvm.nd.empty(out_shape))
-module1.set_input("data_n_0", out)
-module1.run()
-out = module1.get_output(0, tvm.nd.empty(out_shape))
-####################
-# Verify the result.
-tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
+###########################################################
+# Run the two subgraphs in pipeline with pipeline executor.
+# ---------------------------------------------------------
+# Define a function to do all the codegen and pipeline executor works.
+def run_pipeline_executor():
+    #########################################
+    # Create subgraph pipeline configuration.
+    # ---------------------------------------
+    # In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
+    from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+
+    #########################################
+    # Create subgraph pipeline configuration.
+    # Associate the subgraph module with a target.
+    # Using BYOC to set the codegen of the second subgraph module.
+    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    mod0, mod1 = subgraphs[0], subgraphs[1]
+    mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
+    mod0 = relay.transform.MergeCompilerRegions()(mod0)
+    mod0 = relay.transform.PartitionGraph()(mod0)
+    #################################################
+    # Get the pipeline executor configuration object.
+    pipe_config = pipeline_executor_build.PipelineConfig()
+    ###########################################################################
+    # Set the compile target of the second subgraph module for example as LLVM.
+    pipe_config[mod0].target = "llvm"
+    pipe_config[mod0].dev = tvm.cpu(0)
+    ###############################################################################
+    # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
+    pipe_config[mod1].cpu_affinity = "0"
+    ##############################################################
+    # Set the compile target of the second subgraph module as LLVM.
+    pipe_config[mod1].target = "llvm"
+    pipe_config[mod1].dev = tvm.cpu(0)
+    #################################################################################
+    # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
+    pipe_config[mod1].cpu_affinity = "1"
+    pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+    pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+    pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+    ######################################
+    # The pipeline configuration as below.
+    """
+    print(pipe_config)
+     Inputs
+      |data: mod0:data
+
+     output
+      |output(0) : mod1.output(0)
+
+     connections
+      |mod0.output(0)-> mod1.data_n_0
+    """
+    ##############################
+    # Build the pipeline executor.
+    # ----------------------------
+    with tvm.transform.PassContext(opt_level=3):
+        pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
+    ###############################################
+    # Export the parameter configuration to a file.
+    directory_path = tvm.contrib.utils.tempdir().temp_dir
+    #############################################
+    # If the directory does not exist, create it.
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
+    config_file_name = pipeline_mod_factory.export_library(directory_path)
+    ################################################################
+    # Use the load function to create and initialize PipelineModule.
+    # --------------------------------------------------------------
+    pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
+
+    ############################
+    # Run the pipeline executor.
+    # --------------------------
+    # Allocated a input data.
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    pipeline_module.set_input("data", tvm.nd.array(data))
+    ##########################################################################
+    # Run the two subgraph in pipeline mode and get the output asynchronously.
+    pipeline_module.run()
+    outputs = []
+    while not outputs:
+        outputs = pipeline_module.get_output()
+        time.sleep(0.001)
+    ######################################
+    # Use graph_executor for verification.
+    # ------------------------------------
+    # Run these two subgraphs in sequence with graph_executor to get the output.
+    target = "llvm"
+    dev = tvm.device(target, 0)
+    lib0 = relay.build_module.build(mod0, target, params=params)
+    lib1 = relay.build_module.build(mod1, target, params=params)
+    module0 = runtime.GraphModule(lib0["default"](dev))
+    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0.set_input("data", data)
+    module0.run()
+    out_shape = (1, 16, 224, 224)
+    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    module1.set_input("data_n_0", out)
+    module1.run()
+    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    ####################
+    # Verify the result.
+    tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
+
+
+##################################################################################
+# To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
+# and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+
+# run_pipeline_executor()

From 7757b1bcb2c39fad56d5db7ad5fe14f419153e90 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Thu, 9 Jun 2022 13:44:18 -0700
Subject: [PATCH 12/39] enable dnnl and pipeline executor

---
 gallery/how_to/work_with_relay/using_with_pipeline_executor.py | 2 +-
 tests/scripts/task_config_build_gpu.sh                         | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index bb1467c8e0e7..380cf79c42bf 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -212,4 +212,4 @@ def run_pipeline_executor():
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
 # and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
 
-# run_pipeline_executor()
+run_pipeline_executor()
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 9a71983886dd..8d0ecc5e3c21 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,3 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
+echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
+echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake

From 15db48a98226ddb8099dda4df22b5c90132daa10 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Thu, 9 Jun 2022 14:48:05 -0700
Subject: [PATCH 13/39] trigger build

---
 gallery/how_to/work_with_relay/using_with_pipeline_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 380cf79c42bf..a4e5c1a425a9 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -19,7 +19,7 @@
 =================================
 **Author**: `Hua Jiang <https://https://github.com/huajsj>`_
 
-This is a short tutorial on how to use the Pipeline Executor with Relay.
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
 """
 import tvm
 from tvm import te

From 3b02c9ab84b5ec36181db4dfad4170474921df27 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Thu, 9 Jun 2022 17:17:28 -0700
Subject: [PATCH 14/39] trigger build

---
 gallery/how_to/work_with_relay/using_with_pipeline_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index a4e5c1a425a9..eaec39de1031 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -19,7 +19,7 @@
 =================================
 **Author**: `Hua Jiang <https://https://github.com/huajsj>`_
 
-This is a short tutorial on how to use "Pipeline Executor" with Relay.
+This is a short tutorial on how to use the "Pipeline Executor" with Relay.
 """
 import tvm
 from tvm import te

From 0811b24dde7319171c8f96cfe6f33cd425dadecb Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 21 Jun 2022 13:59:45 -0700
Subject: [PATCH 15/39] fix build issue

---
 .../using_with_pipeline_executor.py           | 198 +++++++++---------
 tests/scripts/task_config_build_gpu.sh        |   2 +-
 2 files changed, 95 insertions(+), 105 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index eaec39de1031..d829f2030a06 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -106,110 +106,100 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
 # Define a function to do all the codegen and pipeline executor works.
-def run_pipeline_executor():
-    #########################################
-    # Create subgraph pipeline configuration.
-    # ---------------------------------------
-    # In build/config.cmake set USE_PIPELINE_EXECUTOR as ON to enable pipeline executor.
-    from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-
-    #########################################
-    # Create subgraph pipeline configuration.
-    # Associate the subgraph module with a target.
-    # Using BYOC to set the codegen of the second subgraph module.
-    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
-    mod0, mod1 = subgraphs[0], subgraphs[1]
-    mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    mod0 = relay.transform.PartitionGraph()(mod0)
-    #################################################
-    # Get the pipeline executor configuration object.
-    pipe_config = pipeline_executor_build.PipelineConfig()
-    ###########################################################################
-    # Set the compile target of the second subgraph module for example as LLVM.
-    pipe_config[mod0].target = "llvm"
-    pipe_config[mod0].dev = tvm.cpu(0)
-    ###############################################################################
-    # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
-    pipe_config[mod1].cpu_affinity = "0"
-    ##############################################################
-    # Set the compile target of the second subgraph module as LLVM.
-    pipe_config[mod1].target = "llvm"
-    pipe_config[mod1].dev = tvm.cpu(0)
-    #################################################################################
-    # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
-    pipe_config[mod1].cpu_affinity = "1"
-    pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
-    pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-    pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-    ######################################
-    # The pipeline configuration as below.
-    """
-    print(pipe_config)
-     Inputs
-      |data: mod0:data
-
-     output
-      |output(0) : mod1.output(0)
-
-     connections
-      |mod0.output(0)-> mod1.data_n_0
-    """
-    ##############################
-    # Build the pipeline executor.
-    # ----------------------------
-    with tvm.transform.PassContext(opt_level=3):
-        pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-    ###############################################
-    # Export the parameter configuration to a file.
-    directory_path = tvm.contrib.utils.tempdir().temp_dir
-    #############################################
-    # If the directory does not exist, create it.
-    if not os.path.exists(directory_path):
-        os.makedirs(directory_path)
-    config_file_name = pipeline_mod_factory.export_library(directory_path)
-    ################################################################
-    # Use the load function to create and initialize PipelineModule.
-    # --------------------------------------------------------------
-    pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
-
-    ############################
-    # Run the pipeline executor.
-    # --------------------------
-    # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-    pipeline_module.set_input("data", tvm.nd.array(data))
-    ##########################################################################
-    # Run the two subgraph in pipeline mode and get the output asynchronously.
-    pipeline_module.run()
-    outputs = []
-    while not outputs:
-        outputs = pipeline_module.get_output()
-        time.sleep(0.001)
-    ######################################
-    # Use graph_executor for verification.
-    # ------------------------------------
-    # Run these two subgraphs in sequence with graph_executor to get the output.
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    lib0 = relay.build_module.build(mod0, target, params=params)
-    lib1 = relay.build_module.build(mod1, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev))
-    module1 = runtime.GraphModule(lib1["default"](dev))
-    module0.set_input("data", data)
-    module0.run()
-    out_shape = (1, 16, 224, 224)
-    out = module0.get_output(0, tvm.nd.empty(out_shape))
-    module1.set_input("data_n_0", out)
-    module1.run()
-    out = module1.get_output(0, tvm.nd.empty(out_shape))
-    ####################
-    # Verify the result.
-    tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
-
-
-##################################################################################
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
 # and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+
+#########################################
+# Create subgraph pipeline configuration.
+# Associate the subgraph module with a target.
+# Using BYOC to set the codegen of the second subgraph module.
+# To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
+mod0 = relay.transform.MergeCompilerRegions()(mod0)
+mod0 = relay.transform.PartitionGraph()(mod0)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the second subgraph module for example as LLVM.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
+pipe_config[mod1].cpu_affinity = "0"
+##############################################################
+# Set the compile target of the second subgraph module as LLVM.
+pipe_config[mod1].target = "llvm"
+pipe_config[mod1].dev = tvm.cpu(0)
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
+pipe_config[mod1].cpu_affinity = "1"
+pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+######################################
+# The pipeline configuration as below.
+"""
+print(pipe_config)
+ Inputs
+  |data: mod0:data
+
+ output
+  |output(0) : mod1.output(0)
 
-run_pipeline_executor()
+ connections
+  |mod0.output(0)-> mod1.data_n_0
+"""
+##############################
+# Build the pipeline executor.
+# ----------------------------
+with tvm.transform.PassContext(opt_level=3):
+    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
+###############################################
+# Export the parameter configuration to a file.
+directory_path = tvm.contrib.utils.tempdir().temp_dir
+#############################################
+# If the directory does not exist, create it.
+if not os.path.exists(directory_path):
+    os.makedirs(directory_path)
+config_file_name = pipeline_mod_factory.export_library(directory_path)
+################################################################
+# Use the load function to create and initialize PipelineModule.
+# --------------------------------------------------------------
+pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
+
+############################
+# Run the pipeline executor.
+# --------------------------
+# Allocated a input data.
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+pipeline_module.set_input("data", tvm.nd.array(data))
+##########################################################################
+# Run the two subgraph in pipeline mode and get the output asynchronously.
+pipeline_module.run()
+outputs = []
+while not outputs:
+    outputs = pipeline_module.get_output()
+    time.sleep(0.001)
+######################################
+# Use graph_executor for verification.
+# ------------------------------------
+# Run these two subgraphs in sequence with graph_executor to get the output.
+target = "llvm"
+dev = tvm.device(target, 0)
+lib0 = relay.build_module.build(mod0, target, params=params)
+lib1 = relay.build_module.build(mod1, target, params=params)
+module0 = runtime.GraphModule(lib0["default"](dev))
+module1 = runtime.GraphModule(lib1["default"](dev))
+module0.set_input("data", data)
+module0.run()
+out_shape = (1, 16, 224, 224)
+out = module0.get_output(0, tvm.nd.empty(out_shape))
+module1.set_input("data_n_0", out)
+module1.run()
+out = module1.get_output(0, tvm.nd.empty(out_shape))
+####################
+# Verify the result.
+tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 8d0ecc5e3c21..9357fa6c8078 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -48,4 +48,4 @@ echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake
+echo set\(USE_DNNL ON\) >> config.cmake

From 53894ec1b5c933261b8468f51d98021cf64601bf Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 22 Jun 2022 00:19:25 -0700
Subject: [PATCH 16/39] trigger build

---
 gallery/how_to/work_with_relay/using_with_pipeline_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index d829f2030a06..364bc50a39d9 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -19,7 +19,7 @@
 =================================
 **Author**: `Hua Jiang <https://https://github.com/huajsj>`_
 
-This is a short tutorial on how to use the "Pipeline Executor" with Relay.
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
 """
 import tvm
 from tvm import te

From fb4f8215f6a519b49b216fb439d2f13ac42b07e0 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Mon, 4 Jul 2022 22:31:05 -0700
Subject: [PATCH 17/39] oneflow causes a crash; test with the tutorial body removed

---
 gallery/how_to/compile_models/from_oneflow.py | 126 ------------------
 1 file changed, 126 deletions(-)

diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py
index eb27c4b3e34b..a80bd562eb49 100644
--- a/gallery/how_to/compile_models/from_oneflow.py
+++ b/gallery/how_to/compile_models/from_oneflow.py
@@ -55,129 +55,3 @@
 from tvm import relay
 from tvm.contrib.download import download_testdata
 
-######################################################################
-# Load a pretrained OneFlow model and save model
-# ----------------------------------------------
-model_name = "resnet18"
-model = getattr(flowvision.models, model_name)(pretrained=True)
-model = model.eval()
-
-model_dir = "resnet18_model"
-if not os.path.exists(model_dir):
-    flow.save(model.state_dict(), model_dir)
-
-######################################################################
-# Load a test image
-# -----------------
-# Classic cat example!
-from PIL import Image
-
-img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-img_path = download_testdata(img_url, "cat.png", module="data")
-img = Image.open(img_path).resize((224, 224))
-
-# Preprocess the image and convert to tensor
-from flowvision import transforms
-
-my_preprocess = transforms.Compose(
-    [
-        transforms.Resize(256),
-        transforms.CenterCrop(224),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-    ]
-)
-img = my_preprocess(img)
-img = np.expand_dims(img.numpy(), 0)
-
-######################################################################
-# Import the graph to Relay
-# -------------------------
-# Convert OneFlow graph to Relay graph. The input name can be arbitrary.
-class Graph(flow.nn.Graph):
-    def __init__(self, module):
-        super().__init__()
-        self.m = module
-
-    def build(self, x):
-        out = self.m(x)
-        return out
-
-
-graph = Graph(model)
-_ = graph._compile(flow.randn(1, 3, 224, 224))
-
-mod, params = relay.frontend.from_oneflow(graph, model_dir)
-
-######################################################################
-# Relay Build
-# -----------
-# Compile the graph to llvm target with given input specification.
-target = tvm.target.Target("llvm", host="llvm")
-dev = tvm.cpu(0)
-with tvm.transform.PassContext(opt_level=3):
-    lib = relay.build(mod, target=target, params=params)
-
-######################################################################
-# Execute the portable graph on TVM
-# ---------------------------------
-# Now we can try deploying the compiled model on target.
-target = "cuda"
-with tvm.transform.PassContext(opt_level=10):
-    intrp = relay.build_module.create_executor("graph", mod, tvm.cuda(0), target)
-
-print(type(img))
-print(img.shape)
-tvm_output = intrp.evaluate()(tvm.nd.array(img.astype("float32")), **params)
-
-#####################################################################
-# Look up synset name
-# -------------------
-# Look up prediction top 1 index in 1000 class synset.
-synset_url = "".join(
-    [
-        "https://raw.githubusercontent.com/Cadene/",
-        "pretrained-models.pytorch/master/data/",
-        "imagenet_synsets.txt",
-    ]
-)
-synset_name = "imagenet_synsets.txt"
-synset_path = download_testdata(synset_url, synset_name, module="data")
-with open(synset_path) as f:
-    synsets = f.readlines()
-
-synsets = [x.strip() for x in synsets]
-splits = [line.split(" ") for line in synsets]
-key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits}
-
-class_url = "".join(
-    [
-        "https://raw.githubusercontent.com/Cadene/",
-        "pretrained-models.pytorch/master/data/",
-        "imagenet_classes.txt",
-    ]
-)
-class_name = "imagenet_classes.txt"
-class_path = download_testdata(class_url, class_name, module="data")
-with open(class_path) as f:
-    class_id_to_key = f.readlines()
-
-class_id_to_key = [x.strip() for x in class_id_to_key]
-
-# Get top-1 result for TVM
-top1_tvm = np.argmax(tvm_output.numpy()[0])
-tvm_class_key = class_id_to_key[top1_tvm]
-
-# Convert input to OneFlow variable and get OneFlow result for comparison
-with flow.no_grad():
-    torch_img = flow.from_numpy(img)
-    output = model(torch_img)
-
-    # Get top-1 result for OneFlow
-    top_oneflow = np.argmax(output.numpy())
-    oneflow_class_key = class_id_to_key[top_oneflow]
-
-print("Relay top-1 id: {}, class name: {}".format(top1_tvm, key_to_classname[tvm_class_key]))
-print(
-    "OneFlow top-1 id: {}, class name: {}".format(top_oneflow, key_to_classname[oneflow_class_key])
-)

From e2597985745e687ad9c302dcfcdca12275404957 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Mon, 4 Jul 2022 22:53:28 -0700
Subject: [PATCH 18/39] add sphinx skip

---
 .../how_to/work_with_relay/using_with_pipeline_executor.py  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 364bc50a39d9..47dc67dfbf9d 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -152,6 +152,12 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
  connections
   |mod0.output(0)-> mod1.data_n_0
 """
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
 ##############################
 # Build the pipeline executor.
 # ----------------------------

From b70a731463dd3675c574d758191eb638dbd7d00a Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Mon, 4 Jul 2022 23:03:18 -0700
Subject: [PATCH 19/39] plint

---
 gallery/how_to/compile_models/from_oneflow.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py
index a80bd562eb49..2a4d0815acc0 100644
--- a/gallery/how_to/compile_models/from_oneflow.py
+++ b/gallery/how_to/compile_models/from_oneflow.py
@@ -54,4 +54,3 @@
 import tvm
 from tvm import relay
 from tvm.contrib.download import download_testdata
-

From 215a2bdc967289387347d477e3b85b836b3e98ac Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 5 Jul 2022 13:30:31 -0700
Subject: [PATCH 20/39] remove from_oneflow change test.

---
 gallery/how_to/compile_models/from_oneflow.py | 127 ++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py
index 2a4d0815acc0..eb27c4b3e34b 100644
--- a/gallery/how_to/compile_models/from_oneflow.py
+++ b/gallery/how_to/compile_models/from_oneflow.py
@@ -54,3 +54,130 @@
 import tvm
 from tvm import relay
 from tvm.contrib.download import download_testdata
+
+######################################################################
+# Load a pretrained OneFlow model and save model
+# ----------------------------------------------
+model_name = "resnet18"
+model = getattr(flowvision.models, model_name)(pretrained=True)
+model = model.eval()
+
+model_dir = "resnet18_model"
+if not os.path.exists(model_dir):
+    flow.save(model.state_dict(), model_dir)
+
+######################################################################
+# Load a test image
+# -----------------
+# Classic cat example!
+from PIL import Image
+
+img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
+img_path = download_testdata(img_url, "cat.png", module="data")
+img = Image.open(img_path).resize((224, 224))
+
+# Preprocess the image and convert to tensor
+from flowvision import transforms
+
+my_preprocess = transforms.Compose(
+    [
+        transforms.Resize(256),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ]
+)
+img = my_preprocess(img)
+img = np.expand_dims(img.numpy(), 0)
+
+######################################################################
+# Import the graph to Relay
+# -------------------------
+# Convert OneFlow graph to Relay graph. The input name can be arbitrary.
+class Graph(flow.nn.Graph):
+    def __init__(self, module):
+        super().__init__()
+        self.m = module
+
+    def build(self, x):
+        out = self.m(x)
+        return out
+
+
+graph = Graph(model)
+_ = graph._compile(flow.randn(1, 3, 224, 224))
+
+mod, params = relay.frontend.from_oneflow(graph, model_dir)
+
+######################################################################
+# Relay Build
+# -----------
+# Compile the graph to llvm target with given input specification.
+target = tvm.target.Target("llvm", host="llvm")
+dev = tvm.cpu(0)
+with tvm.transform.PassContext(opt_level=3):
+    lib = relay.build(mod, target=target, params=params)
+
+######################################################################
+# Execute the portable graph on TVM
+# ---------------------------------
+# Now we can try deploying the compiled model on target.
+target = "cuda"
+with tvm.transform.PassContext(opt_level=10):
+    intrp = relay.build_module.create_executor("graph", mod, tvm.cuda(0), target)
+
+print(type(img))
+print(img.shape)
+tvm_output = intrp.evaluate()(tvm.nd.array(img.astype("float32")), **params)
+
+#####################################################################
+# Look up synset name
+# -------------------
+# Look up prediction top 1 index in 1000 class synset.
+synset_url = "".join(
+    [
+        "https://raw.githubusercontent.com/Cadene/",
+        "pretrained-models.pytorch/master/data/",
+        "imagenet_synsets.txt",
+    ]
+)
+synset_name = "imagenet_synsets.txt"
+synset_path = download_testdata(synset_url, synset_name, module="data")
+with open(synset_path) as f:
+    synsets = f.readlines()
+
+synsets = [x.strip() for x in synsets]
+splits = [line.split(" ") for line in synsets]
+key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits}
+
+class_url = "".join(
+    [
+        "https://raw.githubusercontent.com/Cadene/",
+        "pretrained-models.pytorch/master/data/",
+        "imagenet_classes.txt",
+    ]
+)
+class_name = "imagenet_classes.txt"
+class_path = download_testdata(class_url, class_name, module="data")
+with open(class_path) as f:
+    class_id_to_key = f.readlines()
+
+class_id_to_key = [x.strip() for x in class_id_to_key]
+
+# Get top-1 result for TVM
+top1_tvm = np.argmax(tvm_output.numpy()[0])
+tvm_class_key = class_id_to_key[top1_tvm]
+
+# Convert input to OneFlow variable and get OneFlow result for comparison
+with flow.no_grad():
+    torch_img = flow.from_numpy(img)
+    output = model(torch_img)
+
+    # Get top-1 result for OneFlow
+    top_oneflow = np.argmax(output.numpy())
+    oneflow_class_key = class_id_to_key[top_oneflow]
+
+print("Relay top-1 id: {}, class name: {}".format(top1_tvm, key_to_classname[tvm_class_key]))
+print(
+    "OneFlow top-1 id: {}, class name: {}".format(top_oneflow, key_to_classname[oneflow_class_key])
+)

From bc6e863d2711f90014b4d5e7f837881bbdfabf1e Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 5 Jul 2022 13:37:22 -0700
Subject: [PATCH 21/39] remove pipeline executor change for test

---
 .../using_with_pipeline_executor.py           | 203 +++++++++---------
 tests/scripts/task_config_build_gpu.sh        |   4 +-
 2 files changed, 104 insertions(+), 103 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 47dc67dfbf9d..4b5cba3702a5 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -108,104 +108,105 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
 # Define a function to do all the codegen and pipeline executor works.
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
 # and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
-from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-
-#########################################
-# Create subgraph pipeline configuration.
-# Associate the subgraph module with a target.
-# Using BYOC to set the codegen of the second subgraph module.
-# To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
-mod0, mod1 = subgraphs[0], subgraphs[1]
-mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-mod0 = relay.transform.MergeCompilerRegions()(mod0)
-mod0 = relay.transform.PartitionGraph()(mod0)
-#################################################
-# Get the pipeline executor configuration object.
-pipe_config = pipeline_executor_build.PipelineConfig()
-###########################################################################
-# Set the compile target of the second subgraph module for example as LLVM.
-pipe_config[mod0].target = "llvm"
-pipe_config[mod0].dev = tvm.cpu(0)
-###############################################################################
-# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
-pipe_config[mod1].cpu_affinity = "0"
-##############################################################
-# Set the compile target of the second subgraph module as LLVM.
-pipe_config[mod1].target = "llvm"
-pipe_config[mod1].dev = tvm.cpu(0)
-#################################################################################
-# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
-pipe_config[mod1].cpu_affinity = "1"
-pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
-pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-######################################
-# The pipeline configuration as below.
-"""
-print(pipe_config)
- Inputs
-  |data: mod0:data
-
- output
-  |output(0) : mod1.output(0)
-
- connections
-  |mod0.output(0)-> mod1.data_n_0
-"""
-
-# sphinx_gallery_start_ignore
-from tvm import testing
-
-testing.utils.install_request_hook(depth=3)
-# sphinx_gallery_end_ignore
-##############################
-# Build the pipeline executor.
-# ----------------------------
-with tvm.transform.PassContext(opt_level=3):
-    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-###############################################
-# Export the parameter configuration to a file.
-directory_path = tvm.contrib.utils.tempdir().temp_dir
-#############################################
-# If the directory does not exist, create it.
-if not os.path.exists(directory_path):
-    os.makedirs(directory_path)
-config_file_name = pipeline_mod_factory.export_library(directory_path)
-################################################################
-# Use the load function to create and initialize PipelineModule.
-# --------------------------------------------------------------
-pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
-
-############################
-# Run the pipeline executor.
-# --------------------------
-# Allocated a input data.
-data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-pipeline_module.set_input("data", tvm.nd.array(data))
-##########################################################################
-# Run the two subgraph in pipeline mode and get the output asynchronously.
-pipeline_module.run()
-outputs = []
-while not outputs:
-    outputs = pipeline_module.get_output()
-    time.sleep(0.001)
-######################################
-# Use graph_executor for verification.
-# ------------------------------------
-# Run these two subgraphs in sequence with graph_executor to get the output.
-target = "llvm"
-dev = tvm.device(target, 0)
-lib0 = relay.build_module.build(mod0, target, params=params)
-lib1 = relay.build_module.build(mod1, target, params=params)
-module0 = runtime.GraphModule(lib0["default"](dev))
-module1 = runtime.GraphModule(lib1["default"](dev))
-module0.set_input("data", data)
-module0.run()
-out_shape = (1, 16, 224, 224)
-out = module0.get_output(0, tvm.nd.empty(out_shape))
-module1.set_input("data_n_0", out)
-module1.run()
-out = module1.get_output(0, tvm.nd.empty(out_shape))
-####################
-# Verify the result.
-tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
+def run_pipeline():
+    from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+
+    #########################################
+    # Create subgraph pipeline configuration.
+    # Associate the subgraph module with a target.
+    # Using BYOC to set the codegen of the second subgraph module.
+    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    mod0, mod1 = subgraphs[0], subgraphs[1]
+    mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
+    mod0 = relay.transform.MergeCompilerRegions()(mod0)
+    mod0 = relay.transform.PartitionGraph()(mod0)
+    #################################################
+    # Get the pipeline executor configuration object.
+    pipe_config = pipeline_executor_build.PipelineConfig()
+    ###########################################################################
+    # Set the compile target of the second subgraph module for example as LLVM.
+    pipe_config[mod0].target = "llvm"
+    pipe_config[mod0].dev = tvm.cpu(0)
+    ###############################################################################
+    # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
+    pipe_config[mod1].cpu_affinity = "0"
+    ##############################################################
+    # Set the compile target of the second subgraph module as LLVM.
+    pipe_config[mod1].target = "llvm"
+    pipe_config[mod1].dev = tvm.cpu(0)
+    #################################################################################
+    # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
+    pipe_config[mod1].cpu_affinity = "1"
+    pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+    pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+    pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+    ######################################
+    # The pipeline configuration as below.
+    """
+    print(pipe_config)
+     Inputs
+      |data: mod0:data
+
+     output
+      |output(0) : mod1.output(0)
+
+     connections
+      |mod0.output(0)-> mod1.data_n_0
+    """
+
+    # sphinx_gallery_start_ignore
+    from tvm import testing
+
+    testing.utils.install_request_hook(depth=3)
+    # sphinx_gallery_end_ignore
+    ##############################
+    # Build the pipeline executor.
+    # ----------------------------
+    with tvm.transform.PassContext(opt_level=3):
+        pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
+    ###############################################
+    # Export the parameter configuration to a file.
+    directory_path = tvm.contrib.utils.tempdir().temp_dir
+    #############################################
+    # If the directory does not exist, create it.
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
+    config_file_name = pipeline_mod_factory.export_library(directory_path)
+    ################################################################
+    # Use the load function to create and initialize PipelineModule.
+    # --------------------------------------------------------------
+    pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
+
+    ############################
+    # Run the pipeline executor.
+    # --------------------------
+    # Allocated a input data.
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    pipeline_module.set_input("data", tvm.nd.array(data))
+    ##########################################################################
+    # Run the two subgraph in pipeline mode and get the output asynchronously.
+    pipeline_module.run()
+    outputs = []
+    while not outputs:
+        outputs = pipeline_module.get_output()
+        time.sleep(0.001)
+    ######################################
+    # Use graph_executor for verification.
+    # ------------------------------------
+    # Run these two subgraphs in sequence with graph_executor to get the output.
+    target = "llvm"
+    dev = tvm.device(target, 0)
+    lib0 = relay.build_module.build(mod0, target, params=params)
+    lib1 = relay.build_module.build(mod1, target, params=params)
+    module0 = runtime.GraphModule(lib0["default"](dev))
+    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0.set_input("data", data)
+    module0.run()
+    out_shape = (1, 16, 224, 224)
+    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    module1.set_input("data_n_0", out)
+    module1.run()
+    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    ####################
+    # Verify the result.
+    tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 9357fa6c8078..f45dd584c862 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,5 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
-echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-echo set\(USE_DNNL ON\) >> config.cmake
+#echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
+#echo set\(USE_DNNL ON\) >> config.cmake

From 77099740debd43a47fa3504f60776530a7b5e42d Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 5 Jul 2022 14:44:33 -0700
Subject: [PATCH 22/39] plint

---
 .../how_to/work_with_relay/using_with_pipeline_executor.py  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 4b5cba3702a5..7a6d68423b9e 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -102,6 +102,12 @@ def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
  }
 """
 
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------

From 745ec3bbb21cef7ff5e30f358a9546662ef93352 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 5 Jul 2022 21:01:41 -0700
Subject: [PATCH 23/39] enable DNNL and pipeline

---
 tests/scripts/task_config_build_gpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index f45dd584c862..9357fa6c8078 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,5 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
-#echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-#echo set\(USE_DNNL ON\) >> config.cmake
+echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
+echo set\(USE_DNNL ON\) >> config.cmake

From e14d431ecaed10e7a7dd2d3c6bb3aaafceccf6e8 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 5 Jul 2022 23:35:19 -0700
Subject: [PATCH 24/39] disable DNNL

---
 tests/scripts/task_config_build_gpu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 9357fa6c8078..86e1ad3f7a61 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -48,4 +48,4 @@ echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-echo set\(USE_DNNL ON\) >> config.cmake
+#echo set\(USE_DNNL ON\) >> config.cmake

From 6640dd6815e3c208ddab9bf22fda976e0f1a54e2 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 6 Jul 2022 13:15:38 -0700
Subject: [PATCH 25/39] enable DNNL without pipeline

---
 tests/scripts/task_config_build_gpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 86e1ad3f7a61..4dd25735e0b8 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,5 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
-echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-#echo set\(USE_DNNL ON\) >> config.cmake
+#echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
+echo set\(USE_DNNL ON\) >> config.cmake

From f5b61fd73222b6e3e1f80ea5b15d9eccc023a997 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Mon, 11 Jul 2022 18:36:39 -0700
Subject: [PATCH 26/39] remove dnnl and add cutlass

---
 tests/scripts/task_config_build_gpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 4dd25735e0b8..f79076e213cb 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -47,5 +47,5 @@ echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
-#echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
-echo set\(USE_DNNL ON\) >> config.cmake
+echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
+echo set\(USE_CUTLASS ON\) >> config.cmake

From 50a7eb907361783f3077edcfa70331c11dc18759 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Tue, 12 Jul 2022 00:21:00 -0700
Subject: [PATCH 27/39] use cutlass with byoc

---
 .../using_with_pipeline_executor.py           | 55 +++++++++++--------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index 7a6d68423b9e..f693b76126b6 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -25,11 +25,19 @@
 from tvm import te
 import numpy as np
 from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
 from tvm import relay
 from tvm.relay import testing
 import tvm.testing
 import time
-
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
 #######################################################################
 # Create a simple network, this network can be a pre-trained model too.
 # ---------------------------------------------------------------------
@@ -38,7 +46,10 @@
 def get_network():
     out_channels = 16
     batch_size = 1
-    data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float32"))
+    dense_weight = relay.var(
+        "data", relay.TensorType((batch_size, 16 * img_size * img_size), "float32")
+    )
     weight = relay.var("weight")
     second_weight = relay.var("second_weight")
     bn_gamma = relay.var("bn_gamma")
@@ -50,15 +61,10 @@ def get_network():
     )
     simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
     simple_net = relay.nn.relu(simple_net)
-    simple_net = relay.nn.conv2d(
-        data=simple_net,
-        weight=second_weight,
-        kernel_size=(3, 3),
-        channels=out_channels,
-        padding=(1, 1),
-    )
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
     simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
-    data_shape = (batch_size, 3, 224, 224)
+    data_shape = (batch_size, 3, img_size, img_size)
     net, params = testing.create_workload(simple_net)
     return net, params, data_shape
 
@@ -86,19 +92,19 @@ def get_network():
 """
 #subgraphs[0])
 
- def @main(%data: Tensor[(1, 3, 224, 224), float32]) {
-  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
-  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float32]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, img_size, img_size), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
   %2 = %1.0;
-  nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
  }
 
 peline-tutorial
 
 #subgraphs[1]
 
- def @main(%data_n_0: Tensor[(1, 16, 224, 224), float32]) {
-  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */
+ def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
+  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
  }
 """
 
@@ -123,9 +129,11 @@ def run_pipeline():
     # Using BYOC to set the codegen of the second subgraph module.
     # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
     mod0, mod1 = subgraphs[0], subgraphs[1]
-    mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    mod0 = relay.transform.PartitionGraph()(mod0)
+    # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
+    # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
+    # mod0 = relay.transform.MergeCompilerRegions()(mod0)
+    # mod0 = relay.transform.PartitionGraph()(mod0)
+    mod1 = partition_for_cutlass(mod1)
     #################################################
     # Get the pipeline executor configuration object.
     pipe_config = pipeline_executor_build.PipelineConfig()
@@ -138,8 +146,8 @@ def run_pipeline():
     pipe_config[mod1].cpu_affinity = "0"
     ##############################################################
     # Set the compile target of the second subgraph module as LLVM.
-    pipe_config[mod1].target = "llvm"
-    pipe_config[mod1].dev = tvm.cpu(0)
+    pipe_config[mod1].target = "cuda"
+    pipe_config[mod1].dev = tvm.device("cuda", 0)
     #################################################################################
     # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
     pipe_config[mod1].cpu_affinity = "1"
@@ -208,7 +216,7 @@ def run_pipeline():
     module1 = runtime.GraphModule(lib1["default"](dev))
     module0.set_input("data", data)
     module0.run()
-    out_shape = (1, 16, 224, 224)
+    out_shape = (1, 16, img_size, img_size)
     out = module0.get_output(0, tvm.nd.empty(out_shape))
     module1.set_input("data_n_0", out)
     module1.run()
@@ -216,3 +224,6 @@ def run_pipeline():
     ####################
     # Verify the result.
     tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
+
+
+run_pipeline()

From 0b30034bac7334bdeb361b69f84fdcc002449019 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sun, 17 Jul 2022 00:06:56 -0700
Subject: [PATCH 28/39] change into cutlass

---
 .../using_with_pipeline_executor.py           | 96 ++++++++++++++-----
 python/tvm/contrib/pipeline_executor.py       |  9 +-
 python/tvm/contrib/pipeline_executor_build.py | 14 ++-
 3 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index f693b76126b6..f58edfc43021 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -46,9 +46,9 @@
 def get_network():
     out_channels = 16
     batch_size = 1
-    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float32"))
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16"))
     dense_weight = relay.var(
-        "data", relay.TensorType((batch_size, 16 * img_size * img_size), "float32")
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16")
     )
     weight = relay.var("weight")
     second_weight = relay.var("second_weight")
@@ -92,20 +92,22 @@ def get_network():
 """
 #subgraphs[0])
 
- def @main(%data: Tensor[(1, 3, img_size, img_size), float32]) {
-  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */;
-  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, img_size, img_size), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */;
   %2 = %1.0;
-  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
  }
 
 peline-tutorial
 
 #subgraphs[1]
 
- def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
-  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
  }
+
 """
 
 # sphinx_gallery_start_ignore
@@ -113,13 +115,40 @@ def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
 
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+#########################################
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": 80,
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, mod_name=mod_name
+    )
+    return lib
+
 
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
 # Define a function to do all the codegen and pipeline executor works.
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
-# and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+# and 'USE_CUTLASS' should be set to ON in config.cmake.
 def run_pipeline():
     from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 
@@ -127,12 +156,9 @@ def run_pipeline():
     # Create subgraph pipeline configuration.
     # Associate the subgraph module with a target.
     # Using BYOC to set the codegen of the second subgraph module.
-    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    # To use cutlass the 'USE_CUTLASS' should set as ON.
     mod0, mod1 = subgraphs[0], subgraphs[1]
-    # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
-    # mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    # mod0 = relay.transform.PartitionGraph()(mod0)
+    # Apply cutlass as the codegen.
     mod1 = partition_for_cutlass(mod1)
     #################################################
     # Get the pipeline executor configuration object.
@@ -144,10 +170,13 @@ def run_pipeline():
     ###############################################################################
     # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
     pipe_config[mod1].cpu_affinity = "0"
+    pipe_config[mod1].export_cc = None
     ##############################################################
     # Set the compile target of the second subgraph module as LLVM.
-    pipe_config[mod1].target = "cuda"
+    pipe_config[mod1].target = "cuda"  # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
     pipe_config[mod1].dev = tvm.device("cuda", 0)
+    pipe_config[mod1].build_func = cutlass_build
+    pipe_config[mod1].export_cc = "nvcc"
     #################################################################################
     # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
     pipe_config[mod1].cpu_affinity = "1"
@@ -171,7 +200,7 @@ def run_pipeline():
     # sphinx_gallery_start_ignore
     from tvm import testing
 
-    testing.utils.install_request_hook(depth=3)
+    # testing.utils.install_request_hook(depth=3)
     # sphinx_gallery_end_ignore
     ##############################
     # Build the pipeline executor.
@@ -195,7 +224,7 @@ def run_pipeline():
     # Run the pipeline executor.
     # --------------------------
     # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
     pipeline_module.set_input("data", tvm.nd.array(data))
     ##########################################################################
     # Run the two subgraph in pipeline mode and get the output asynchronously.
@@ -209,18 +238,39 @@ def run_pipeline():
     # ------------------------------------
     # Run these two subgraphs in sequence with graph_executor to get the output.
     target = "llvm"
-    dev = tvm.device(target, 0)
+    dev0 = tvm.device(target, 0)
     lib0 = relay.build_module.build(mod0, target, params=params)
-    lib1 = relay.build_module.build(mod1, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev))
-    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0 = runtime.GraphModule(lib0["default"](dev0))
+    cutlass = tvm.target.Target(
+        {
+            "kind": "cutlass",
+            "sm": 75,
+            "use_3xtf32": True,
+            "split_k_slices": [1],
+            "profile_all_alignments": False,
+            "find_first_valid": True,
+            "use_multiprocessing": True,
+            "use_fast_math": False,
+            "tmp_dir": "./tmp",
+        },
+        host=tvm.target.Target("llvm"),
+    )
+    cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+    lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
+    lib1 = finalize_modules(lib1, "compile.so", "./tmp")
+
+    dev1 = tvm.device("cuda", 0)
+
+    module1 = runtime.GraphModule(lib1["default"](dev1))
+
     module0.set_input("data", data)
     module0.run()
     out_shape = (1, 16, img_size, img_size)
-    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
     module1.set_input("data_n_0", out)
     module1.run()
-    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    out_shape = (1, 1)
+    out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
     ####################
     # Verify the result.
     tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index 5ef309bb2808..f1c4e98a51d7 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -302,11 +302,16 @@ def export_library(self, directory_path):
                 self.pipeline_mods[lib_index]["dev"].device_type,
                 self.pipeline_mods[lib_index]["dev"].device_id,
             )
-
             # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
             lib = self.pipeline_mods[lib_index]["lib"]
             # Export the lib, graph, and parameters to disk.
-            lib.export_library(mconfig["lib_name"])
+            if self.pipeline_mods[lib_index]["export_cc"]:
+                lib.export_library(
+                    mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"]
+                )
+            else:
+                lib.export_library(mconfig["lib_name"])
+
             with open(mconfig["json_name"], "w") as file_handle:
                 file_handle.write(lib.graph_json)
             with open(mconfig["params_name"], "wb") as file_handle:
diff --git a/python/tvm/contrib/pipeline_executor_build.py b/python/tvm/contrib/pipeline_executor_build.py
index 520156b47406..324383ab7ce3 100644
--- a/python/tvm/contrib/pipeline_executor_build.py
+++ b/python/tvm/contrib/pipeline_executor_build.py
@@ -86,7 +86,12 @@ def build(pipe_configs):
         # Use "mod_idx" as the key to create a "module_connection" map which is not only
         # for the module index but also for the module connection used to build the pipeline.
         module_string_config[mod_idx] = pipe_config
-        libs[mod_idx] = {"lib": lib, "dev": dev, "fcompile": mod_config["fcompile"]}
+        libs[mod_idx] = {
+            "lib": lib,
+            "dev": dev,
+            "fcompile": mod_config["fcompile"],
+            "export_cc": mod_config["export_cc"],
+        }
 
     # Creating a text form configuration to record the "input_connection" and the
     # "module_connection" information. The "input_connection" is used to record the
@@ -132,10 +137,7 @@ def export_library(factory, directory_path):
         mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index)
         mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index)
         lib_config = factory.pipeline_mods[lib_index]
-        mconfig["dev"] = "{},{}".format(
-            lib_config["dev"].device_type,
-            lib_config["dev"].device_id,
-        )
+        mconfig["dev"] = "{},{}".format(lib_config["dev"].device_type, lib_config["dev"].device_id)
         fcompile = lib_config["fcompile"]
         if not fcompile:
             fcompile = False
@@ -413,6 +415,7 @@ def __init__(self, mod=None):
             self.fcompile = None
             self.name = None
             self.dev = None
+            self.export_cc = None
             self.cpu_affinity = ""
             self.idx = None
             self.mod = mod
@@ -601,6 +604,7 @@ def get_config(self):
                 "target": module.target,
                 "fcompile": module.fcompile,
                 "dev": module.dev,
+                "export_cc": module.export_cc,
             }
 
         # Creating a map including pipeline inputs and subgraph inputs.

From 873e0273a40e67cb6e9e15d9fc7342f64938eb8e Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sun, 17 Jul 2022 15:03:50 -0700
Subject: [PATCH 29/39] fix doc convention issue

---
 .../using_with_pipeline_executor.py           | 240 +++++++++---------
 1 file changed, 118 insertions(+), 122 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index f58edfc43021..b02293864407 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -117,12 +117,12 @@ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8),
 # sphinx_gallery_end_ignore
 #########################################
 # Build the subgraph with cutlass target.
-# ---------------------------------------
-#########################################
+#----------------------------------------
+
 cutlass = tvm.target.Target(
     {
         "kind": "cutlass",
-        "sm": 80,
+        "sm": int(tvm.target.Target("cuda").arch.split("_")[1]),
         "use_3xtf32": True,
         "split_k_slices": [1],
         "profile_all_alignments": False,
@@ -149,131 +149,127 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 # Define a function to do all the codegen and pipeline executor works.
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
 # and the 'USE_CUTLASS' should set as ON in config.cmake.
-def run_pipeline():
-    from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
-
-    #########################################
-    # Create subgraph pipeline configuration.
-    # Associate the subgraph module with a target.
-    # Using BYOC to set the codegen of the second subgraph module.
-    # To use cutlass the 'USE_CUTLASS' should set as ON.
-    mod0, mod1 = subgraphs[0], subgraphs[1]
-    # Apply cutlass as the codegen.
-    mod1 = partition_for_cutlass(mod1)
-    #################################################
-    # Get the pipeline executor configuration object.
-    pipe_config = pipeline_executor_build.PipelineConfig()
-    ###########################################################################
-    # Set the compile target of the second subgraph module for example as LLVM.
-    pipe_config[mod0].target = "llvm"
-    pipe_config[mod0].dev = tvm.cpu(0)
-    ###############################################################################
-    # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
-    pipe_config[mod1].cpu_affinity = "0"
-    pipe_config[mod1].export_cc = None
-    ##############################################################
-    # Set the compile target of the second subgraph module as LLVM.
-    pipe_config[mod1].target = "cuda"  # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
-    pipe_config[mod1].dev = tvm.device("cuda", 0)
-    pipe_config[mod1].build_func = cutlass_build
-    pipe_config[mod1].export_cc = "nvcc"
-    #################################################################################
-    # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
-    pipe_config[mod1].cpu_affinity = "1"
-    pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
-    pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-    pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
-    ######################################
-    # The pipeline configuration as below.
-    """
-    print(pipe_config)
-     Inputs
-      |data: mod0:data
+from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 
-     output
-      |output(0) : mod1.output(0)
-
-     connections
-      |mod0.output(0)-> mod1.data_n_0
-    """
+#########################################
+# Create subgraph pipeline configuration.
+# Associate the subgraph module with a target.
+# Using BYOC to set the codegen of the second subgraph module.
+# To use cutlass the 'USE_CUTLASS' should set as ON.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+# Use cutlass as the codegen.
+mod1 = partition_for_cutlass(mod1)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the second subgraph module for example as LLVM.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
+pipe_config[mod1].cpu_affinity = "0"
+pipe_config[mod1].export_cc = None
+##############################################################
+# Set the compile target of the second subgraph module as LLVM.
+pipe_config[mod1].target = "cuda"  # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+pipe_config[mod1].dev = tvm.device("cuda", 0)
+pipe_config[mod1].build_func = cutlass_build
+pipe_config[mod1].export_cc = "nvcc"
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
+pipe_config[mod1].cpu_affinity = "1"
+pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+######################################
+# The pipeline configuration as below.
+"""
+print(pipe_config)
+ Inputs
+  |data: mod0:data
 
-    # sphinx_gallery_start_ignore
-    from tvm import testing
+ output
+  |output(0) : mod1.output(0)
 
-    # testing.utils.install_request_hook(depth=3)
-    # sphinx_gallery_end_ignore
-    ##############################
-    # Build the pipeline executor.
-    # ----------------------------
-    with tvm.transform.PassContext(opt_level=3):
-        pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
-    ###############################################
-    # Export the parameter configuration to a file.
-    directory_path = tvm.contrib.utils.tempdir().temp_dir
-    #############################################
-    # If the directory does not exist, create it.
-    if not os.path.exists(directory_path):
-        os.makedirs(directory_path)
-    config_file_name = pipeline_mod_factory.export_library(directory_path)
-    ################################################################
-    # Use the load function to create and initialize PipelineModule.
-    # --------------------------------------------------------------
-    pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
+ connections
+  |mod0.output(0)-> mod1.data_n_0
+"""
 
-    ############################
-    # Run the pipeline executor.
-    # --------------------------
-    # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
-    pipeline_module.set_input("data", tvm.nd.array(data))
-    ##########################################################################
-    # Run the two subgraph in pipeline mode and get the output asynchronously.
-    pipeline_module.run()
-    outputs = []
-    while not outputs:
-        outputs = pipeline_module.get_output()
-        time.sleep(0.001)
-    ######################################
-    # Use graph_executor for verification.
-    # ------------------------------------
-    # Run these two subgraphs in sequence with graph_executor to get the output.
-    target = "llvm"
-    dev0 = tvm.device(target, 0)
-    lib0 = relay.build_module.build(mod0, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev0))
-    cutlass = tvm.target.Target(
-        {
-            "kind": "cutlass",
-            "sm": 75,
-            "use_3xtf32": True,
-            "split_k_slices": [1],
-            "profile_all_alignments": False,
-            "find_first_valid": True,
-            "use_multiprocessing": True,
-            "use_fast_math": False,
-            "tmp_dir": "./tmp",
-        },
-        host=tvm.target.Target("llvm"),
-    )
-    cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
-    lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
-    lib1 = finalize_modules(lib1, "compile.so", "./tmp")
+# sphinx_gallery_start_ignore
+from tvm import testing
 
-    dev1 = tvm.device("cuda", 0)
+# testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+##############################
+# Build the pipeline executor.
+# ----------------------------
+with tvm.transform.PassContext(opt_level=3):
+    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
+###############################################
+# Export the parameter configuration to a file.
+directory_path = tvm.contrib.utils.tempdir().temp_dir
+#############################################
+# If the directory does not exist, create it.
+if not os.path.exists(directory_path):
+    os.makedirs(directory_path)
+config_file_name = pipeline_mod_factory.export_library(directory_path)
+################################################################
+# Use the load function to create and initialize PipelineModule.
+# --------------------------------------------------------------
+pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
 
-    module1 = runtime.GraphModule(lib1["default"](dev1))
+############################
+# Run the pipeline executor.
+# --------------------------
+# Allocated a input data.
+data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
+pipeline_module.set_input("data", tvm.nd.array(data))
+##########################################################################
+# Run the two subgraph in pipeline mode and get the output asynchronously.
+pipeline_module.run()
+outputs = []
+while not outputs:
+    outputs = pipeline_module.get_output()
+    time.sleep(0.001)
+######################################
+# Use graph_executor for verification.
+# ------------------------------------
+# Run these two subgraphs in sequence with graph_executor to get the output.
+target = "llvm"
+dev0 = tvm.device(target, 0)
+lib0 = relay.build_module.build(mod0, target, params=params)
+module0 = runtime.GraphModule(lib0["default"](dev0))
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": 75,
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
+lib1 = finalize_modules(lib1, "compile.so", "./tmp")
 
-    module0.set_input("data", data)
-    module0.run()
-    out_shape = (1, 16, img_size, img_size)
-    out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
-    module1.set_input("data_n_0", out)
-    module1.run()
-    out_shape = (1, 1)
-    out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
-    ####################
-    # Verify the result.
-    tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
+dev1 = tvm.device("cuda", 0)
 
+module1 = runtime.GraphModule(lib1["default"](dev1))
 
-run_pipeline()
+module0.set_input("data", data)
+module0.run()
+out_shape = (1, 16, img_size, img_size)
+out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
+module1.set_input("data_n_0", out)
+module1.run()
+out_shape = (1, 1)
+out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
+####################
+# Verify the result.
+tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())

From 73656af1cfb73b73f066b3861685e76162dc48fe Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sun, 17 Jul 2022 15:21:04 -0700
Subject: [PATCH 30/39] remove duplicate variable

---
 .../using_with_pipeline_executor.py                | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index b02293864407..c03cd88ef182 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -240,20 +240,6 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 dev0 = tvm.device(target, 0)
 lib0 = relay.build_module.build(mod0, target, params=params)
 module0 = runtime.GraphModule(lib0["default"](dev0))
-cutlass = tvm.target.Target(
-    {
-        "kind": "cutlass",
-        "sm": 75,
-        "use_3xtf32": True,
-        "split_k_slices": [1],
-        "profile_all_alignments": False,
-        "find_first_valid": True,
-        "use_multiprocessing": True,
-        "use_fast_math": False,
-        "tmp_dir": "./tmp",
-    },
-    host=tvm.target.Target("llvm"),
-)
 cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
 lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
 lib1 = finalize_modules(lib1, "compile.so", "./tmp")

From e4d83606fa8c3d2931b0c81d599ecb662446cb0e Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Sun, 17 Jul 2022 16:34:26 -0700
Subject: [PATCH 31/39] fix pylint issue.

---
 .../how_to/work_with_relay/using_with_pipeline_executor.py   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index c03cd88ef182..849677cca562 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -115,9 +115,10 @@ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8),
 
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
+
 #########################################
 # Build the subgraph with cutlass target.
-#----------------------------------------
+# ---------------------------------------
 
 cutlass = tvm.target.Target(
     {
@@ -172,7 +173,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 pipe_config[mod1].export_cc = None
 ##############################################################
 # Set the compile target of the second subgraph module as LLVM.
-pipe_config[mod1].target = "cuda"  # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+pipe_config[mod1].target = "cuda"
 pipe_config[mod1].dev = tvm.device("cuda", 0)
 pipe_config[mod1].build_func = cutlass_build
 pipe_config[mod1].export_cc = "nvcc"

From cfd2af2b73ae3d647bafade80a0aaedc3c3b13db Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 20 Jul 2022 19:43:23 -0700
Subject: [PATCH 32/39] address review comments.

---
 ...executor.py => using_pipeline_executor.py} | 26 +++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)
 rename gallery/how_to/work_with_relay/{using_with_pipeline_executor.py => using_pipeline_executor.py} (92%)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
similarity index 92%
rename from gallery/how_to/work_with_relay/using_with_pipeline_executor.py
rename to gallery/how_to/work_with_relay/using_pipeline_executor.py
index 849677cca562..9161a66a90a7 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -42,7 +42,7 @@
 # Create a simple network, this network can be a pre-trained model too.
 # ---------------------------------------------------------------------
 # Let's create a very simple network for demonstration.
-# It consists of convolution, batch normalization, and ReLU activation.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
 def get_network():
     out_channels = 16
     batch_size = 1
@@ -73,8 +73,8 @@ def get_network():
 ###########################################
 # Splitting the network into two subgraphs.
 # -----------------------------------------
-# We use an testing linear graph splitting function as a example. User also can create their
-# own splitting function logic.
+# The graph splitting function comming from a uinit test is an example. User can create  a
+# customized function for graph splitting.
 import inspect
 import os
 
@@ -99,8 +99,6 @@ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
   nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
  }
 
-peline-tutorial
-
 #subgraphs[1]
 
  def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
@@ -147,16 +145,13 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
-# Define a function to do all the codegen and pipeline executor works.
-# To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
-# and the 'USE_CUTLASS' should set as ON in config.cmake.
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON.
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 
 #########################################
 # Create subgraph pipeline configuration.
-# Associate the subgraph module with a target.
-# Using BYOC to set the codegen of the second subgraph module.
-# To use cutlass the 'USE_CUTLASS' should set as ON.
+# Associate a subgraph module with a target.
+# Use CUTLASS BYOC to build the second subgraph module.
 mod0, mod1 = subgraphs[0], subgraphs[1]
 # Use cutlass as the codegen.
 mod1 = partition_for_cutlass(mod1)
@@ -164,7 +159,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 # Get the pipeline executor configuration object.
 pipe_config = pipeline_executor_build.PipelineConfig()
 ###########################################################################
-# Set the compile target of the second subgraph module for example as LLVM.
+# Set the compile target of the subgraph module.
 pipe_config[mod0].target = "llvm"
 pipe_config[mod0].dev = tvm.cpu(0)
 ###############################################################################
@@ -172,7 +167,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 pipe_config[mod1].cpu_affinity = "0"
 pipe_config[mod1].export_cc = None
 ##############################################################
-# Set the compile target of the second subgraph module as LLVM.
+# Set the compile target of the second subgraph module as cuda.
 pipe_config[mod1].target = "cuda"
 pipe_config[mod1].dev = tvm.device("cuda", 0)
 pipe_config[mod1].build_func = cutlass_build
@@ -212,8 +207,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 directory_path = tvm.contrib.utils.tempdir().temp_dir
 #############################################
 # If the directory does not exist, create it.
-if not os.path.exists(directory_path):
-    os.makedirs(directory_path)
+os.makedirs(directory_path, exist_ok=True)
 config_file_name = pipeline_mod_factory.export_library(directory_path)
 ################################################################
 # Use the load function to create and initialize PipelineModule.
@@ -223,7 +217,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 ############################
 # Run the pipeline executor.
 # --------------------------
-# Allocated a input data.
+# Allocate input data.
 data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
 pipeline_module.set_input("data", tvm.nd.array(data))
 ##########################################################################

From a1fc852e281749c6997a38efb1bf9a61117bb774 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 20 Jul 2022 23:07:36 -0700
Subject: [PATCH 33/39] address review comments

---
 .../work_with_relay/using_pipeline_executor.py   | 12 ++++--------
 python/tvm/contrib/pipeline_executor.py          | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index 9161a66a90a7..98c39f699386 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -145,7 +145,7 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
-# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON.
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 
 #########################################
@@ -205,8 +205,6 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 ###############################################
 # Export the parameter configuration to a file.
 directory_path = tvm.contrib.utils.tempdir().temp_dir
-#############################################
-# If the directory does not exist, create it.
 os.makedirs(directory_path, exist_ok=True)
 config_file_name = pipeline_mod_factory.export_library(directory_path)
 ################################################################
@@ -221,12 +219,10 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
 pipeline_module.set_input("data", tvm.nd.array(data))
 ##########################################################################
-# Run the two subgraph in pipeline mode and get the output asynchronously.
+# Run the two subgraph in the pipeline mode to get the output asynchronously
+# or synchronously. In the following example, it is synchronous.
 pipeline_module.run()
-outputs = []
-while not outputs:
-    outputs = pipeline_module.get_output()
-    time.sleep(0.001)
+outputs = pipeline_module.get_output()
 ######################################
 # Use graph_executor for verification.
 # ------------------------------------
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index f1c4e98a51d7..f4ff3b8b6eb2 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -131,14 +131,26 @@ def get_input(self, key):
         """
         return self._get_input(key)
 
-    def get_output(self):
+    def get_output(self, synchronize=True, sleep_interval=0.001):
         """Get the output.
         Returns
         -------
         data : Array[NDArray]
             A list of output data.
+        synchronize : BOOL
+            Whether to do a synchronize poll.
+        sleep_interval : Float32
+            When doing the synchronize loop poll, how many seconds the loop should sleep for yield.
         """
-        return self._get_output()
+        outputs = []
+        if not synchronize:
+            outputs = self._get_output()
+        else:
+            while not outputs:
+                outputs = pipeline_module.get_output()
+                time.sleep(sleep_interval)
+
+        return outputs
 
     @property
     def num_executing_pipeline(self):

From 60c89533e39d74afb6996f3d10f7bde717e50bde Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 20 Jul 2022 23:14:54 -0700
Subject: [PATCH 34/39] fix bug.

---
 gallery/how_to/work_with_relay/using_pipeline_executor.py | 1 -
 python/tvm/contrib/pipeline_executor.py                   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index 98c39f699386..c848cbc85d92 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -29,7 +29,6 @@
 from tvm import relay
 from tvm.relay import testing
 import tvm.testing
-import time
 from tvm.contrib.cutlass import (
     has_cutlass,
     num_cutlass_partitions,
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index f4ff3b8b6eb2..2543ff66db89 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -20,6 +20,7 @@
 from tvm import runtime
 from tvm._ffi import get_global_func
 from tvm.contrib import graph_executor
+import time
 
 
 def pipeline_executor_enabled():
@@ -147,7 +148,7 @@ def get_output(self, synchronize=True, sleep_interval=0.001):
             outputs = self._get_output()
         else:
             while not outputs:
-                outputs = pipeline_module.get_output()
+                outputs = self._get_output()
                 time.sleep(sleep_interval)
 
         return outputs

From 420e95180d4dfb4aa37308300805e75f4fcf089a Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 20 Jul 2022 23:29:13 -0700
Subject: [PATCH 35/39] polish the document

---
 gallery/how_to/work_with_relay/using_pipeline_executor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index c848cbc85d92..2c836eacdf42 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -72,8 +72,8 @@ def get_network():
 ###########################################
 # Splitting the network into two subgraphs.
 # -----------------------------------------
-# The graph splitting function comming from a uinit test is an example. User can create  a
-# customized function for graph splitting.
+# It is an example that the graph splitting function comes from a unit test. User can create  a
+# customized function to split the graph.
 import inspect
 import os
 

From b998f12e167f3fa607d61365642543007ce3d964 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Wed, 20 Jul 2022 23:49:21 -0700
Subject: [PATCH 36/39] fix pylint issue

---
 python/tvm/contrib/pipeline_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index 2543ff66db89..b61463073749 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -17,10 +17,10 @@
 """Pipeline executor that executes a series of modules in a pipeline fashion."""
 import json
 import os
+import time
 from tvm import runtime
 from tvm._ffi import get_global_func
 from tvm.contrib import graph_executor
-import time
 
 
 def pipeline_executor_enabled():

From 1a930aff1b3779425e5525481a6c53e101b33826 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Thu, 21 Jul 2022 07:47:09 -0700
Subject: [PATCH 37/39] address review comments.

---
 gallery/how_to/work_with_relay/using_pipeline_executor.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index 2c836eacdf42..bde6e4cb706e 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -72,8 +72,8 @@ def get_network():
 ###########################################
 # Splitting the network into two subgraphs.
 # -----------------------------------------
-# It is an example that the graph splitting function comes from a unit test. User can create  a
-# customized function to split the graph.
+# The 'graph_split' function from a unit test is just an example. Users can create customized logic
+# to split the graph.
 import inspect
 import os
 
@@ -162,9 +162,8 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 pipe_config[mod0].target = "llvm"
 pipe_config[mod0].dev = tvm.cpu(0)
 ###############################################################################
-# Set the cpu afinity for control flow, for example using cpu 0 for control flow.
+# Set the cpu affinity for control flow, for example using cpu 0 for control flow.
 pipe_config[mod1].cpu_affinity = "0"
-pipe_config[mod1].export_cc = None
 ##############################################################
 # Set the compile target of the second subgraph module as cuda.
 pipe_config[mod1].target = "cuda"

From 7449ff7ed82b301a9cc3cacbc92e844f228c541d Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Fri, 22 Jul 2022 00:38:41 -0700
Subject: [PATCH 38/39] address review comments

---
 .../work_with_relay/using_pipeline_executor.py   | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index bde6e4cb706e..490ff276eff3 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -77,8 +77,8 @@ def get_network():
 import inspect
 import os
 
-test_path = os.path.dirname(inspect.getfile(lambda: None))
-os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
+tutorial_dir = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(tutorial_dir, "../../../tests/python/relay"))
 from test_pipeline_executor import graph_split
 
 ###########################################
@@ -161,21 +161,19 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 # Set the compile target of the subgraph module.
 pipe_config[mod0].target = "llvm"
 pipe_config[mod0].dev = tvm.cpu(0)
-###############################################################################
-# Set the cpu affinity for control flow, for example using cpu 0 for control flow.
-pipe_config[mod1].cpu_affinity = "0"
 ##############################################################
 # Set the compile target of the second subgraph module as cuda.
 pipe_config[mod1].target = "cuda"
 pipe_config[mod1].dev = tvm.device("cuda", 0)
 pipe_config[mod1].build_func = cutlass_build
 pipe_config[mod1].export_cc = "nvcc"
-#################################################################################
-# Set the cpu afinity for control flow, for example using cpu 1 for control flow.
-pipe_config[mod1].cpu_affinity = "1"
+# Create the pipeline by connecting the subgraphs module.
+# The global input will be forwarded to the input interface of the first moudle named mod0
 pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+# The first output of mod0 will be forwarded to the input interface of mod1
 pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])
+# the first output of mod1 will be the first global output.
+pipe_config[mod1]["output"][0].connect(pipe_config["output"][0])
 ######################################
 # The pipeline configuration as below.
 """

From 0dcc5bffb25f8eb2fa064637c4f9db893c373550 Mon Sep 17 00:00:00 2001
From: huajsj <huaj@xilinx.com>
Date: Fri, 22 Jul 2022 08:59:00 -0700
Subject: [PATCH 39/39] address review comments

---
 gallery/how_to/work_with_relay/using_pipeline_executor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
index 490ff276eff3..5496058265ba 100755
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py
@@ -167,12 +167,12 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"
 pipe_config[mod1].dev = tvm.device("cuda", 0)
 pipe_config[mod1].build_func = cutlass_build
 pipe_config[mod1].export_cc = "nvcc"
-# Create the pipeline by connecting the subgraphs module.
-# The global input will be forwarded to the input interface of the first moudle named mod0
+# Create the pipeline by connecting the subgraph modules.
+# The global input will be forwarded to the input interface of the first module named mod0
 pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
 # The first output of mod0 will be forwarded to the input interface of mod1
 pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
-# the first output of mod1 will be the first global output.
+# The first output of mod1 will be the first global output.
 pipe_config[mod1]["output"][0].connect(pipe_config["output"][0])
 ######################################
 # The pipeline configuration as below.