From 9da9bd209331a07b61d47bfc65d26221e6ed9621 Mon Sep 17 00:00:00 2001 From: Siva Rama Krishna Reddy B Date: Tue, 23 May 2023 10:28:48 +0530 Subject: [PATCH] Refactor and introduce on-chip memory and memory planner Introduced thread context with CLMLWorkspace. Organized the code as runtime, utils and memory planners. Introduced recording queue support and on-chip memory support. On-chip memory allocation planner to accommodate multiple tensors at a time. DDR memory planner introduced to reuse the underlying memory across multiple tensor descriptors. Dense layer support refactored to use GEMM. CLML binary operators don't support broadcasting, hence an explicit broadcast op is introduced as a workaround. CLML SDK codegen is enhanced accordingly. --- apps/cpp_clml/clml_runner.cc | 36 +- apps/cpp_clml/clml_runner.h | 2 +- apps/cpp_clml/scripts/clml_codegen.py | 2 +- python/tvm/relay/op/contrib/clml.py | 161 ++- .../contrib/clml/clml_memory_planner.cc | 266 +++++ .../contrib/clml/clml_memory_planner.h | 45 + src/runtime/contrib/clml/clml_runtime.cc | 1000 +++++++++-------- src/runtime/contrib/clml/clml_runtime.h | 173 +++ src/runtime/contrib/clml/clml_utils.cc | 258 +++++ src/runtime/contrib/clml/clml_utils.h | 74 ++ .../contrib/test_clml/infrastructure.py | 19 + tests/python/contrib/test_clml/test_ops.py | 110 +- 12 files changed, 1597 insertions(+), 549 deletions(-) create mode 100644 src/runtime/contrib/clml/clml_memory_planner.cc create mode 100644 src/runtime/contrib/clml/clml_memory_planner.h create mode 100644 src/runtime/contrib/clml/clml_runtime.h create mode 100644 src/runtime/contrib/clml/clml_utils.cc create mode 100644 src/runtime/contrib/clml/clml_utils.h diff --git a/apps/cpp_clml/clml_runner.cc b/apps/cpp_clml/clml_runner.cc index d733922da4996..0a5508635e0a5 100644 --- a/apps/cpp_clml/clml_runner.cc +++ b/apps/cpp_clml/clml_runner.cc @@ -50,8 +50,8 @@ CLMLRunner::CLMLRunner(std::string name, ToolArgs& args, cl_platform_id arg_plat context(arg_context), device_id(arg_device_id), queue(arg_queue) { - LOG(INFO) << "CLMLRunner Constructor: Input:" << r_args.input << " Output:" << r_args.output - << " Params:" << r_args.params; + LOG(INFO) << "CLMLRunner Constructor:" << name << " Input:" << r_args.input + << " Output:" << r_args.output << " Params:" << r_args.params; cl_int result; // Query and Get CLML Interface @@ -648,25 +648,29 @@ void CLMLRunner::MakeConcatenate( void CLMLRunner::MakeDense(std::shared_ptr input_desc, std::shared_ptr weight_desc, std::shared_ptr output_desc, - std::shared_ptr bias_desc, + std::vector in_shape, std::vector wt_shape, std::string dtype) { cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype)); cl_ml_op_qcom op = nullptr; cl_int result; + cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM; - cl_ml_op_convolution_desc_qcom conv_desc = {CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, - 1, - 4, - {0, 0}, - {0, 0}, - {1, 1}, - {1, 1}, - 0, - cl_arithmetic_mode}; - - result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - this->context, 0, &conv_desc, input_desc->tensor, weight_desc->tensor, bias_desc->tensor, - output_desc->tensor, &op, tuning_cache); + if (in_shape[1] == wt_shape[1]) { + b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM; + } + + cl_ml_op_gemm_desc_qcom gemmDesc = {in_shape[0], // m + wt_shape[0], // n + wt_shape[1], // k + CL_GEMM_TRANSFORM_NONE_QCOM, // A transform + b_transform, // B transform + {{1.0}, CL_FLOAT}, // alpha + {{0.0}, CL_FLOAT}, // beta + cl_arithmetic_mode}; + + result =
h_ClmlIntf->clCreateMLOpGemmQCOM(this->context, 0, &gemmDesc, input_desc->tensor, + weight_desc->tensor, output_desc->tensor, &op, tuning_cache); CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS); this->function.push_back(op); diff --git a/apps/cpp_clml/clml_runner.h b/apps/cpp_clml/clml_runner.h index 4e73674d72ae2..a1e78fcb66bef 100644 --- a/apps/cpp_clml/clml_runner.h +++ b/apps/cpp_clml/clml_runner.h @@ -178,7 +178,7 @@ class CLMLRunner { void MakeDense(std::shared_ptr input_desc, std::shared_ptr weight_desc, std::shared_ptr output_desc, - std::shared_ptr bias_desc, std::string dtype); + std::vector in_shape, std::vector wt_shape, std::string dtype); /*! \brief SoftMax layer implementattion */ void MakeSoftMax(std::shared_ptr input_desc, diff --git a/apps/cpp_clml/scripts/clml_codegen.py b/apps/cpp_clml/scripts/clml_codegen.py index 32e5782db3852..bf19c0e4b9b60 100644 --- a/apps/cpp_clml/scripts/clml_codegen.py +++ b/apps/cpp_clml/scripts/clml_codegen.py @@ -45,7 +45,7 @@ def main(): clml_mod = clml.partition_for_clml(mod, params) libm = relay.build( clml_mod, - target="opencl -device=adreno", + target="opencl", target_host="llvm -mtriple=aarch64-linux-gnu", params=params, ) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 608c8a2a1b737..34ca09f209337 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -81,6 +81,36 @@ def transform_function( return RemoveDropout().visit(func) +class BroadcastInputs(ExprMutator): + """ + Binary operators need broadcasting for CLML. + """ + + def visit_call(self, call): + if call.op.name in ["add", "subtract", "multiply", "divide", "maximum", "minimum"]: + new_fn = self.visit(call.op) + call_shape = call.checked_type.shape + lhs = call.args[0] + rhs = call.args[1] + lhs_shape = lhs.checked_type.shape + rhs_shape = rhs.checked_type.shape + if list(call_shape) != list(lhs_shape): + lhs = relay.broadcast_to(self.visit(lhs), call_shape) + if list(call_shape) != list(rhs_shape): + rhs = relay.broadcast_to(self.visit(rhs), call_shape) + args = [lhs, rhs] + return Call(new_fn, args, call.attrs) + return super().visit_call(call) + + +@transform.function_pass(opt_level=0) +class BinaryOpBroadcaster: + def transform_function( + self, func: relay.function.Function, mod: tvm.IRModule, _: tvm.transform.PassContext + ) -> relay.function.Function: + return BroadcastInputs().visit(func) + + def partition_for_clml(mod, params=None, **opts): """Partition the graph greedily offloading supported operators to CLML Library. 
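Note on the BinaryOpBroadcaster pass above: CLML binary ops expect both operands to already carry the result shape, so the pass materializes the implicit broadcast with relay.broadcast_to. The following is a minimal standalone Relay sketch of that rewrite; the shapes and variable names are invented purely for illustration and are not taken from the patch.

import tvm
from tvm import relay

# Hypothetical operand shapes chosen for the example.
x = relay.var("x", shape=(1, 16, 32, 32), dtype="float32")
y = relay.var("y", shape=(1, 16, 1, 1), dtype="float32")

# Without the pass, CLML would receive an implicitly broadcast add.
implicit = relay.add(x, y)

# The pass rewrites the narrower operand with an explicit broadcast_to so
# both inputs match the call's checked shape, mirroring visit_call above.
explicit = relay.add(x, relay.broadcast_to(y, (1, 16, 32, 32)))

mod = tvm.IRModule.from_expr(relay.Function([x, y], explicit))
print(relay.transform.InferType()(mod))

After this rewrite both operands share the call's shape, which is the form the clml binary-op patterns then match.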
@@ -104,6 +134,7 @@ def partition_for_clml(mod, params=None, **opts): [ transform.InferType(), RemoveDropoutPass(), + BinaryOpBroadcaster(), transform.FoldConstant(), transform.MergeComposite(clml_pattern_table()), transform.AnnotateTarget("clml", False), @@ -261,8 +292,6 @@ def concat_pattern(): def dense_pattern(): """Create a dense pattern.""" pattern = is_op("nn.dense")(wildcard(), is_constant()) - pattern = pattern.optional(lambda x: is_op("add")(x, is_constant())) - pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant())) return pattern def pad_pattern(): @@ -344,9 +373,19 @@ def check_conv_transpose(extract): def check_binary_op(extract): call = extract - if len(call.args[1].checked_type.shape) > 0: - return True - return False + # Scalers are not supported + if len(call.args[1].checked_type.shape) == 0: + return False + + for arg in call.args: + # Avoid any operators with dtype Int64 + if arg.checked_type.dtype == "int64": + return False + # No support for batch> 1 + if arg.checked_type.shape[0] > 1: + return False + + return True def check_pad_op(extract): call = extract @@ -377,6 +416,20 @@ def check_concat_op(extract): return True def check_default_op(extract): + call = extract + # Avoid any operators with dtype Int64 + for arg in call.args: + if arg.checked_type.dtype == "int64": + return False + return True + + def check_batch_matmul_op(extract): + call = extract + # Only support single Matmul + if call.args[0].checked_type.shape[0] > 1: + return False + if call.args[1].checked_type.shape[0] > 1: + return False return True return [ @@ -394,7 +447,7 @@ def check_default_op(extract): ("clml.minimum", is_op("minimum")(wildcard(), wildcard()), check_binary_op), ("clml.maximum", is_op("maximum")(wildcard(), wildcard()), check_binary_op), ("clml.softmax", is_op("nn.softmax")(wildcard()), check_softmax_op), - ("clml.reshape", is_op("reshape")(wildcard()), check_default_op), + # ("clml.reshape", is_op("reshape")(wildcard()), check_default_op), ("clml.avg_pool2d", is_op("nn.avg_pool2d")(wildcard()), check_default_op), ("clml.max_pool2d", is_op("nn.max_pool2d")(wildcard()), check_default_op), ("clml.global_avg_pool2d", is_op("nn.global_avg_pool2d")(wildcard()), check_default_op), @@ -404,6 +457,11 @@ def check_default_op(extract): ("clml.batch_flatten", is_op("nn.batch_flatten")(wildcard()), check_default_op), ("clml.depth_to_space", is_op("nn.depth_to_space")(wildcard()), check_default_op), ("clml.upsampling", is_op("nn.upsampling")(wildcard()), check_upsampling_op), + ( + "clml.batch_matmul", + is_op("nn.batch_matmul")(wildcard(), wildcard()), + check_batch_matmul_op, + ), ] @@ -570,7 +628,9 @@ def __init__(self, cmod): runner.MakeDense($input_tensor, $weight_tensor, $output_tensor, - $bias_tensor, "$dtype");""" + std::vector ({$in_shape}), + std::vector ({$wt_shape}), + "$dtype");""" ) self.MakeSoftMax = Template( """ @@ -641,13 +701,12 @@ def __init__(self, cmod): " Output Count : $output_count\\n" ' Input MetaInfo\\n$input_meta\\n Output MetaInfo\\n$output_meta");' ) - self.MakeInputMetaInfo = Template( - " Input: $in_name\\n Dtype : $dtype\\n Shape : [$shape]" + " Input: $in_name\\n Dtype : $dtype\\n Shape : [$shape]\\n" ) self.MakeOutputMetaInfo = Template( - " Output: $out_name\\n Dtype : $dtype\\n Shape : [$shape]" + " Output: $out_name\\n Dtype : $dtype\\n Shape : [$shape]\\n" ) def get_src(self): @@ -666,23 +725,40 @@ def get_tensor_from_map( else: node = self.nodes[node_seq] dtype = str(node["attrs"]["dtype"][0][0]) + if node["op"] == "input": + 
self.clml_code.append("// Input Node") + node_out_name = self.sub_module_name + "_" + "input_" + str(node_seq) + else: + node_out_name = node["name"] if shape is None: shape = str(tuple(node["attrs"]["shape"][0][0]))[1:-1] self.clml_code.append( self.MakeCLMLTensor.substitute( - name=node["name"], shape=shape, dtype=dtype, layout=layout + name=node_out_name, shape=shape, dtype=dtype, layout=layout ) ) self.clml_code.append( - self.MapInsert.substitute(nid=node["name"], tensor_desc=node["name"]) + self.MapInsert.substitute(nid=node_out_name, tensor_desc=node_out_name) ) + if node["op"] == "input": + self.clml_code.append( + Template("runner.inputs.push_back($clml_input);").substitute( + clml_input=node_out_name + ) + ) + self.input_meta.append( + self.MakeInputMetaInfo.substitute( + in_name=node_out_name, dtype=dtype, shape=shape + ) + ) + if self.nodes[node_seq]["op"] == "const": self.clml_code.append( Template('runner.consts.push_back("$nid");').substitute(nid=node["name"]) ) - self.node_map[node_seq] = node["name"] - return node["name"] + self.node_map[node_seq] = node_out_name + return node_out_name def make_output_tensor( node, node_seq, shape=None, layout="CL_TENSOR_LAYOUT_OPTIMAL_QCOM", dtype="float32" @@ -697,40 +773,13 @@ def make_output_tensor( name=node_out_name, shape=shape, dtype=dtype, - layout="CL_TENSOR_LAYOUT_OPTIMAL_QCOM", + layout=layout, ) ) return node_out_name for node_seq, node in enumerate(self.nodes): - if node["op"] == "input": - self.clml_code.append("// Input Node") - dtype = str(node["attrs"]["dtype"][0][0]) - shape = str(tuple(node["attrs"]["shape"][0][0]))[1:-1] - node_out_name = self.sub_module_name + "_" + "input_" + str(node_seq) - self.clml_code.append( - self.MakeCLMLTensor.substitute( - name=node_out_name, - shape=shape, - dtype=dtype, - layout="CL_TENSOR_LAYOUT_OPTIMAL_QCOM", - ) - ) - self.clml_code.append( - self.MapInsert.substitute(nid=node_out_name, tensor_desc=node_out_name) - ) - self.clml_code.append( - Template("runner.inputs.push_back($clml_input);").substitute( - clml_input=node_out_name - ) - ) - self.node_map[node_seq] = node_out_name - self.input_meta.append( - self.MakeInputMetaInfo.substitute( - in_name=node_out_name, dtype=dtype, shape=shape - ) - ) - elif node["op"] == "kernel": + if node["op"] == "kernel": self.clml_code.append("// Kernel Node : " + node["name"]) if node["name"] == "nn.conv2d" or node["name"] == "nn.depthwise_conv2d": if "padding" in node["attrs"]: @@ -791,6 +840,7 @@ def make_output_tensor( bn_shape = [1, 1, 1, 1] bn_node = self.nodes[node["inputs"][bn_index][0]] bn_shape[axis] = bn_node["attrs"]["shape"][0][0] + dtype = bn_node["attrs"]["dtype"][0][0] bn_scale_tensor = get_tensor_from_map( node["inputs"][bn_index][0], @@ -858,6 +908,7 @@ def make_output_tensor( bn_shape = [1, 1, 1, 1] bn_node = self.nodes[node["inputs"][0][0]] bn_shape[axis] = bn_node["attrs"]["shape"][0][0] + dtype = bn_node["attrs"]["dtype"][0][0] bn_scale_tensor = get_tensor_from_map( node["inputs"][0][0], shape=str(tuple(bn_shape))[1:-1], dtype=dtype ) @@ -947,26 +998,26 @@ def make_output_tensor( in_shape = tuple(in_node["attrs"]["shape"][0][0]) wt_shape = tuple(in_node["attrs"]["shape"][0][0]) input_tensor = get_tensor_from_map( - node["inputs"][0][0], shape=str(tuple([1, in_shape[1], 1, 1]))[1:-1] + node["inputs"][0][0], layout="CL_TENSOR_LAYOUT_NCHW_QCOM" ) weight_tensor = get_tensor_from_map( node["inputs"][1][0], - shape=str(tuple([wt_shape[0], wt_shape[1], 1, 1]))[1:-1], + shape=str(tuple([1, 1, wt_shape[0], wt_shape[1]]))[1:-1], + 
layout="CL_TENSOR_LAYOUT_NCHW_QCOM", ) - if len(node["inputs"]) == 3: - bias_tensor = "runner.unusedTensor" - else: - bias_tensor = get_tensor_from_map(node["inputs"][2][0]) - node_out_name = make_output_tensor( - node, node_seq, shape=str(tuple([1, wt_shape[0], 1, 1]))[1:-1] + node, + node_seq, + shape=str(tuple([in_shape[0], wt_shape[0], 1, 1]))[1:-1], + layout="CL_TENSOR_LAYOUT_NCHW_QCOM", ) self.clml_code.append( self.MakeDense.substitute( input_tensor=input_tensor, weight_tensor=weight_tensor, output_tensor=node_out_name, - bias_tensor=bias_tensor, + in_shape=str(in_shape)[1:-1], + wt_shape=str(wt_shape)[1:-1], dtype=node["attrs"]["dtype"][0][0], ) ) @@ -1045,7 +1096,7 @@ def make_output_tensor( ) self.node_map[node_seq] = node_out_name - elif node["op"] != "const": + elif node["op"] not in ["const", "input"]: print("Unknown Node type:", node["op"]) # Populate outputs @@ -1086,8 +1137,8 @@ def make_output_tensor( name=self.sub_module_name, input_count=len(self.input_meta), output_count=len(self.output_meta), - input_meta="\n".join(self.input_meta), - output_meta="\n".join(self.output_meta), + input_meta="\\\n".join(self.input_meta), + output_meta="\\\n".join(self.output_meta), ) ) diff --git a/src/runtime/contrib/clml/clml_memory_planner.cc b/src/runtime/contrib/clml/clml_memory_planner.cc new file mode 100644 index 0000000000000..408500e9e3f16 --- /dev/null +++ b/src/runtime/contrib/clml/clml_memory_planner.cc @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/clml/clml_memory_planner.cc + * \brief Various memory planning methods. + */ +#ifdef TVM_GRAPH_EXECUTOR_CLML +#include "clml_memory_planner.h" + +#include "clml_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +/*! + * Release memory after use. 
+ * + */ +void FreeMemory(CachedLayer* layer, int nid) { + LOG_MEM << "FreeMemory:" << nid; + if (layer->storage_ref_map.find(nid) != layer->storage_ref_map.end()) { + LOG_MEM << "Ref Cnt:" << layer->storage_ref_map[nid]; + layer->storage_ref_map[nid]--; + if (0 == layer->storage_ref_map[nid]) { + LOG_MEM << "Ref Cnt Nill"; + // Look into on-chip allocation + for (auto it = layer->on_chip_pool_alloc_info.begin(); + it != layer->on_chip_pool_alloc_info.end(); it++) { + if (it->second == nid) { + LOG_MEM << "Free Segment:" << it->first << " Nid:" << nid; + layer->in_chip_total_free += layer->on_chip_pool_size[it->first]; + layer->in_chip_total_alloc -= layer->on_chip_pool_size[it->first]; + layer->on_chip_pool_alloc_info.erase(it->first); + return; + } + } + // Look into DDR allocation + if (layer->ddr_alloc_plan.find(nid) != layer->ddr_alloc_plan.end()) { + LOG_MEM << "Free DDR segment from local pool"; + layer->ddr_storage_ref_map[layer->ddr_alloc_plan[nid]].second = false; + return; + } + LOG_MEM << "*** Not a managed memory buffer"; + } + } else { + LOG_MEM << "Not in storage ref map :" << nid; + } +} + +/*! + * \brief Partition and allocate + * + */ +size_t PartitionAndAllocate(CachedLayer* layer, size_t segment_start, size_t size, bool is_left) { + LOG_MEM << "PartitionAndAllocate:" << segment_start << " Size:" << size + << " Is Begin:" << is_left; + size_t segment_size = layer->on_chip_pool_size[segment_start]; + size_t left_space = segment_size - size; + + layer->in_chip_total_free -= size; + layer->in_chip_total_alloc += size; + + if (is_left) { + // Start allocation + layer->on_chip_pool_size[segment_start] = size; + if (left_space) { + layer->on_chip_pool_size.insert({segment_start + size, left_space}); + } + return segment_start; + } else { + // End allocation + if (left_space) { + layer->on_chip_pool_size[segment_start] = left_space; + } + layer->on_chip_pool_size.insert({segment_start + left_space, size}); + return segment_start + left_space; + } +} + +/*! + * \brief Ping-Pong allocation with in best fit + * + */ +size_t PingPongAllocate(CachedLayer* layer, std::map& segments, size_t size) { + /* + * segments contains all free segments details (start, size) that can fit the requirement + * PingPong Allocation Strategy: + * Here we find the smallest segment among all. + * We allocate at begining or end of this segment based on the ping-pong flag. + * Ping-pong allocation helps to have largest possible free segment at center + * for most of the graphs. + * + */ + ssize_t free_start; + ssize_t free_size; + ssize_t last_found_size = CLMLWorkspace::Global()->onchip_mem_size + 1; + + for (auto it = segments.begin(); it != segments.end(); it++) { + if (it->second < last_found_size) { + free_start = it->first; + free_size = it->second; + last_found_size = it->second; + LOG_MEM << "Mem Found:" << free_start << " Size:" << free_size; + } + } + + LOG_MEM << "Alloc On-chip Mem:" << free_start << " Size:" << free_size + << " PingPong:" << layer->alloc_ping_pong; + + // Allocate on-chip memory + layer->alloc_ping_pong ^= 1; + return PartitionAndAllocate(layer, free_start, size, layer->alloc_ping_pong); +} + +/*! + * \brief Allocate on-chip memory. 
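The strategy implemented by PartitionAndAllocate and PingPongAllocate above is best-fit with alternating placement: pick the smallest free segment that fits, then carve the allocation from either the beginning or the end of that segment on alternate requests so the largest free hole tends to stay in the middle. Below is a toy Python model of that bookkeeping only; pool and request sizes are invented, the initial begin/end ordering may differ from the runtime's, and free-segment merging is left out.

class OnChipPoolModel:
    """Toy model of the ping-pong best-fit placement used by the planner."""

    def __init__(self, pool_size):
        self.free = {0: pool_size}   # segment start -> size of free segment
        self.busy = {}               # segment start -> size of allocation
        self.ping_pong = False

    def allocate(self, size):
        # Best fit: smallest free segment that can hold the request.
        candidates = [(sz, start) for start, sz in self.free.items() if sz >= size]
        if not candidates:
            return None              # the real planner falls back to DDR here
        seg_size, start = min(candidates)
        del self.free[start]
        self.ping_pong = not self.ping_pong
        if self.ping_pong:           # place at the beginning of the segment
            offset = start
            if seg_size > size:
                self.free[start + size] = seg_size - size
        else:                        # place at the end of the segment
            offset = start + seg_size - size
            if seg_size > size:
                self.free[start] = seg_size - size
        self.busy[offset] = size
        return offset

    def release(self, offset):
        size = self.busy.pop(offset)
        self.free[offset] = size     # real planner also merges adjacent segments

pool = OnChipPoolModel(1024)
a = pool.allocate(256)   # lands at the start of the pool
b = pool.allocate(256)   # lands at the end, keeping the middle free
print(a, b, pool.free)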
+ * + */ +size_t RequestOnChipMemory(CachedLayer* layer, size_t size) { + LOG_MEM << "Request On-Chip Mem:" << size; + // Optimize for any fragmented parts + bool any_merge = true; + while (any_merge) { + any_merge = false; + for (auto it = layer->on_chip_pool_size.begin(); it != layer->on_chip_pool_size.end(); it++) { + if ((layer->on_chip_pool_alloc_info.find(it->first) == + layer->on_chip_pool_alloc_info.end()) && + (layer->on_chip_pool_alloc_info.find(it->first + it->second) == + layer->on_chip_pool_alloc_info.end()) && + (it->first + it->second < CLMLWorkspace::Global()->onchip_mem_size)) { + size_t left_begin = it->first; + size_t left_size = it->second; + size_t right_size = layer->on_chip_pool_size[it->first + it->second]; + LOG_MEM << "Merge:" << left_begin << " Size:" << left_size << " with :" << right_size; + layer->on_chip_pool_size[left_begin] = left_size + right_size; + layer->on_chip_pool_size.erase(left_begin + left_size); + any_merge = true; + break; + } + } + } + + // Look for any best fit free fragment + std::map feasible_segments; + for (auto it = layer->on_chip_pool_size.begin(); it != layer->on_chip_pool_size.end(); it++) { + if (layer->on_chip_pool_alloc_info.find(it->first) == layer->on_chip_pool_alloc_info.end()) { + if (it->second >= size) { + LOG_MEM << "Mem Pool:" << it->first << " - " << it->first + it->second << ":" << it->second + << " - Free"; + feasible_segments.insert({it->first, it->second}); + } else { + LOG_MEM << "Mem Pool:" << it->first << " - " << it->first + it->second << ":" << it->second + << " - Doesn't fit"; + } + } else { + LOG_MEM << "Mem Pool:" << it->first << " - " << it->first + it->second << ":" << it->second + << " - Busy"; + } + } + if (0 == feasible_segments.size()) { + LOG_MEM << "No Suitable Mem Found:" << size << " Free Size:" << layer->in_chip_total_free; + if (size <= layer->in_chip_total_free) { + LOG_STATS << "*** ALERT ***: Couldn't allocate due to fragmentation:" << size + << " Total Free:" << layer->in_chip_total_free; + layer->on_chip_alert_fail += size; + } + return -1; + } + + return PingPongAllocate(layer, feasible_segments, size); +} + +/*! + * \brief Allocate DDR memory for requested size. + * + */ +cl_mem RequestDDRMemory(CachedLayer* layer, size_t size) { + // Look for local storage map for a best fit + auto cws = CLMLWorkspace::Global(); + cl_mem memptr = nullptr; + size_t best_fit = INT_MAX; + for (auto it = layer->ddr_storage_ref_map.begin(); it != layer->ddr_storage_ref_map.end(); it++) { + if ((it->second.first >= size) && (false == it->second.second)) { + if (best_fit > it->second.first) { + memptr = it->first; + best_fit = it->second.first; + } + } + } + + if (memptr) { + LOG_MEM << "Reuse from local pool"; + layer->ddr_storage_ref_map[memptr].second = true; + return memptr; + } else { + // No available buffer in local pool, look for global pool + for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) { + if ((it->second.first >= size) && + (layer->ddr_storage_ref_map.find(it->first) == layer->ddr_storage_ref_map.end())) { + // Found a buffer in global pool. Insert in local pool and then use. 
+ if (best_fit > it->second.first) { + memptr = it->first; + best_fit = it->second.first; + } + } + } + } + + if (memptr) { + LOG_MEM << "Reuse from global pool"; + cws->ddr_global_pool[memptr].second += 1; + layer->ddr_storage_ref_map.insert( + {memptr, std::make_pair(cws->ddr_global_pool[memptr].first, true)}); + return memptr; + } else { + // Allocate a fresh buffer in global then use in local pool. + LOG_MEM << "Allocating fresh buffer in global pool"; + memptr = AllocateDDRTensorMemory(size); + cws->ddr_global_pool.insert({memptr, std::make_pair(size, 1)}); + layer->ddr_storage_ref_map.insert({memptr, std::make_pair(size, true)}); + } + + return memptr; +} + +/*! + * \brief Release memory from global pool. + * + */ +void ReleaseDDRMemory(cl_mem memptr) { + cl_int result; + auto cws = CLMLWorkspace::Global(); + cws->ddr_global_pool[memptr].second -= 1; + if (0 == cws->ddr_global_pool[memptr].second) { + LOG_MEM << "Release DDR mem from global pool"; + result = clReleaseMemObject(memptr); + ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result; + cws->ddr_global_pool.erase(memptr); + } +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif diff --git a/src/runtime/contrib/clml/clml_memory_planner.h b/src/runtime/contrib/clml/clml_memory_planner.h new file mode 100644 index 0000000000000..b4e34e4f32d47 --- /dev/null +++ b/src/runtime/contrib/clml/clml_memory_planner.h @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/clml/clml_memory_planner.h + * \brief CLML memory planner header + */ +#ifndef TVM_RUNTIME_CONTRIB_CLML_CLML_MEMORY_PLANNER_H_ +#define TVM_RUNTIME_CONTRIB_CLML_CLML_MEMORY_PLANNER_H_ + +#include "clml_runtime.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +void FreeMemory(CachedLayer* layer, int nid); + +void ReleaseDDRMemory(cl_mem memptr); + +size_t RequestOnChipMemory(CachedLayer* layer, size_t size); + +cl_mem RequestDDRMemory(CachedLayer* layer, size_t size); + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_CLML_CLML_MEMORY_PLANNER_H_ diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 7c716e68763bf..b27cf4532001b 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -21,33 +21,12 @@ * \file src/runtime/contrib/clml/clml_runtime.cc * \brief A simple JSON runtime for CLML. 
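RequestDDRMemory and ReleaseDDRMemory above maintain a two-level pool: a per-layer map of (size, busy) entries searched best-fit first, backed by a global, reference-counted pool shared across layers. The Python sketch below models only the single-level best-fit reuse idea; integer handles stand in for cl_mem buffers and the sizes are invented.

import itertools

class DDRPoolModel:
    """Toy model of best-fit buffer reuse; handles stand in for cl_mem."""

    _ids = itertools.count()

    def __init__(self):
        self.buffers = {}            # handle -> [size, busy]

    def request(self, size):
        # Reuse the smallest free buffer that is large enough.
        fits = [(sz, h) for h, (sz, busy) in self.buffers.items()
                if not busy and sz >= size]
        if fits:
            _, handle = min(fits)
            self.buffers[handle][1] = True
            return handle
        handle = next(self._ids)     # otherwise "allocate" a fresh buffer
        self.buffers[handle] = [size, True]
        return handle

    def release(self, handle):
        self.buffers[handle][1] = False

pool = DDRPoolModel()
a = pool.request(1 << 20)
pool.release(a)
b = pool.request(1 << 18)   # reuses the 1 MiB buffer instead of allocating
print(a == b, pool.buffers)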
*/ +#include "clml_runtime.h" -#include -#include #ifdef TVM_GRAPH_EXECUTOR_CLML -#include +#include "clml_memory_planner.h" +#include "clml_utils.h" #endif -#include -#include -#include -#include - -#include -#include -#include - -#include "../../file_utils.h" -#include "../../opencl/opencl_common.h" -#include "../json/json_node.h" -#include "../json/json_runtime.h" - -#define CAT_I(a, b) a##b -#define CAT(a, b) CAT_I(a, b) -#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM) -#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM) - -/*! \brief Magic number for CLML Tuning cache entry */ -static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45; namespace tvm { namespace runtime { @@ -56,6 +35,88 @@ namespace contrib { using namespace tvm::runtime::json; using JSONGraphNode = tvm::runtime::json::JSONGraphNode; +#ifdef TVM_GRAPH_EXECUTOR_CLML +CLMLThreadEntry* CLMLWorkspace::GetThreadEntry() { return CLMLThreadEntry::ThreadLocal(); } + +CLMLWorkspace* CLMLWorkspace::Global() { + static CLMLWorkspace* inst = new CLMLWorkspace(); + return inst; +} + +CLMLWorkspace::CLMLWorkspace() { + cl_int result = 0; + workspace = cl::OpenCLWorkspace::Global(); + workspace->Init(); + tentry = workspace->GetThreadEntry(); + + device_id = workspace->GetCLDeviceID(tentry->device.device_id); + platform_id = workspace->device_to_platform[device_id]; + + // Print extensions + size_t reqd_size = 0; + result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size); + ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result; + std::vector extn_buf(reqd_size); + result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr); + ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result; + std::string extensions(extn_buf.data()); + LOG(WARNING) << "OpenCL Extensions:" << extensions; + + if(extensions.find("cl_qcom_ml_ops") == std::string::npos) { + LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n"; + return; + } + is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos); + is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos); + LOG(WARNING) << "Recordable Queues Support :" << is_recordable_queue; + LOG(WARNING) << "On chip Memory Support :" << is_on_chip_memory; + + if (is_on_chip_memory) { + result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM, + sizeof(onchip_mem_size), &onchip_mem_size, NULL); + ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):" + << result; + LOG(WARNING) << "On chip memory size:" << onchip_mem_size; + } + + // Query and Get CLML Interface + static const cl_uint MAX_VERSIONS = 256; + cl_int majorVersions[MAX_VERSIONS]; + cl_int minorVersions[MAX_VERSIONS]; + cl_uint numVersions = 0; + result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions); + ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; + ICHECK(numVersions > 0u); + ICHECK(numVersions <= MAX_VERSIONS); + + result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr); + ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; + + for (cl_uint i = 0; i < numVersions; ++i) { + if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) { + h_ClmlIntf = GET_ML_INTERFACE(0); + LOG(WARNING) << "CLML Target version:" << majorVersions[i]; + break; + } + } + 
ICHECK(h_ClmlIntf != nullptr) + << "clGetMLInterfaceVxQCOM:" << result + << " Perhaps there is mispatch between CLML SDK version to target supported version:" + << majorVersions[numVersions - 1]; + char* tune_flag; + if ((tune_flag = getenv("CLML_IS_TUNING_RUN"))) + is_tuning_run = std::stoi(tune_flag); + else + is_tuning_run = 0; + + if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0; +} + +typedef dmlc::ThreadLocalStore CLMLThreadStore; + +CLMLThreadEntry* CLMLThreadEntry::ThreadLocal() { return CLMLThreadStore::Get(); } +#endif + class CLMLRuntime : public JSONRuntimeBase { public: /*! @@ -73,33 +134,42 @@ class CLMLRuntime : public JSONRuntimeBase { ~CLMLRuntime() { #ifdef TVM_GRAPH_EXECUTOR_CLML cl_int result = 0; - if (this->is_tuning_run) { - result = h_ClmlIntf->clReleaseMLTuningCacheQCOM(this->tuning_cache); + if (this->layer_.tuning_cache) { + result = CLML_INTF->clReleaseMLTuningCacheQCOM(this->layer_.tuning_cache); ICHECK(result == CL_SUCCESS) << "clReleaseMLTuningCacheQCOM:" << result; } for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) { auto tensor_desc = it->second.first; - result = h_ClmlIntf->clReleaseMLTensorQCOM(tensor_desc->tensor); + result = CLML_INTF->clReleaseMLTensorQCOM(tensor_desc->tensor); ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result; - result = clReleaseMemObject(tensor_desc->memory); - ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result; + if (this->layer_.ddr_storage_ref_map.find(tensor_desc->memory) != + this->layer_.ddr_storage_ref_map.end()) { + ReleaseDDRMemory(tensor_desc->memory); + } else { + result = clReleaseMemObject(tensor_desc->memory); + ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result; + } } for (size_t i = 0; i < this->layer_.function.size(); ++i) { - result = h_ClmlIntf->clReleaseMLOpQCOM(this->layer_.function[i]); + result = CLML_INTF->clReleaseMLOpQCOM(this->layer_.function[i]); ICHECK(result == CL_SUCCESS) << "clReleaseMLOpQCOM:" << result; } for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end(); it++) { - result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor); + result = CLML_INTF->clReleaseMLTensorQCOM(it->second->tensor); ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result; } for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end(); it++) { - result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor); + result = CLML_INTF->clReleaseMLTensorQCOM((*it)->tensor); ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result; } - result = h_ClmlIntf->clReleaseMLTensorMemoryDescriptorSetQCOM(layer_.descriptorSet); + result = CLML_INTF->clReleaseMLTensorMemoryDescriptorSetQCOM(layer_.descriptorSet); ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorMemoryDescriptorSetQCOM:" << result; + + if (this->layer_.recordable_queue) { + clReleaseCommandQueue(this->layer_.recordable_queue); + } #endif } @@ -129,66 +199,27 @@ class CLMLRuntime : public JSONRuntimeBase { } #ifdef TVM_GRAPH_EXECUTOR_CLML - std::vector GetVectorValues(const std::vector& val) { - std::vector array; - for (auto i : val) { - array.push_back((cl_uint)stoi(i)); - } - return array; - } - void InitCLML() { // Setup CLML Context cl_int result = 0; - workspace = cl::OpenCLWorkspace::Global(); - workspace->Init(); - tentry = workspace->GetThreadEntry(); + cws = CLMLWorkspace::Global(); - if (!ExtensionStringPresent()) { - LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not 
present.\n"; - return; - } - device_id = workspace->GetCLDeviceID(tentry->device.device_id); - platform_id = workspace->device_to_platform[device_id]; - - // Query and Get CLML Interface - static const cl_uint MAX_VERSIONS = 256; - cl_int majorVersions[MAX_VERSIONS]; - cl_int minorVersions[MAX_VERSIONS]; - cl_uint numVersions = 0; - result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions); - ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; - ICHECK(numVersions > 0u); - ICHECK(numVersions <= MAX_VERSIONS); - - result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr); - ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; - - for (cl_uint i = 0; i < numVersions; ++i) { - if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) { - h_ClmlIntf = GET_ML_INTERFACE(0); - LOG(WARNING) << "CLML Target version:" << majorVersions[i]; - break; - } + if (cws->is_recordable_queue) { + this->layer_.recordable_queue = + clCreateCommandQueue(CLML_CTX, cws->device_id, CL_QUEUE_RECORDABLE_QCOM, &result); + ICHECK(result == CL_SUCCESS) << "clCreateCommandQueue - Recordable:" << result; + + this->layer_.recording = clNewRecordingQCOM(this->layer_.recordable_queue, &result); + ICHECK(result == CL_SUCCESS) << "clNewRecordingQCOM:" << result; } - ICHECK(h_ClmlIntf != nullptr) - << "clGetMLInterfaceVxQCOM:" << result - << " Perhaps there is mispatch between CLML SDK version to target supported version:" - << majorVersions[numVersions - 1]; - char* tune_flag; - if ((tune_flag = getenv("CLML_IS_TUNING_RUN"))) - this->is_tuning_run = std::stoi(tune_flag); - else - this->is_tuning_run = 0; - if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0; // A Tuning run, so create the cache from scratch - result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache); + result = CLML_INTF->clCreateMLTuningCacheQCOM(&layer_.tuning_cache); ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result; - if (!this->is_tuning_run && this->tuning_file) { + if (!cws->is_tuning_run && cws->tuning_file) { std::vector tune_buffer; std::string tune_blob; - LoadBinaryFromFile(this->tuning_file, &tune_blob); + LoadBinaryFromFile(cws->tuning_file, &tune_blob); dmlc::MemoryStringStream mstrm(const_cast(&tune_blob)); dmlc::Stream* strm = &mstrm; @@ -198,7 +229,7 @@ class CLMLRuntime : public JSONRuntimeBase { if (header != kTVMCLMLTuningCacheMagic) break; if (!strm->Read(&reserve)) break; if (!strm->Read(&tune_symbol)) break; - LOG(INFO) << "Tuning Cache Symbol:" << tune_symbol; + // LOG(INFO) << "Tuning Cache Symbol:" << tune_symbol; if (tune_symbol == clml_symbol) { strm->Read(&tune_buffer); break; @@ -211,59 +242,16 @@ class CLMLRuntime : public JSONRuntimeBase { if (tune_buffer.size()) { LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol << " size:" << tune_buffer.size(); - result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, tune_buffer.size(), - tune_buffer.data()); + result = CLML_INTF->clLoadMLTuningCacheQCOM(layer_.tuning_cache, tune_buffer.size(), + tune_buffer.data()); ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result; } else { LOG(WARNING) << "Tuning cache not cound for symbol :" << clml_symbol << " in file " - << this->tuning_file; + << cws->tuning_file; } } } - std::vector readBinFile(const std::string& filename) { - std::ifstream fin(filename, std::ios::binary | std::ios::ate); - if (!fin.good()) { - LOG(FATAL) << "ERROR: Could not load tuning cache file: " + 
filename; - } - ICHECK(fin.good()); - int64_t size = fin.tellg(); - fin.seekg(0, std::ios::beg); - std::vector buffer(static_cast(size)); - char* ptr = reinterpret_cast(buffer.data()); - fin.read(ptr, size); - ICHECK(fin.good()); - return buffer; - } - - void CopyDataToCLMLTensor(std::shared_ptr tensor, void* data, - cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) { - cl_int result = 0; - cl_event evt = nullptr; - result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(workspace->GetQueue(tentry->device), data, - layout, tensor->tensor, tensor->memory, - 0, // n waitlist - nullptr, // waitlist - &evt); // event - ICHECK((evt != nullptr) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result; - } - - void CopyDataFromCLMLTensor(std::shared_ptr tensor, void* data, - cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) { - cl_int result = 0; - cl_event readEvent = nullptr; - // Read the output tensor - result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(workspace->GetQueue(tentry->device), - tensor->tensor, tensor->memory, data, layout, - 0, // n waitlist - nullptr, // waitlist - &readEvent); // event - ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result; - - result = clWaitForEvents(1, &readEvent); - ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result; - } - /*! * \brief Unpack inputs and outputs and run inference on a given layer. * @@ -273,8 +261,8 @@ class CLMLRuntime : public JSONRuntimeBase { */ void Run() override { cl_int result = 0; - cl_command_queue queue = workspace->GetQueue(tentry->device); - std::vector& evts = workspace->GetEventQueue(tentry->device); + cl_command_queue queue = CLML_QUEUE; + std::vector& evts = cws->workspace->GetEventQueue(cws->tentry->device); for (size_t i = 0; i < input_nodes_.size(); ++i) { auto nid = input_nodes_[i]; uint32_t eid = EntryID(nid, 0); @@ -285,19 +273,19 @@ class CLMLRuntime : public JSONRuntimeBase { isize *= data_entry_[eid]->shape[j]; } if (kDLCPU == data_entry_[eid]->device.device_type) { - CopyDataToCLMLTensor(layer_.inputs[i], data); + CopyDataToCLMLTensor(layer_.inputs[nid], data); } else if (kDLOpenCL == data_entry_[eid]->device.device_type) { - layer_.in_placeholder[i]->memory = static_cast( + layer_.in_placeholder[nid]->memory = static_cast( ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); cl_event cpy_evt = nullptr; cl_event* evt = &cpy_evt; - if (workspace->IsProfiling(tentry->device)) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { evts.resize(evts.size() + 1); evt = &(evts.back()); } - result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM( - queue, layer_.in_placeholder[i]->tensor, layer_.in_placeholder[i]->memory, - layer_.inputs[i]->tensor, layer_.inputs[i]->memory, 0, nullptr, evt); + result = CLML_INTF->clEnqueueCopyMLTensorDataQCOM( + queue, layer_.in_placeholder[nid]->tensor, layer_.in_placeholder[nid]->memory, + layer_.inputs[nid]->tensor, layer_.inputs[nid]->memory, 0, NULL, evt); ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result; } else { DLDataType tvm_dtype = const_cast(data_entry_[eid])->dtype; @@ -306,36 +294,57 @@ class CLMLRuntime : public JSONRuntimeBase { void* tmpptr = reinterpret_cast(malloc(isize * dtype_size)); TVMArrayCopyToBytes(const_cast(data_entry_[eid]), const_cast(tmpptr), isize * dtype_size); - CopyDataToCLMLTensor(layer_.inputs[i], tmpptr); + CopyDataToCLMLTensor(layer_.inputs[nid], tmpptr); free(tmpptr); } } } int64_t duration = 0; - for (size_t i = 0; i < 
this->layer_.function.size(); ++i) { - // Make CLML subgraphs accounted by OpenCLTimerNode. - + if (cws->is_recordable_queue) { if (getenv("CLML_PROFILING")) { Timer t; auto f = Registry::Get(std::string("profiling.timer.opencl")); - t = f->operator()(tentry->device); + t = f->operator()(cws->tentry->device); t->Start(); - queue = workspace->GetQueue(tentry->device); + queue = CLML_QUEUE; evts.resize(evts.size() + 1); cl_event* evt = &(evts.back()); - - result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], - this->layer_.descriptorSet, 0, nullptr, evt); + result = CLML_INTF->clEnqueueRecordingMLOpQCOM(queue, this->layer_.recording, 0, nullptr, 0, + nullptr, 0, nullptr, 0, nullptr, 0, nullptr, + 0, nullptr, 0, nullptr, 0, nullptr, evt); + ICHECK(result == CL_SUCCESS) << "clEnqueueRecordingMLOpQCOM:" << result; t->Stop(); duration += t->SyncAndGetElapsedNanos(); - LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] - << " Duration:" << t->SyncAndGetElapsedNanos(); } else { - result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], - this->layer_.descriptorSet, 0, nullptr, nullptr); + result = CLML_INTF->clEnqueueRecordingMLOpQCOM(queue, this->layer_.recording, 0, nullptr, 0, + nullptr, 0, nullptr, 0, nullptr, 0, nullptr, + 0, nullptr, 0, nullptr, 0, nullptr, nullptr); + ICHECK(result == CL_SUCCESS) << "clEnqueueRecordingMLOpQCOM:" << result; + } + } else { + for (size_t i = 0; i < this->layer_.function.size(); ++i) { + // Make CLML subgraphs accounted by OpenCLTimerNode. + if (getenv("CLML_PROFILING")) { + Timer t; + auto f = Registry::Get(std::string("profiling.timer.opencl")); + t = f->operator()(cws->tentry->device); + t->Start(); + queue = CLML_QUEUE; + evts.resize(evts.size() + 1); + cl_event* evt = &(evts.back()); + result = CLML_INTF->clEnqueueMLOpQCOM(queue, this->layer_.function[i], + this->layer_.descriptorSet, 0, nullptr, evt); + t->Stop(); + duration += t->SyncAndGetElapsedNanos(); + LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] + << " Duration:" << t->SyncAndGetElapsedNanos(); + } else { + result = CLML_INTF->clEnqueueMLOpQCOM(queue, this->layer_.function[i], + this->layer_.descriptorSet, 0, nullptr, nullptr); + } + ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result; } - ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result; } if (getenv("CLML_PROFILING")) { LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration; @@ -356,11 +365,11 @@ class CLMLRuntime : public JSONRuntimeBase { ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); cl_event cpy_evt = nullptr; cl_event* evt = &cpy_evt; - if (workspace->IsProfiling(tentry->device)) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { evts.resize(evts.size() + 1); evt = &(evts.back()); } - result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM( + result = CLML_INTF->clEnqueueCopyMLTensorDataQCOM( queue, layer_.outputs[i]->tensor, layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, nullptr, evt); ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result; @@ -379,6 +388,163 @@ class CLMLRuntime : public JSONRuntimeBase { } private: + /*! + * \brief check if the nid is graph output tensor or not. + * + */ + bool IsOutputTensor(int nid) { + for (size_t i = 0; i < outputs_.size(); ++i) { + if (nid == outputs_[i].id_) return true; + } + return false; + } + + /*! + * \brief Initialize memory pool. 
+ * + */ + void InitMemoryPool(void) { + layer_.on_chip_pool_size.clear(); + layer_.on_chip_pool_size.insert({0, cws->onchip_mem_size}); + layer_.on_chip_pool_alloc_info.clear(); + layer_.alloc_ping_pong = true; + layer_.in_chip_total_free = cws->onchip_mem_size; + layer_.in_chip_total_alloc = 0; + layer_.on_chip_alert_fail = 0; + } + + /*! + * \brief Plan Memory for activations to allocate on on-chip global memory where ever possible. + * + */ + void PlanMemory() { + InitMemoryPool(); + // Build the ref count table for all activation tensors. + LOG_MEM << "Build Ref Map"; + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() == "kernel") { + std::vector inputs = node.GetInputs(); + for (auto& input_node : inputs) { + if (nodes_[input_node.id_].GetOpType() != "const") { + if (layer_.storage_ref_map.find(input_node.id_) == layer_.storage_ref_map.end()) { + layer_.storage_ref_map.insert({input_node.id_, 1}); + layer_.life_span.insert({input_node.id_, nid}); + } else { + layer_.storage_ref_map[input_node.id_]++; + layer_.life_span[input_node.id_] = nid; + } + } + } + } + } + LOG_MEM << "Print Ref Map"; + + for (auto it = layer_.storage_ref_map.begin(); it != layer_.storage_ref_map.end(); it++) { + LOG_MEM << "RefMap:" << it->first << " Count:" << it->second + << "Life Span:" << layer_.life_span[it->first]; + } + + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + uint32_t size = 0; + cl_int result = CL_OUT_OF_HOST_MEMORY; + result = CLML_INTF->clGetMLTensorMemorySizeQCOM(CLML_CTX, + layer_.storage_map[nid].first->tensor, &size); + ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; + + if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) { + std::vector inputs = node.GetInputs(); + LOG_MEM << "Request :" << size << " Nid:" << nid; + size_t offset = -1; + // On-chip memory only for intermediate tensors with in recording scope. + if ((cws->is_on_chip_memory) && (!IsOutputTensor(nid)) && (node.GetOpType() != "input")) { + offset = RequestOnChipMemory(&this->layer_, size); + } + if (-1 != offset) { + LOG_MEM << "Got On-Chip Mem:" << offset << "Nid:" << nid; + layer_.on_chip_pool_alloc_info.insert({offset, nid}); + layer_.on_chip_alloc_plan.insert({nid, std::make_pair(size, offset)}); + } else { + layer_.on_chip_reject.insert({nid, size}); + // DDR Allocation + auto ddr_mem = RequestDDRMemory(&this->layer_, size); + LOG_MEM << "Alloc DDR from global pool for nid:" << nid << " Type:" << node.GetOpType(); + layer_.ddr_alloc_plan.insert({nid, ddr_mem}); + } + + // Now free up the input tensors on-chip memory for reuse. 
+ for (auto& input_node : inputs) { + if (nodes_[input_node.id_].GetOpType() != "const") { + LOG_MEM << "Free Input Mem:" << input_node.id_; + FreeMemory(&this->layer_, input_node.id_); + } + } + } + } + + // Stats dump + size_t in_chip_total_alloc = 0; + size_t total_reject = 0; + for (auto it = layer_.on_chip_alloc_plan.begin(); it != layer_.on_chip_alloc_plan.end(); it++) { + LOG_STATS << " On-chip Alloc:" << it->first << " Size:" << it->second.first + << " Offset:" << it->second.second; + in_chip_total_alloc += it->second.first; + } + + for (auto it = layer_.on_chip_reject.begin(); it != layer_.on_chip_reject.end(); it++) { + LOG_STATS << "Reject:" << it->first << " Size:" << it->second; + total_reject += it->second; + } + LOG_STATS << "Total On-chip Alloc:" << in_chip_total_alloc + total_reject + << " On-Chip:" << in_chip_total_alloc << " Reject:" << total_reject + << " Alert Fail:" << layer_.on_chip_alert_fail; + + auto cws = CLMLWorkspace::Global(); + for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) { + LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second; + } + for (auto it = this->layer_.ddr_storage_ref_map.begin(); + it != this->layer_.ddr_storage_ref_map.end(); it++) { + LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second; + } + } + + /*! + * \brief Create an CLML tensor from JSON node entry. Lookup storage map before creation. + * + * \param tensor The tensor as Node Entry . + * \param shape shape information of tensor + * \param layout the tensor layout to be used + * \param dtype tensor data type + * \return CLML Tensor descriptor. + */ + std::shared_ptr MakeCLMLTensorFromJSONEntry( + const JSONGraphNodeEntry& tensor, std::vector shape, cl_ml_tensor_layout_qcom layout, + cl_uint dtype) { + JSONGraphNode node = nodes_[tensor.id_]; + + if (this->layer_.storage_map.find(tensor.id_) == this->layer_.storage_map.end()) { + void* node_data = nullptr; + if (node.GetOpType() == "const") { + node_data = data_entry_[EntryID(tensor)]->data; + } + auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape); + this->layer_.storage_map.insert({tensor.id_, std::make_pair(clml_tensor, node)}); + + if ("input" == node.GetOpType()) { + this->layer_.inputs.insert({tensor.id_, clml_tensor}); + // Input copy placeholder Tensor + this->layer_.in_placeholder.insert( + {tensor.id_, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, dtype, node_data, shape)}); + } + + return clml_tensor; + } else { + return this->layer_.storage_map[tensor.id_].first; + } + } + /*! * \brief Build CLML layer from JSON representation and cache. * @@ -392,12 +558,7 @@ class CLMLRuntime : public JSONRuntimeBase { DLDataType tvm_dtype = node.GetOpDataType()[0]; cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); if (node.GetOpType() == "input") { - auto clml_input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)}); - this->layer_.inputs.push_back(clml_input); - // Input copy placeholder Tensor - this->layer_.in_placeholder.push_back( - MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype)); + // Layers may request for different layout. Differ the input allocation. 
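Recapping PlanMemory above: every non-const activation gets a reference count equal to its number of consumers, each node requests memory when it is visited, and FreeMemory decrements the counts of its inputs so their storage can be recycled by later nodes. The compact Python rendering below walks an invented four-node graph (plain dicts, not the JSON graph format used by the runtime) to show how that liveness scheme bounds the working set.

# Toy graph: node id -> list of producer node ids (non-const activations only).
graph = {0: [], 1: [0], 2: [0], 3: [1, 2]}
sizes = {0: 4, 1: 8, 2: 8, 3: 4}        # invented tensor sizes

# Pass 1: reference counts = number of consumers of each activation.
ref_count = {nid: 0 for nid in graph}
for nid, inputs in graph.items():
    for src in inputs:
        ref_count[src] += 1

# Pass 2: allocate at definition, free inputs once their last consumer ran.
live = {}                                # nid -> size currently resident
peak = 0
for nid in sorted(graph):
    live[nid] = sizes[nid]
    peak = max(peak, sum(live.values()))
    for src in graph[nid]:
        ref_count[src] -= 1
        if ref_count[src] == 0:          # storage can be reused from here on
            live.pop(src)

print("peak working set:", peak)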
} else if (node.GetOpType() == "kernel") { auto op_name = node.GetOpName(); if ("nn.conv2d" == op_name) { @@ -474,6 +635,10 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreateResizeLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("nn.batch_matmul" == op_name) { + auto out = CreateBatchMatmulLayer(&layer_, node, nid); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -488,17 +653,55 @@ class CLMLRuntime : public JSONRuntimeBase { DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0]; cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); this->layer_.outputs.push_back(this->layer_.storage_map[nid].first); - this->layer_.out_placeholder.push_back( - MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype)); + if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) { + // Handle customized shapes here + this->layer_.out_placeholder.push_back( + MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype, nullptr, this->layer_.out_shapes[nid])); + } else { + this->layer_.out_placeholder.push_back( + MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype)); + } } + + // Plan memory utilization + PlanMemory(); + // ALlocate device memories and initialize the params if any cl_int result = 0; + size_t alloc_on_chip = 0; + size_t alloc_ddr = 0; + size_t alloc_ddr_reuse = 0; for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) { auto tensor_desc = it->second.first; + uint32_t mem_size = 0; + result = CL_OUT_OF_HOST_MEMORY; + result = CLML_INTF->clGetMLTensorMemorySizeQCOM(CLML_CTX, tensor_desc->tensor, &mem_size); + ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; + JSONGraphNode node = it->second.second; void* node_data = nullptr; - - allocateTensorMemory(h_ClmlIntf, workspace->contexts[platform_id], tensor_desc); + size_t on_chip_mem_offset = -1; + if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) { + LOG_MEM << "Found GMEM Alloc:" << it->first + << " Size:" << layer_.on_chip_alloc_plan[it->first].first + << " Offset:" << layer_.on_chip_alloc_plan[it->first].second; + on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second; + alloc_on_chip += mem_size; + tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset); + } else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) { + LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType(); + tensor_desc->memory = layer_.ddr_alloc_plan[it->first]; + alloc_ddr_reuse += mem_size; + //} else if ((node.GetOpType() == "input") || IsOutputTensor(it->first) || (node.GetOpType() + //== "const")) { + } else if (node.GetOpType() == "const") { + LOG_MEM << "DDR Alloc for Const/Input/Output"; + tensor_desc->memory = AllocateDDRTensorMemory(mem_size); + alloc_ddr += mem_size; + } else { + LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first + << " Type:" << node.GetOpType(); + } if (node.GetOpType() == "const") { node_data = data_entry_[EntryID(it->first, 0)]->data; @@ -508,37 +711,55 @@ class CLMLRuntime : public JSONRuntimeBase { } this->layer_.tensorMemDescs.push_back(*tensor_desc); } + LOG_STATS << "Total On-Chip Allocation :" << alloc_on_chip; + LOG_STATS << "Total DDR 
Reuse Allocation:" << alloc_ddr_reuse; + LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr; + size_t ddr_global_pool = 0; + size_t ddr_local_pool = 0; + auto cws = CLMLWorkspace::Global(); + for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) { + LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second; + ddr_global_pool += it->second.first; + } + LOG_STATS << "Total Global Pool:" << ddr_global_pool; + for (auto it = this->layer_.ddr_storage_ref_map.begin(); + it != this->layer_.ddr_storage_ref_map.end(); it++) { + LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second; + ddr_local_pool += it->second.first; + } + LOG_STATS << "Total Local Pool:" << ddr_local_pool; // Setup descriptor set - result = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&this->layer_.descriptorSet); + result = CLML_INTF->clCreateMLTensorMemoryDescriptorSetQCOM(&this->layer_.descriptorSet); ICHECK(result == CL_SUCCESS) << "clCreateMLTensorMemoryDescriptorSetQCOM:" << result; - result = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM( + result = CLML_INTF->clUpdateMLTensorMemoryDescriptorSetQCOM( this->layer_.descriptorSet, static_cast(this->layer_.tensorMemDescs.size()), this->layer_.tensorMemDescs.data()); ICHECK(result == CL_SUCCESS) << "clUpdateMLTensorMemoryDescriptorSetQCOM:" << result; - if (this->is_tuning_run) { + if (cws->is_tuning_run) { LOG(WARNING) << "CLML Tunning In Progress:"; // Let the command queue recreated in profiling mode. - cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, true); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true); for (size_t i = 0; i < this->layer_.function.size(); ++i) { LOG(WARNING) << "CLML Tunning:" << this->layer_.layer_names[i]; - result = h_ClmlIntf->clTuneMLOpQCOM(workspace->GetQueue(tentry->device), - this->layer_.function[i], this->layer_.descriptorSet, - this->tuning_cache, nullptr); + result = CLML_INTF->clTuneMLOpQCOM(CLML_QUEUE, this->layer_.function[i], + this->layer_.descriptorSet, this->layer_.tuning_cache, + nullptr); ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result; } - cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, false); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false); size_t cache_len_bytes = 0; size_t len_ret = 0; - result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, nullptr, &cache_len_bytes); + result = + CLML_INTF->clSaveMLTuningCacheQCOM(layer_.tuning_cache, 0, nullptr, &cache_len_bytes); ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result; std::vector saved_cache(cache_len_bytes, 0); - result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, saved_cache.size(), - saved_cache.data(), &len_ret); + result = CLML_INTF->clSaveMLTuningCacheQCOM(layer_.tuning_cache, saved_cache.size(), + saved_cache.data(), &len_ret); ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM" << result; std::string tune_str; @@ -551,189 +772,25 @@ class CLMLRuntime : public JSONRuntimeBase { strm->Write(clml_symbol); strm->Write(saved_cache); - std::ofstream fs(tuning_file, std::ios::app | std::ios::binary); - ICHECK(!fs.fail()) << "Cannot open " << tuning_file; + std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary); + ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file; fs.write(&tune_str[0], tune_str.length()); - LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file << " size" << 
tune_str.length() - << " with tuning blob len " << saved_cache.size(); - } - } - - /*! - * \brief CLML objects we cache in order to avoid needing to construct - * a new layer each time. - */ - struct CachedLayer { - std::vector function; - std::vector> inputs; - std::vector> in_placeholder; - std::vector> outputs; - std::vector> out_placeholder; - std::vector> func_outs; - std::vector> func_ins; - std::map, JSONGraphNode>> - storage_map; - std::vector tensorMemDescs; - std::vector in_tensorMemDescs; - std::vector out_tensorMemDescs; - cl_ml_tensor_mem_desc_set_qcom descriptorSet; - std::vector layer_names; - cl_ml_tensor_qcom unusedTensor = nullptr; - }; - - struct tensor_dims_t { - uint32_t n, c, h, w; - }; - - bool ExtensionStringPresent(void) { - cl_int result = 0; - size_t reqd_size = 0; - cl_device_id device_id = - workspace->GetCLDeviceID(workspace->GetThreadEntry()->device.device_id); - result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size); - ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result; - - std::vector buf(reqd_size); - result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, buf.data(), nullptr); - ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result; - - std::string extensions(buf.data()); - LOG(WARNING) << "OpenCL Extensions:" << extensions; - return (extensions.find("cl_qcom_ml_ops") != std::string::npos); - } - - cl_ml_tensor_qcom DeviceMakeCLMLTensor( - cl_context context, tensor_dims_t dims, - cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, - cl_channel_type dtype = CL_FLOAT) { - cl_ml_tensor_qcom tensor; - cl_int result = CL_OUT_OF_RESOURCES; - - cl_ml_tensor_desc_qcom desc = { - dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }}; - result = - h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, &tensor); - ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; - (void)result; - return tensor; - } - - cl_int allocateTensorMemory(void* pClmlIntf, cl_context context, - std::shared_ptr pTensorMemDesc) { - uint32_t size = 0; - cl_int result = CL_OUT_OF_HOST_MEMORY; - cl_mem buffer = nullptr; - - result = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(workspace->contexts[platform_id], - pTensorMemDesc->tensor, &size); - ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; - - buffer = - clCreateBuffer(workspace->contexts[platform_id], CL_MEM_READ_WRITE, size, nullptr, &result); - ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result; - - pTensorMemDesc->memory = buffer; - - return result; - } - - tensor_dims_t get_tensor_dims(const JSONGraphNode& node) { - std::vector shape = node.GetOpShape()[0]; - tensor_dims_t dims; - dims.n = shape[0]; - dims.c = shape[1]; - dims.h = shape[2]; - dims.w = shape[3]; - return dims; - } - - cl_channel_type MakeCLDataType(const DLDataType& data_type) { - if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) { - return CL_FLOAT; - } else if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 16) { - return CL_HALF_FLOAT; - } else { - LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime"; - } - } - - cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type, - const cl_channel_type& acc_type = CL_FLOAT) { - if (data_type == CL_FLOAT && acc_type == CL_FLOAT) { - return CL_ARITHMETIC_MODE_FP32_QCOM; - } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) { - return 
CL_ARITHMETIC_MODE_FP16_ACC32_QCOM; - } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) { - return CL_ARITHMETIC_MODE_FP16_QCOM; - } else { - LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime"; + LOG(WARNING) << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size" + << tune_str.length() << " with tuning blob len " << saved_cache.size(); } - } + if (cws->is_recordable_queue) { + for (size_t i = 0; i < this->layer_.function.size(); ++i) { + result = + CLML_INTF->clEnqueueMLOpQCOM(this->layer_.recordable_queue, this->layer_.function[i], + this->layer_.descriptorSet, 0, nullptr, nullptr); + ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM - Recordable Queue:" << result; + } - std::shared_ptr MakeCLMLTensor( - const JSONGraphNode& tensor_rep, void* data, std::vector c_shape, - cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) { - std::vector shape = tensor_rep.GetOpShape()[0]; - std::vector clml_shape(shape.begin(), shape.end()); - if (c_shape.size() > 0) { - clml_shape = c_shape; + result = clEndRecordingQCOM(this->layer_.recording); + ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result; } - // Make sure the tensors with dimensions less than 4 are padded with 1. - clml_shape.push_back(1); - clml_shape.push_back(1); - clml_shape.push_back(1); - - tensor_dims_t dims; - dims.n = clml_shape[0]; - dims.c = clml_shape[1]; - dims.h = clml_shape[2]; - dims.w = clml_shape[3]; - DLDataType tvm_dtype = tensor_rep.GetOpDataType()[0]; - cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); - - auto tensor_dsc = std::make_shared(); - tensor_dsc->tensor = - DeviceMakeCLMLTensor(workspace->contexts[platform_id], dims, layout, cl_dtype); - return tensor_dsc; } - /*! - * \brief Create an CLML tensor given the JSON representation. If scale - * and offset are given, then create a quantized CLML tensor. - * - * \param tensor The tensor to represent. - * \return CLML Tensor. - */ - - std::shared_ptr MakeCLMLTensorFromJSONEntry( - const JSONGraphNodeEntry& tensor, std::vector shape = {}, - cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) { - JSONGraphNode node = nodes_[tensor.id_]; - if (this->layer_.storage_map.find(tensor.id_) == this->layer_.storage_map.end()) { - void* node_data = nullptr; - if (node.GetOpType() == "const") { - node_data = data_entry_[EntryID(tensor)]->data; - } - auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape); - this->layer_.storage_map.insert({tensor.id_, std::make_pair(clml_tensor, node)}); - return clml_tensor; - } else { - return this->layer_.storage_map[tensor.id_].first; - } - } - /*! - * \brief Create an CLML tensor given the JSON representation. If scale - * and offset are given, then create a quantized CLML tensor. - * - * \param node The tensor to represent. - * \param data (optional) Constant data of input node. - * \return CLML Tensor. - */ - std::shared_ptr MakeCLMLTensorFromJSONNode( - const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, - cl_uint dtype = CL_FLOAT, void* data = nullptr, std::vector shape = {}) { - return MakeCLMLTensor(node, data, shape, layout, dtype); - } /*! * \brief Create a 2D convolution layer. 
* @@ -807,8 +864,7 @@ class CLMLRuntime : public JSONRuntimeBase { } else { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, - &layer_.unusedTensor); + result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; bias->tensor = layer_.unusedTensor; } @@ -827,14 +883,14 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_qcom op = nullptr; if (!has_bn) { if (!has_act) { - result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - workspace->contexts[platform_id], nullptr, &conv_desc, input->tensor, weight->tensor, - bias->tensor, output->tensor, &op, nullptr); + result = CLML_INTF->clCreateMLOpConvolutionForwardQCOM( + CLML_CTX, nullptr, &conv_desc, input->tensor, weight->tensor, bias->tensor, + output->tensor, &op, nullptr); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { - result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM( - workspace->contexts[platform_id], nullptr, &conv_desc, &act_desc, input->tensor, - weight->tensor, bias->tensor, nullptr, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpFusedConvolutionActivationForwardQCOM( + CLML_CTX, nullptr, &conv_desc, &act_desc, input->tensor, weight->tensor, bias->tensor, + nullptr, output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } layer_.func_ins.push_back(input); @@ -842,7 +898,7 @@ class CLMLRuntime : public JSONRuntimeBase { } else { int bn_index = has_bias ? 3 : 2; int axis = std::stoi(node.GetAttr>("batchnorm")[0]); - auto bn_dims = get_tensor_dims(nodes_[inputs[bn_index].id_]); + auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]); std::vector bn_shape = {1, 1, 1, 1}; bn_shape[axis] = bn_dims.n; auto bn_mean = std::make_shared(); @@ -860,16 +916,16 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; if (!has_act) { - result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM( - workspace->contexts[platform_id], nullptr, &conv_desc, &bn_desc, input->tensor, - weight->tensor, bias->tensor, output->tensor, bn_mean->tensor, bn_var->tensor, - bn_scale->tensor, bn_bias->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpFusedConvolutionBatchNormForwardQCOM( + CLML_CTX, nullptr, &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, + output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, + layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { - result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM( - workspace->contexts[platform_id], nullptr, &conv_desc, &bn_desc, &act_desc, - input->tensor, weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor, - bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM( + CLML_CTX, nullptr, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor, + bias->tensor, output->tensor, nullptr, bn_mean->tensor, bn_var->tensor, + bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } @@ -902,13 +958,12 @@ class 
CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, - &layer_.unusedTensor); + result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM( - workspace->contexts[platform_id], nullptr, &act_desc, input->tensor, layer_.unusedTensor, - output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpActivationForwardQCOM(CLML_CTX, nullptr, &act_desc, + input->tensor, layer_.unusedTensor, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result; layer_.func_ins.push_back(input); @@ -940,7 +995,7 @@ class CLMLRuntime : public JSONRuntimeBase { opProperties.push_back(*reinterpret_cast(&epsilon)); opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM); - auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]); + auto bn_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]); std::vector bn_shape = {1, 1, 1, 1}; bn_shape[axis] = bn_dims.n; auto bn_mean = std::make_shared(); @@ -960,10 +1015,9 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM( - workspace->contexts[platform_id], opProperties.data(), &bn_desc, input->tensor, - bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, output->tensor, &op, - tuning_cache); + result = CLML_INTF->clCreateMLOpBatchNormForwardQCOM( + CLML_CTX, opProperties.data(), &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, + bn_scale->tensor, bn_bias->tensor, output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result; layer->function.push_back(op); @@ -1012,13 +1066,12 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; cl_ml_tensor_qcom unusedTensor = nullptr; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, - &unusedTensor); + result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &unusedTensor); ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->contexts[platform_id], nullptr, - &pool_desc, input->tensor, unusedTensor, - output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpPoolingForwardQCOM(CLML_CTX, nullptr, &pool_desc, input->tensor, + unusedTensor, output->tensor, &op, + layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; layer_.func_ins.push_back(input); @@ -1044,7 +1097,7 @@ class CLMLRuntime : public JSONRuntimeBase { auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); + auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]); cl_ml_op_pooling_desc_qcom pool_desc = { node.GetOpName() == "nn.global_max_pool2d" ? 
CL_POOLING_MODE_MAX_QCOM : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM, @@ -1059,13 +1112,12 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, - &layer_.unusedTensor); + result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM( - workspace->contexts[platform_id], nullptr, &pool_desc, input->tensor, layer_.unusedTensor, - output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpPoolingForwardQCOM(CLML_CTX, nullptr, &pool_desc, input->tensor, + layer_.unusedTensor, output->tensor, &op, + layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; layer_.func_ins.push_back(input); @@ -1088,16 +1140,15 @@ class CLMLRuntime : public JSONRuntimeBase { cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); + auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr, {out_dims.n, out_dims.c, 1, 1}); cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->contexts[platform_id], nullptr, - &softmax_desc, input->tensor, output->tensor, &op, - tuning_cache); + result = CLML_INTF->clCreateMLOpSoftmaxQCOM(CLML_CTX, nullptr, &softmax_desc, input->tensor, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result; layer_.func_ins.push_back(input); @@ -1142,8 +1193,8 @@ class CLMLRuntime : public JSONRuntimeBase { {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->contexts[platform_id], nullptr, &pad_desc, - input->tensor, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpPadQCOM(CLML_CTX, nullptr, &pad_desc, input->tensor, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result; layer_.func_ins.push_back(input); @@ -1167,8 +1218,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->contexts[platform_id], nullptr, - input->tensor, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, input->tensor, output->tensor, + &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result; layer_.func_ins.push_back(input); @@ -1192,8 +1243,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->contexts[platform_id], nullptr, - input->tensor, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, input->tensor, output->tensor, + &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << 
"Reshape Error:" << result; layer_.func_ins.push_back(input); @@ -1227,9 +1278,8 @@ class CLMLRuntime : public JSONRuntimeBase { } cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode}; - result = - h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->contexts[platform_id], nullptr, &concatDesc, - concatInputs, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpConcatQCOM(CLML_CTX, nullptr, &concatDesc, concatInputs, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result; layer->function.push_back(op); @@ -1252,44 +1302,82 @@ class CLMLRuntime : public JSONRuntimeBase { DLDataType tvm_dtype = node.GetOpDataType()[0]; cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); - auto inp_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {1, inp_dims.c, 1, 1}, - CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]); - bool has_bias = node.GetInputs().size() == 3 ? true : false; - auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {wt_dims.n, wt_dims.c, 1, 1}, - CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]); + auto input = + MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype); + auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]); + auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c}, + CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype); + cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM; + if (in_dims.c == wt_dims.c) { + b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM; + } + cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n, // m + wt_dims.n, // n + wt_dims.c, // k + CL_GEMM_TRANSFORM_NONE_QCOM, // A transform + b_transform, // B transform + {{1.0}, CL_FLOAT}, // alpha + {{0.0}, CL_FLOAT}, // beta + cl_arithmetic_mode}; + + result = CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, input->tensor, weight->tensor, + output->tensor, &op, layer_.tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "Dense Error:" << result; - auto bias = std::make_shared(); - if (has_bias) { - auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]); - bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, - cl_dtype); - } else { - cl_ml_tensor_desc_qcom desc = {}; - desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], nullptr, &desc, - &layer_.unusedTensor); - ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; - bias->tensor = layer_.unusedTensor; + layer->function.push_back(op); + layer_.func_ins.push_back(input); + return output; + } + + /*! + * \brief Create a batch_matmul layer. + * + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function. + * \param node The JSON representation of the operator. 
+ */ + std::shared_ptr CreateBatchMatmulLayer(CachedLayer* layer, + const JSONGraphNode& node, int nid) { + cl_int result = 0; + cl_ml_op_qcom op = nullptr; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]); + auto input = + MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {in_dims.c, in_dims.h}, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype); + auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]); + auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.c, wt_dims.h}, + CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype); + + std::vector out_shape = node.GetOpShape()[0]; + std::vector clml_out_shape; + clml_out_shape.push_back(out_shape[1]); + clml_out_shape.push_back(out_shape[2]); + clml_out_shape.push_back(1); + clml_out_shape.push_back(1); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype, nullptr, clml_out_shape); + layer->out_shapes.insert({nid, clml_out_shape}); + + cl_bool b_transpose = std::stoi(node.GetAttr>("transpose_b")[0]); + cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM; + if (b_transpose) { + b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM; } - // Output - auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr, - {1, wt_dims.n, 1, 1}); - cl_ml_op_convolution_desc_qcom conv_desc = {CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, - 1, - 4, - {0, 0}, - {0, 0}, - {1, 1}, - {1, 1}, - 0, - cl_arithmetic_mode}; - - result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - workspace->contexts[platform_id], nullptr, &conv_desc, input->tensor, weight->tensor, - bias->tensor, output->tensor, &op, nullptr); - ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result; + cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.c, // m + wt_dims.c, // n + wt_dims.h, // k + CL_GEMM_TRANSFORM_NONE_QCOM, // A transform + b_transform, // B transform + {{1.0}, CL_FLOAT}, // alpha + {{0.0}, CL_FLOAT}, // beta + cl_arithmetic_mode}; + + result = CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, input->tensor, weight->tensor, + output->tensor, &op, layer_.tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "BatchMatmul Error:" << result; layer->function.push_back(op); layer_.func_ins.push_back(input); @@ -1318,8 +1406,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_clip_desc_qcom clip_desc = { CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->contexts[platform_id], nullptr, &clip_desc, - input->tensor, output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, input->tensor, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result; layer_.func_ins.push_back(input); @@ -1360,9 +1448,9 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_binary_desc_qcom add_desc = { binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->contexts[platform_id], nullptr, - &add_desc, input_a->tensor, input_b->tensor, - output->tensor, &op, tuning_cache); + result = CLML_INTF->clCreateMLOpBinaryQCOM(CLML_CTX, nullptr, &add_desc, input_a->tensor, + input_b->tensor, output->tensor, &op, + layer_.tuning_cache); ICHECK(op && result == 
CL_SUCCESS) << op_name << " Node Error:" << result; layer_.func_ins.push_back(input_a); @@ -1390,9 +1478,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_uint block_size = std::stoi(node.GetAttr>("block_size")[0]); cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpDepthToSpaceQCOM(workspace->contexts[platform_id], nullptr, - &dtos_desc, input->tensor, output->tensor, - &op, tuning_cache); + result = CLML_INTF->clCreateMLOpDepthToSpaceQCOM(CLML_CTX, nullptr, &dtos_desc, input->tensor, + output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "DepthToSpace Layer Error:" << result; layer_.func_ins.push_back(input); @@ -1419,9 +1506,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_bool align_corners = std::stoi(node.GetAttr>("align_corners")[0]); cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpResizeBilinearQCOM(workspace->contexts[platform_id], nullptr, - &resize_desc, input->tensor, output->tensor, - &op, tuning_cache); + result = CLML_INTF->clCreateMLOpResizeBilinearQCOM( + CLML_CTX, nullptr, &resize_desc, input->tensor, output->tensor, &op, layer_.tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Resize Layer Error:" << result; layer_.func_ins.push_back(input); @@ -1434,16 +1520,12 @@ class CLMLRuntime : public JSONRuntimeBase { * \note Currently only supports a single layer. */ + // This layer instance CachedLayer layer_; - // CLML Context - GET_ML_API_INTERFACE* h_ClmlIntf = nullptr; - cl::OpenCLWorkspace* workspace = nullptr; - cl::OpenCLThreadEntry* tentry = nullptr; - cl_device_id device_id; - cl_platform_id platform_id; - cl_ml_tuningcache_qcom tuning_cache = nullptr; - bool is_tuning_run; - char* tuning_file; + + // CLML Workspace + CLMLWorkspace* cws; + #else void Run() override { LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. " diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h new file mode 100644 index 0000000000000..0f3f76f79544b --- /dev/null +++ b/src/runtime/contrib/clml/clml_runtime.h @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/runtime/contrib/clml/clml_runtime.h + * \brief CLML header + */ +#ifndef TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_ +#define TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../file_utils.h" +#include "../../opencl/opencl_common.h" +#include "../../thread_storage_scope.h" +#include "../json/json_node.h" +#include "../json/json_runtime.h" + +#ifdef TVM_GRAPH_EXECUTOR_CLML +#include + +#define CAT_I(a, b) a##b +#define CAT(a, b) CAT_I(a, b) +#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM) +#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM) + +/*! \brief Magic number for CLML Tuning cache entry */ +static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45; + +#define DEBUG_MEMORY_ALLOC false +#define DEBUG_STATS false +#define LOG_MEM LOG_IF(WARNING, DEBUG_MEMORY_ALLOC) +#define LOG_STATS LOG_IF(WARNING, DEBUG_STATS) + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +class CLMLThreadEntry; + +/*! + * \brief CLML workspace. + */ +class CLMLWorkspace { + public: + // Constructor + CLMLWorkspace(); + /*! + * \brief Get the thread local ThreadEntry + */ + virtual CLMLThreadEntry* GetThreadEntry(); + + // CLML Context + GET_ML_API_INTERFACE* h_ClmlIntf = nullptr; + cl::OpenCLWorkspace* workspace = nullptr; + cl::OpenCLThreadEntry* tentry = nullptr; + cl_device_id device_id; + cl_platform_id platform_id; + + // Tuning Support + bool is_tuning_run; + char* tuning_file; + + // Recordable Queues + bool is_recordable_queue = false; + + // On chip memory support + bool is_on_chip_memory = false; + + // On chip memory size + size_t onchip_mem_size = 0; + + // get the global workspace + static CLMLWorkspace* Global(); + + bool ExtensionStringPresent(std::string extn); + + // DDR memory management + std::map> ddr_global_pool; // buf, size and ref count +}; + +/*! \brief Thread local workspace */ +class CLMLThreadEntry { + public: + // get the global workspace + static CLMLThreadEntry* ThreadLocal(); +}; + +/*! + * \brief CLML objects we cache in order to avoid needing to construct + * a new layer each time. + */ +struct CachedLayer { + std::vector function; + std::map> inputs; + std::map> in_placeholder; + std::vector> outputs; + std::vector> out_placeholder; + std::map> out_shapes; + std::vector> func_outs; + std::vector> func_ins; + std::map, JSONGraphNode>> + storage_map; + std::vector tensorMemDescs; + std::vector in_tensorMemDescs; + std::vector out_tensorMemDescs; + cl_ml_tensor_mem_desc_set_qcom descriptorSet; + std::vector layer_names; + cl_ml_tensor_qcom unusedTensor = nullptr; + cl_ml_tuningcache_qcom tuning_cache = nullptr; + // Memory management + std::map storage_ref_map; // NodeId & ref. count + // Activation node id & life span (the layer after which we can free). 
+ std::map life_span; + std::map on_chip_pool_size; // Mem start & size + std::map on_chip_pool_alloc_info; // Mem start & node_id + std::map> on_chip_alloc_plan; // Final Alloc Plan + std::map on_chip_reject; // On-Chip reject info + bool alloc_ping_pong; // Allocation strategy + int in_chip_total_free; // Total available + int in_chip_total_alloc; // Total allocated + int on_chip_alert_fail; // Failure due to fragmentation + + // DDR memory planner + std::map> ddr_storage_ref_map; // local pool reference count + std::map ddr_alloc_plan; // allocation map + + cl_command_queue recordable_queue = nullptr; + cl_recording_qcom recording = nullptr; +}; + +struct tensor_dims_t { + uint32_t n, c, h, w; +}; + +#define CLML_INTF CLMLWorkspace::Global()->h_ClmlIntf +#define CLML_QUEUE \ + CLMLWorkspace::Global()->workspace->GetQueue(CLMLWorkspace::Global()->tentry->device) +#define CLML_CTX CLMLWorkspace::Global()->workspace->contexts[CLMLWorkspace::Global()->platform_id] + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_GRAPH_EXECUTOR_CLML +#endif // TVM_RUNTIME_CONTRIB_CLML_CLML_RUNTIME_H_ diff --git a/src/runtime/contrib/clml/clml_utils.cc b/src/runtime/contrib/clml/clml_utils.cc new file mode 100644 index 0000000000000..034ee5c14c81c --- /dev/null +++ b/src/runtime/contrib/clml/clml_utils.cc @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/clml/clml_utils.cc + * \brief Utilities. + */ +#ifdef TVM_GRAPH_EXECUTOR_CLML +#include "clml_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +/*! + * \brief Copy utility to CLML Tensor. + * + * \param tensor CLML tensor descriptor + * \param data pointer to host data + * \param layout host data layout + */ +void CopyDataToCLMLTensor(std::shared_ptr tensor, void* data, + cl_ml_tensor_layout_qcom layout) { + cl_int result = 0; + cl_event evt = nullptr; + result = CLML_INTF->clEnqueueWriteMLTensorDataQCOM(CLML_QUEUE, data, layout, tensor->tensor, + tensor->memory, + 0, // n waitlist + nullptr, // waitlist + &evt); // event + ICHECK((evt != nullptr) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result; +} + +/*! + * \brief Copy utility from CLML tensor.
+ * + * \param tensor CLML tensor descriptor + * \param data pointer to host data + * \param layout expected host data layout + */ +void CopyDataFromCLMLTensor(std::shared_ptr tensor, void* data, + cl_ml_tensor_layout_qcom layout) { + cl_int result = 0; + cl_event readEvent = nullptr; + // Read the output tensor + result = CLML_INTF->clEnqueueReadMLTensorDataQCOM(CLML_QUEUE, tensor->tensor, tensor->memory, + data, layout, + 0, // n waitlist + nullptr, // waitlist + &readEvent); // event + ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result; + + result = clWaitForEvents(1, &readEvent); + ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result; +} + +/*! + * \brief Make a CLML tensor given its attributes + * + * \param context OpenCL context + * \param dims Tensor dimensions + * \param layout CLML tensor layout of tensor + * \param dtype Tensor data type + * \return CLML tensor + */ +cl_ml_tensor_qcom DeviceMakeCLMLTensor(cl_context context, tensor_dims_t dims, + cl_ml_tensor_layout_qcom layout, cl_channel_type dtype) { + cl_ml_tensor_qcom tensor; + cl_int result = CL_OUT_OF_RESOURCES; + + cl_ml_tensor_desc_qcom desc = { + dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {0}}; + result = CLML_INTF->clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, &tensor); + ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; + (void)result; + return tensor; +} + +/*! + * \brief Utility that allocates DDR backed memory for the tensor. + * + * \param size buffer size in bytes + * \return allocated cl_mem object + */ +cl_mem AllocateDDRTensorMemory(size_t size) { + cl_int result = CL_OUT_OF_HOST_MEMORY; + cl_mem buffer = nullptr; + + buffer = clCreateBuffer(CLML_CTX, CL_MEM_READ_WRITE, size, nullptr, &result); + ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result; + + return buffer; +} + +/*! + * \brief Utility that allocates on chip backed memory for the tensor. + * + * \param size buffer size in bytes + * \param on_chip_mem_offset on chip memory offset to be used for allocation + * \return allocated cl_mem object + */ +cl_mem AllocateOnChipTensorMemory(size_t size, cl_uint on_chip_mem_offset) { + cl_int result = CL_OUT_OF_HOST_MEMORY; + cl_mem buffer = nullptr; + + cl_mem_properties on_chip_buff_prop[] = {CL_MEM_ONCHIP_GLOBAL_QCOM, 1, + CL_MEM_ONCHIP_GLOBAL_OFFSET_QCOM, on_chip_mem_offset, 0}; + LOG_MEM << "On-Chip Alloc:" << size << " Offset:" << on_chip_mem_offset; + buffer = clCreateBufferWithProperties(CLML_CTX, on_chip_buff_prop, CL_MEM_READ_WRITE, size, + nullptr, &result); + ICHECK(result == CL_SUCCESS) << "clCreateBufferWithProperties:" << result; + + return buffer; +} + +/*! + * \brief Utility to extract tensor dimensions from JSON node. + * + * \param node JSON graph node + * \return The CLML tensor dimension + */ +tensor_dims_t GetTensorDims(const JSONGraphNode& node) { + std::vector shape = node.GetOpShape()[0]; + tensor_dims_t dims; + dims.n = shape[0]; + dims.c = shape[1]; + dims.h = shape[2]; + dims.w = shape[3]; + return dims; +} + +/*! + * \brief Utility to map TVM data type to OpenCL channel type. + * + * \param data_type TVM DType + * \return OpenCL channel type.
+ */ +cl_channel_type MakeCLDataType(const DLDataType& data_type) { + if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) { + return CL_FLOAT; + } else if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 16) { + return CL_HALF_FLOAT; + } else { + LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime"; + } +} + +/*! + * \brief Utility to map OpenCL types to CLML operator arithmetic mode. + * + * \param data_type cl data type + * \param acc_type accumulation type to be used + * \return the operator arithmetic mode + */ +cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type, + const cl_channel_type& acc_type) { + if (data_type == CL_FLOAT && acc_type == CL_FLOAT) { + return CL_ARITHMETIC_MODE_FP32_QCOM; + } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) { + return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM; + } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) { + return CL_ARITHMETIC_MODE_FP16_QCOM; + } else { + LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime"; + } +} + +/*! + * \brief Helper to create a CLML tensor descriptor, padding the shape to 4D. + * + * \param tensor_rep The tensor to represent. + * \param data data pointer to prefill the tensor + * \param c_shape shape information of tensor + * \param layout the tensor layout to be used + * \param dtype tensor data type + * \return CLML Tensor descriptor. + */ +std::shared_ptr MakeCLMLTensor(const JSONGraphNode& tensor_rep, + void* data, + std::vector c_shape, + cl_ml_tensor_layout_qcom layout, + cl_uint dtype) { + std::vector shape = tensor_rep.GetOpShape()[0]; + std::vector clml_shape(shape.begin(), shape.end()); + if (c_shape.size() > 0) { + clml_shape = c_shape; + } + // Make sure the tensors with dimensions less than 4 are padded with 1. + clml_shape.push_back(1); + clml_shape.push_back(1); + clml_shape.push_back(1); + + tensor_dims_t dims; + dims.n = clml_shape[0]; + dims.c = clml_shape[1]; + dims.h = clml_shape[2]; + dims.w = clml_shape[3]; + + auto tensor_dsc = std::make_shared(); + tensor_dsc->tensor = DeviceMakeCLMLTensor(CLML_CTX, dims, layout, dtype); + return tensor_dsc; +} + +/*! + * \brief Create a CLML tensor given the JSON Node representation. + * + * \param node The tensor to represent. + * \param layout the tensor layout to be used + * \param dtype tensor data type + * \param data data pointer to prefill the tensor + * \param shape shape information of tensor + * \return CLML Tensor descriptor. + */ +std::shared_ptr MakeCLMLTensorFromJSONNode( + const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data, + std::vector shape) { + return MakeCLMLTensor(node, data, shape, layout, dtype); +} + +/*! + * \brief Utility function to extract vector values from strings. + * + * \param val vector of strings + * \return vector of cl_uints. + */ +std::vector GetVectorValues(const std::vector& val) { + std::vector array; + for (auto i : val) { + array.push_back((cl_uint)stoi(i)); + } + return array; +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif diff --git a/src/runtime/contrib/clml/clml_utils.h b/src/runtime/contrib/clml/clml_utils.h new file mode 100644 index 0000000000000..e3cdb32474467 --- /dev/null +++ b/src/runtime/contrib/clml/clml_utils.h @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/clml/clml_utils.h + * \brief CLML utilities header + */ +#ifndef TVM_RUNTIME_CONTRIB_CLML_CLML_UTILS_H_ +#define TVM_RUNTIME_CONTRIB_CLML_CLML_UTILS_H_ + +#include "clml_runtime.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +void CopyDataToCLMLTensor(std::shared_ptr tensor, void* data, + cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM); + +void CopyDataFromCLMLTensor(std::shared_ptr tensor, void* data, + cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM); + +cl_ml_tensor_qcom DeviceMakeCLMLTensor( + cl_context context, tensor_dims_t dims, + cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_channel_type dtype = CL_FLOAT); + +cl_mem AllocateOnChipTensorMemory(size_t size, cl_uint on_chip_mem_offset); + +cl_mem AllocateDDRTensorMemory(size_t size); + +tensor_dims_t GetTensorDims(const JSONGraphNode& node); + +cl_channel_type MakeCLDataType(const DLDataType& data_type); + +cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type, + const cl_channel_type& acc_type = CL_FLOAT); + +std::shared_ptr MakeCLMLTensor(const JSONGraphNode& tensor_rep, + void* data, + std::vector c_shape, + cl_ml_tensor_layout_qcom layout, + cl_uint dtype); + +std::shared_ptr MakeCLMLTensorFromJSONNode( + const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout, cl_uint dtype, void* data = nullptr, + std::vector shape = {}); + +std::vector GetVectorValues(const std::vector& val); + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_CLML_CLML_UTILS_H_ diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py index 1b9cbdac63b55..42dcf083d02da 100644 --- a/tests/python/contrib/test_clml/infrastructure.py +++ b/tests/python/contrib/test_clml/infrastructure.py @@ -120,6 +120,25 @@ def visit_call(self, call): return c.count +def get_non_cpu_op_count(mod): + """Traverse graph counting ops not offloaded to TVM.""" + + class Counter(tvm.relay.ExprVisitor): + def __init__(self): + super().__init__() + self.count = 0 + + def visit_call(self, call): + if not isinstance(call.op, tvm.ir.Op): + self.count += 1 + + super().visit_call(call) + + c = Counter() + c.visit(mod["main"]) + return c.count + + def skip_codegen_test(): """Skip test if it requires the CLML codegen and it's not present.""" if not tvm.get_global_func("relay.ext.clml", True): diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py index 6cb90e7af00f5..e59a73a485ab4 100644 --- a/tests/python/contrib/test_clml/test_ops.py +++ b/tests/python/contrib/test_clml/test_ops.py @@ -562,34 +562,49 @@ def _get_model(x_shape, 
k_shape, has_bias=False): "op": "const", }, ] - if has_bias: - bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) - out = relay.nn.bias_add(out, bias) - bias_node = { - "attrs": { - "dtype": [[dtype]], - "shape": [[list((1, k_shape[0]))]], - }, - "name": "", - "op": "const", - } - exp_codegen.append(bias_node) - params["bias"] = tvm.nd.array(np.random.uniform(-1, 1, (k_shape[0],)).astype(dtype)) dense_node = { "attrs": { - "num_inputs": "3" if has_bias else "2", + "num_inputs": "2", "num_outputs": "1", "dtype": [[dtype]], "out_dtype": [[""]], "shape": [[[x_shape[0], k_shape[0]]]], "units": [[str(k_shape[0])]], }, - "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] if has_bias else [[0, 0, 0], [1, 0, 0]], + "inputs": [[0, 0, 0], [1, 0, 0]], "name": "nn.dense", "op": "kernel", } exp_codegen.append(dense_node) + + if has_bias: + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(out, bias) + bias_data_node = { + "attrs": { + "dtype": [[dtype]], + "shape": [[list((1, k_shape[0]))]], + }, + "name": "", + "op": "const", + } + exp_codegen.append(bias_data_node) + bias_node = { + "attrs": { + "num_inputs": "2", + "num_outputs": "1", + "dtype": [[dtype]], + "shape": [[[x_shape[0], k_shape[0]]]], + }, + "inputs": [[2, 0, 0], [3, 0, 0]], + "name": "add", + "op": "kernel", + } + exp_codegen.append(bias_node) + + params["bias"] = tvm.nd.array(np.random.uniform(-1, 1, (k_shape[0],)).astype(dtype)) + return out, params, inputs, exp_codegen def _verify(out, params, inputs, exp_codegen): @@ -597,11 +612,11 @@ def _verify(out, params, inputs, exp_codegen): opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] tvm.testing.assert_allclose( - clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-2, atol=1e-2 ) verify_codegen(out, exp_codegen, device, params) - _verify(*(_get_model((1, 16), (32, 16)))) + _verify(*(_get_model((5, 16), (32, 16), False))) _verify(*(_get_model((1, 16), (32, 16), True))) @@ -775,5 +790,66 @@ def _verify(out, params, inputs): _verify(*(_get_model((1, 16, 7, 7), (2, 2), True))) +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def test_batch_matmul(device, dtype): + def _get_model(a_shape, b_shape, a_transpose, b_transpose): + a = relay.var("a", shape=(a_shape), dtype=dtype) + b = relay.var("b", shape=(b_shape), dtype=dtype) + out = relay.nn.batch_matmul(a, b, transpose_a=a_transpose, transpose_b=b_transpose) + inputs = { + "a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(-1, 1, b_shape).astype(dtype)), + } + params = {} + return out, params, inputs + + def _verify(out, params, inputs): + mod = IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + # Check to make sure these ops are offloaded to CLML instead of TVM. 
+ exp_codegen = [ + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(inputs["a"].shape)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(inputs["b"].shape)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "transpose_a": [[str(int(out.attrs.transpose_a))]], + "transpose_b": [[str(int(out.attrs.transpose_b))]], + "out_dtype": [[""]], + "dtype": [[dtype]], + "num_inputs": "2", + "num_outputs": "1", + "shape": [[list(clml_out[0].shape)]], + }, + "inputs": [[0, 0, 0], [1, 0, 0]], + "name": "nn.batch_matmul", + "op": "kernel", + }, + ] + verify_codegen(out, exp_codegen, device, params) + + _verify(*(_get_model((1, 128, 32), (1, 128, 32), False, True))) + _verify(*(_get_model((1, 128, 128), (1, 32, 128), False, True))) + + if __name__ == "__main__": tvm.testing.main()
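
Note on the GEMM lowering used for nn.dense above (MakeDense in the cpp_clml runner and CreateDenseLayer in the runtime): the op is described to the SDK with m taken from the input batch, n from the number of units, and k from the input feature size, and the B operand is flagged transposed when the weights arrive as [units, in_features]. Below is a minimal plain C++ sketch of that shape mapping only, useful for reasoning about the expected output without a device; DenseAsGemm is a hypothetical name and no CLML calls are made.

#include <cassert>
#include <cstdio>
#include <vector>

// Plain C++ illustration of the GEMM shape mapping used for nn.dense:
// A is [m, k] = [batch, in_features]; relay weights arrive as [units, in_features],
// so B is treated as [n, k] and consumed transposed, giving C = A * B^T of shape [m, n].
std::vector<float> DenseAsGemm(const std::vector<float>& a, const std::vector<float>& w,
                               int batch, int in_features, int units) {
  const int m = batch, n = units, k = in_features;
  std::vector<float> c(m * n, 0.0f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        c[i * n + j] += a[i * k + p] * w[j * k + p];  // w indexed as [j][p]: the B transpose
  return c;
}

int main() {
  // input [2, 3] against weights [4, 3] produces an output of shape [2, 4]
  std::vector<float> a = {1, 2, 3, 4, 5, 6};
  std::vector<float> w(4 * 3, 1.0f);
  std::vector<float> c = DenseAsGemm(a, w, 2, 3, 4);
  assert(c.size() == 2 * 4);
  std::printf("row0 starts with %g, row1 starts with %g\n", c[0], c[4]);  // prints 6 and 15
  return 0;
}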
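
Note on the DDR memory planner statistics printed before descriptor-set creation (global/local pool sizes and reference counts) and on the ddr_storage_ref_map and ddr_alloc_plan fields of CachedLayer: the planner keeps a pool of buffers and lets several tensor descriptors with non-overlapping lifetimes share one underlying allocation, returning a buffer to the pool once its count of pending consumers drops to zero. The sketch below only illustrates that bookkeeping under those assumptions; DdrPoolSketch, Request and Release are hypothetical names, integer handles stand in for cl_mem, and the actual planner lives in clml_memory_planner.cc, which is not reproduced here.

#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>

// Illustrative reference-counted pool: a tensor asks for a buffer with a known
// number of pending consumers; the buffer returns to the free list once every
// consumer has executed, so a later tensor with a compatible size can reuse it.
struct DdrPoolSketch {
  int next_handle = 0;
  std::map<int, size_t> free_list;               // handle -> size
  std::map<int, std::pair<size_t, int>> in_use;  // handle -> (size, pending consumers)

  int Request(size_t size, int consumers) {
    for (auto it = free_list.begin(); it != free_list.end(); ++it) {
      if (it->second >= size) {  // reuse an existing allocation that is large enough
        int handle = it->first;
        in_use[handle] = std::make_pair(it->second, consumers);
        free_list.erase(it);
        return handle;
      }
    }
    int handle = next_handle++;  // no fit found: grow the pool with a fresh allocation
    in_use[handle] = std::make_pair(size, consumers);
    return handle;
  }

  void Release(int handle) {  // one consumer has finished with this buffer
    auto& entry = in_use[handle];
    if (--entry.second == 0) {
      free_list[handle] = entry.first;
      in_use.erase(handle);
    }
  }
};

int main() {
  DdrPoolSketch pool;
  int t0 = pool.Request(1024, 1);  // layer0 output, read by exactly one consumer
  pool.Release(t0);                // that consumer ran, so the memory is free again
  int t1 = pool.Request(512, 2);   // a later, smaller tensor reuses the same buffer
  std::printf("t0=%d t1=%d reused=%d\n", t0, t1, static_cast<int>(t0 == t1));
  return 0;
}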