diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/main.py b/examples/image_recognition/resnet50/quantization/ptq_static/main.py index cc82d49b4..182c426a0 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/main.py +++ b/examples/image_recognition/resnet50/quantization/ptq_static/main.py @@ -30,8 +30,8 @@ from PIL import Image from sklearn import metrics -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/bert/quantization/ptq_dynamic/main.py b/examples/nlp/bert/quantization/ptq_dynamic/main.py index 2a2b97817..17cb3a5b2 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/main.py +++ b/examples/nlp/bert/quantization/ptq_dynamic/main.py @@ -34,8 +34,7 @@ from onnxruntime.transformers.fusion_options import FusionOptions from torch.utils import data -from onnx_neural_compressor import config -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/bert/quantization/ptq_static/main.py b/examples/nlp/bert/quantization/ptq_static/main.py index d1bb34b77..1984bed85 100644 --- a/examples/nlp/bert/quantization/ptq_static/main.py +++ b/examples/nlp/bert/quantization/ptq_static/main.py @@ -34,8 +34,8 @@ from onnxruntime.transformers.fusion_options import FusionOptions from torch.utils import data -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 9cafe62d3..7c18b5f35 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -33,8 +33,8 @@ from torch.nn import functional from torch.utils import data -from onnx_neural_compressor import config, data_reader, logger, utility -from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 1130981e7..aa71dd3aa 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -23,7 +23,6 @@ import onnxruntime as ort from onnx_neural_compressor import data_reader, logger, onnx_model -from onnx_neural_compressor.algorithms import utility as quant_utils from typing import Callable, List, Union # isort: skip @@ -48,7 +47,7 @@ def layer_wise_quant( _type_: _description_ """ # check whether model shape is inferred - if not 
quant_utils.check_model_with_infer_shapes(model): + if not _check_model_with_infer_shapes(model): logger.error( "Before applying layer-wise quantization, please make sure to " "run symbolic shape inference on your model like follows:\n" @@ -276,3 +275,13 @@ def _prepare_data_reader_for_next_split_model( inputs.update({name: value for name, value in zip(output_names, out)}) data_reader_for_next_split_model.append(inputs) return DataReader(data_reader_for_next_split_model) + +def _check_model_with_infer_shapes(model): + """Check if the model has been shape inferred.""" + if isinstance(model, (pathlib.Path, str)): + model = onnx.load(model, load_external_data=False) + elif isinstance(model, onnx_model.ONNXModel): + model = model.model + if len(model.graph.value_info) > 0: + return True + return False diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py index 40e3b9645..af14c3562 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py @@ -28,7 +28,6 @@ import numpy as np import onnx import onnxruntime -from onnxruntime import quantization as ort_quant from packaging import version from onnx_neural_compressor import logger, onnx_model @@ -279,9 +278,9 @@ def _collect_data(inputs): node_name = name_to_node[node_output_names[output_idx]] if node_output_names[output_idx] not in name_to_calibrator: calib_method = ( - q_config[node_name]["calibrate_method"].name + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config - else ort_quant.CalibrationMethod.MinMax.name + else 0 ) assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format( calib_method @@ -294,7 +293,7 @@ def _collect_data(inputs): # the calibration method is minmax, otherwise the tensor data is collected. # TODO: for entropy and percentile method, need to support range collection # per iteration in the future. 
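For context: the calibrate.py hunks above, and the calibrator.py hunks that follow, drop onnxruntime's CalibrationMethod enum in favor of plain integer codes as registry keys (0 for MinMax, 1 for Entropy, 2 for Percentile, per the calib_registry decorators below). A minimal sketch of how an integer-keyed lookup against such a registry behaves; the helper name resolve_calibrator and the standalone dict are illustrative only, not part of the patch:

# Integer calibration-method codes used by this patch.
CALIB_METHOD_NAMES = {0: "MinMax", 1: "Entropy", 2: "Percentile"}

def resolve_calibrator(registry, calib_method=0):
    # Instantiate the calibrator class registered under the integer code,
    # mirroring the "is not registered" check used in calibrate.py above.
    if calib_method not in registry:
        raise KeyError("Calibration method {} is not registered.".format(calib_method))
    return registry[calib_method]()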
- if _calibrator.method_name == ort_quant.CalibrationMethod.MinMax.name: + if _calibrator.method_name == "MinMax": _calibrator.collect(output) activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = _calibrator @@ -325,9 +324,9 @@ def _collect_data(inputs): if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip continue calib_method = ( - q_config[node_name]["calibrate_method"].name + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config - else ort_quant.CalibrationMethod.MinMax.name + else 0 ) _calibrator = calibrator.CALIBRATOR[calib_method]() _calibrator.collect(datas) @@ -396,9 +395,7 @@ def get_weight_tensors_calib_range(self): os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else "" ), ) - _calibrator = calibrator.CALIBRATOR[ - ort_quant.CalibrationMethod.MinMax.name - ]() # use minmax method to calibrate initializer tensors + _calibrator = calibrator.CALIBRATOR[0]() # use minmax method to calibrate initializer tensors if initializer_tensor.flatten().size > 0: _calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)] @@ -598,13 +595,12 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): node_thresholds[1], sym, qType, - quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym), ) quantization_params[tensor_name] = node_params return quantization_params - def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range): + def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType): """Given the source and destination node of tensor, return calculated zero point and scales.""" zp_and_scale = [] # adjust rmin and rmax such that 0 is included in the range. This is required @@ -640,7 +636,7 @@ def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType rmin = min(rmin, clip_params[0], clip_params[1]) rmax = max(rmax, clip_params[0], clip_params[1]) - scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) + scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range) zp_and_scale.append(zp) zp_and_scale.append(scale) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py index 042518092..97506b0d2 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py @@ -36,7 +36,7 @@ def decorator_calib(cls): ), "The name of subclass of Calibrator should end with 'Calibrator' substring." if cls.__name__[: -len("Calibrator")] in CALIBRATOR: # pragma: no cover raise ValueError("Cannot have two operators with the same name.") - CALIBRATOR[calib_method.strip()] = cls + CALIBRATOR[calib_method] = cls return cls return decorator_calib @@ -69,7 +69,7 @@ def calib_range(self): return self._calib_min, self._calib_max -@calib_registry(calib_method="MinMax") +@calib_registry(calib_method=0) class MinMaxCalibrator(CalibratorBase): """MinMax calibrator class.""" @@ -109,7 +109,7 @@ def method_name(self): return "MinMax" -@calib_registry(calib_method="Percentile") +@calib_registry(calib_method=2) class PercentileCalibrator(CalibratorBase): """Percentile calibrator class. 
@@ -163,7 +163,7 @@ def method_name(self): return "Percentile" -@calib_registry(calib_method="Entropy") +@calib_registry(calib_method=1) class EntropyCalibrator(CalibratorBase): """Entropy calibrator class. diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py index c3c97617a..4efcfd71a 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py @@ -13,9 +13,7 @@ # limitations under the License. """Base Operator.""" -from onnxruntime import quantization - -from onnx_neural_compressor import constants +from onnx_neural_compressor import constants, quantization OPERATORS = { "dynamic_quant": {}, @@ -56,7 +54,7 @@ def __init__(self, onnx_quantizer, onnx_node): True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False ) self.per_channel = False - self.calibrate_method = quantization.CalibrationMethod.MinMax + self.calibrate_method = 0 # minmax self.weight_sym = True self.weight_dtype = None self.activation_dtype = None diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py index fd851885f..d18833adc 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py @@ -57,19 +57,27 @@ def convert_check(self): def convert(self): """Convert to QOperator format.""" + # DQ-Gather-Q-DQ-op node = self.node parents = self.quantizer.model.get_parents(node) children = self.quantizer.model.get_children(node) if any([i.op_type == "DequantizeLinear" for i in parents]): - from onnx import numpy_helper inputs = [] inputs.append(parents[0].input[0]) inputs.append(node.input[1]) - gather_new_output = node.output[0] + "_quantized" + out_scale = 1.0 + out_zp = 0 + gather_new_output = node.output[0] + "_quantized" # dynamic quant output name + for child in children: + if child.op_type == "QuantizeLinear": + out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) + out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) + gather_new_output = children[0].output[0] # static quant output name + self.quantizer.remove_nodes.append(child) kwargs = {} for attribute in node.attribute: # pragma: no cover @@ -77,7 +85,7 @@ def convert(self): gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) self.quantizer.new_nodes.append(gather_node) - if any([i.op_type != "QuantizeLinear" for i in children]): # pragma: no cover + if any([i.op_type != "QuantizeLinear" for i in children]): dq_inputs = [] dq_inputs.append(gather_new_output) dq_inputs.extend(parents[0].input[1:]) @@ -86,25 +94,15 @@ def convert(self): ) self.quantizer.new_nodes.append(dq_node) - out_scale = 1.0 - out_zp = 0 - for child in children: - if child.op_type == "QuantizeLinear": - out_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[1])) - out_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[2])) - self.quantizer.remove_nodes.append(child) - for n in self.quantizer.model.get_children(child): - self.quantizer.model.replace_node_input(n, child.output[0], gather_new_output) - # int8 weight will be recalculated for the first 
time if ( any([child.op_type == "QuantizeLinear" for child in children]) and self.quantizer.model.get_initializer(parents[0].input[0]) is not None and parents[0].input[0] not in self.quantizer.recalculate_quantized_value ): - int8_tensor = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) - in_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) - in_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) + int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) + in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) + in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py index 61f7efd9e..6ffe742b5 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py @@ -76,7 +76,10 @@ def convert(self): scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) quantized_padding_constant_array = quant_utils.quantize_nparray( - self.weight_dtype, padding_constant_array, scale_value, zp_value + onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), + padding_constant_array, + scale_value, + zp_value, ) quantized_padding_constant_name = node.input[2] + "_quantized" quantized_padding_constant_initializer = onnx.numpy_helper.from_array( diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py index 97bded14f..3192b51d1 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py @@ -71,9 +71,10 @@ def convert(self): if len(node.input) > 1: # pragma: no cover quantized_input_names.extend(node.input[1:]) outputs = [] + input_name_to_nodes = self.quantizer.model.input_name_to_nodes() for output in node.output: - if output in self.quantizer.model.input_name_to_nodes(): - child = self.quantizer.model.input_name_to_nodes()[output][0] + if output in input_name_to_nodes: + child = input_name_to_nodes[output][0] if child.op_type == "QuantizeLinear": self.quantizer.remove_nodes.append(child) outputs.append(child.output[0]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py index 4e8b815e5..2596e8791 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py @@ -21,7 +21,7 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import logger, onnx_model +from onnx_neural_compressor import logger, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 
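The split.py hunk above (and similar AWQ changes later in this patch) hoists model.input_name_to_nodes() out of the per-output loop so the tensor-name-to-consumers map is built once and reused. A small illustrative sketch of the pattern, assuming a model wrapper that exposes input_name_to_nodes() the way onnx_model.ONNXModel does; the helper name is hypothetical:

def consumers_of_outputs(model, node):
    # Build the map once instead of calling input_name_to_nodes() per output.
    input_name_to_nodes = model.input_name_to_nodes()
    children = {}
    for output in node.output:
        # Consumers of each output; empty list when the output feeds nothing.
        children[output] = input_name_to_nodes.get(output, [])
    return children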
@@ -150,17 +150,24 @@ def should_convert(self, node): def _postprocess(self): if "TensorrtExecutionProvider" in self.execution_provider: - quant_utils.trt_env_setup(self.model.model) + utility.trt_env_setup(self.model.model) self.merge_dedicated_qdq_pair() self.model.remove_unused_nodes() self.model.model.producer_name = quant_utils.__producer__ self.model.model.producer_version = quant_utils.__version__ + def _preprocess(self): + quant_utils.remove_init_from_model_input(self.model) + quant_utils.split_shared_bias(self.model) + def quantize_model(self): """Quantize onnx model.""" + self._preprocess() + # step 1: insert q-dq pairs self.insert_qdq() + self.remove_duplicate_qdq_paris() # step 2: convert q-node-dq to qoperator format if needed @@ -168,6 +175,7 @@ def quantize_model(self): self.convert_qdq_to_operator_oriented() self._postprocess() + quant_utils.dump_model_op_stats(self.model.model, self.config, self.op_types_to_quantize) return self.model.model def merge_dedicated_qdq_pair(self): @@ -430,25 +438,11 @@ def quantize_bias(self, bias_name, input_name, weight_name, beta=1.0): packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name) self.model.initializer().extend([packed_bias_zp_initializer]) - # log entries for this quantized bias value - quantized_bias_entry = quant_utils.QuantizedInitializer( - bias_name, - bias_initializer, - [0], - [0], - [0], - [bias_scale], - bias_data, - quantized_data, - qType=onnx.TensorProto.INT32, - ) - quantized_value = quant_utils.QuantizedValue( bias_name, quantized_bias_name, quantized_bias_scale_name, quantized_bias_zp_name, - quant_utils.QuantizedValueType.Initializer, None, onnx.TensorProto.INT32, ) @@ -476,9 +470,9 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_ax rmin, rmax, zero_point, scale, quantized_weights = quant_utils.quantize_data_per_channel( weights, channel_axis, - quant_utils.get_qmin_qmax_for_qType(weight_qType, self.reduce_range, sym), weight_qType, sym, + self.reduce_range, ) weight = quant_utils.QuantizedInitializer( @@ -500,7 +494,6 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_ax weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", - quant_utils.QuantizedValueType.Initializer, None, weight_qType, ) @@ -579,7 +572,7 @@ def tensor_proto_to_array(initializer, base_dir=""): raise ValueError( "Only float type quantization is supported. 
\ Weights {} is {}.".format( - initializer.name, quant_utils.dtype_to_name(quant_utils.dtype_mapping, initializer.data_type) + initializer.name, str(onnx.helper.tensor_dtype_to_np_dtype(initializer.data_type)), ) ) return weights @@ -636,9 +629,9 @@ def _get_quantized_weight(self, initializer, qType, sym): ) rmin, rmax, zero_point, scale, quantized_weights_data = quant_utils.quantize_data( weights_data.flatten().tolist(), - quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym), qType, sym, + self.reduce_range, ) weight = quant_utils.QuantizedInitializer( name, @@ -752,7 +745,7 @@ def quantize_outputs(self, node, initializer_use_weight_qType=True, direct_int8= self.replace_input.append([child, tensor_name, dequant_node.output[0]]) if tensor_name not in self.quantized_value_map: quantized_value = quant_utils.QuantizedValue( - tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input + tensor_name, dq_output, scale_name, zp_name ) self.quantized_value_map[tensor_name] = quantized_value @@ -807,7 +800,6 @@ def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, q_weight_name, scale_name, zp_name, - quant_utils.QuantizedValueType.Initializer, None, dtype, ) @@ -999,7 +991,7 @@ def _quantize_activation(self, node, tensor_name, direct_int8=False): if tensor_name not in self.quantized_value_map: quantized_value = quant_utils.QuantizedValue( - tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input + tensor_name, dq_output, scale_name, zp_name, ) self.quantized_value_map[tensor_name] = quantized_value @@ -1041,7 +1033,11 @@ def __init__( def _quantize_activation(self, node, tensor_name, direct_int8=False): """Quantize node activation.""" - qlinear_node = self.model.find_node_by_name(tensor_name + "_QuantizeLinear", self.new_nodes, self.model.graph()) + qlinear_node = None + if quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) + elif quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) if qlinear_node is None: if ( self.fuse_dynamic_quant diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index ab902de07..bcf830f1a 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -28,17 +28,6 @@ from typing import List, Union # isort: skip -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - def _get_quant_dequant_output(model, input_data, output_data, providers): """Get loss between fp32 output and QDQ output. 
@@ -48,7 +37,7 @@ def _get_quant_dequant_output(model, input_data, output_data, providers): output_data (numpy.ndarray): fp32 output providers (list): execution provider """ - input_data = _quant_dequant_data(input_data, 2, "asym") + input_data = quant_utils.qdq_data(input_data, 2, False) sess = ort.InferenceSession(model.SerializeToString(), providers=providers) preds = sess.run(None, {model.graph.input[0].name: input_data}) loss = np.sum(np.abs(output_data - preds) ** 2) @@ -66,31 +55,22 @@ def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): opset (object): opset of the model ir_version (object): ir_version of the model """ - input = onnx.helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = onnx.helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) + input = onnx.helper.make_tensor_value_info( + node.input[0], + onnx.helper.np_dtype_to_tensor_dtype(input_data.dtype), + input_data.shape, + ) + output = onnx.helper.make_tensor_value_info( + node.output[0], + onnx.helper.np_dtype_to_tensor_dtype(output_data.dtype), + output_data.shape, + ) graph = onnx.helper.make_graph([node], "sub_graph", [input], [output], inits) model = onnx.helper.make_model(graph, opset_imports=opset) model.ir_version = ir_version return model -def _quant_dequant_data(data, qType=3, sym=True): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - sym (bool): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data( - data.flatten().tolist(), - quant_utils.get_qmin_qmax_for_qType(qType, False, sym), - qType, - sym, - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - class Smoother: """Fake input channel quantization. @@ -386,7 +366,7 @@ def _get_output_loss(self, node_name, scale, calib_iter): ) base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) + weight_q = quant_utils.qdq_data(weight, 3, True) self.model.set_initializer(node.input[1], weight_q) inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 44496664f..f6a85c598 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -15,56 +15,49 @@ # See the License for the specific language governing permissions and # limitations under the License. 
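The smoother hunks above swap the hand-rolled _dtype_map and _quant_dequant_data helpers for onnx.helper's dtype converters and the shared quant_utils.qdq_data, and the utility.py changes that follow lean on the same converters. A minimal sketch of the two conversion directions (assumes an onnx release that ships these helpers, roughly 1.13+):

import numpy as np
import onnx

# numpy dtype -> TensorProto enum, replacing the removed _dtype_map
assert onnx.helper.np_dtype_to_tensor_dtype(np.dtype("float32")) == onnx.TensorProto.FLOAT

# TensorProto enum -> numpy dtype, as used when quantizing pad constants
assert onnx.helper.tensor_dtype_to_np_dtype(onnx.TensorProto.UINT8) == np.dtype("uint8")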
-import enum -import os -import pathlib +import numpy as np +from packaging import version import re import struct import sys from importlib import util - -import numpy as np -from onnxruntime.quantization import onnx_model -from packaging import version - -from onnx_neural_compressor import constants, logger, utility +from onnx_neural_compressor import constants, utility if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions -torch = utility.LazyImport("torch") -symbolic_shape_infer = utility.LazyImport("onnxruntime.tools.symbolic_shape_infer") onnx = utility.LazyImport("onnx") ort = utility.LazyImport("onnxruntime") - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - -QUANT_OP_NAME_SUFFIX = "_quant" __producer__ = "onnx.quantize" __version__ = "0.1.0" onnx_domain = "ai.onnx" ms_domain = "com.microsoft" +QUANT_OP_NAME_SUFFIX = "_quant" + + +def attribute_to_kwarg(attribute): + """Convert attribute to kwarg format for use with onnx.helper.make_node.""" + attribute_mapping = { + 1: attribute.f, + 2: attribute.i, + 3: attribute.s, + 4: attribute.t, + 5: attribute.g, + 6: attribute.floats, + 7: attribute.ints, + 8: attribute.strings, + 9: attribute.tensors, + 10: attribute.graphs, + } + if attribute.type in attribute_mapping: + value = attribute_mapping[attribute.type] + else: # pragma: no cover + raise ValueError( + "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) + ) + return {attribute.name: value} + ONNX_INT_TYPE_RANGE = { onnx.TensorProto.UINT8: (0, 255), @@ -80,23 +73,39 @@ onnx.TensorProto.INT8: (-64, 64), } +ONNX_STR_TYPE_RANGE = { + "int1": (-1, 0), + "int2": (-2, 1), + "int3": (-4, 3), + "int4": (-8, 7), # onnx >= 1.16.0 defines TensorProto.INT4 + "int5": (-16, 15), + "int6": (-32, 31), + "int7": (-64, 63), + "int8": (-128, 127), + "uint1": (0, 1), + "uint2": (0, 3), + "uint3": (0, 7), + "uint4": (0, 15), # onnx >= 1.16.0 defines TensorProto.UINT4 + "uint5": (0, 31), + "uint6": (0, 63), + "uint7": (0, 127), + "uint8": (0, 255), +} -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - if isinstance(model, (pathlib.Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, onnx_model.ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False +def _qType_to_np_type(qType): + if isinstance(qType, int): + return onnx.helper.tensor_dtype_to_np_dtype(qType) + elif isinstance(qType, str) and "uint" in qType: + return np.dtype("uint8") + else: + return np.dtype("int8") def find_by_name(name, item_list): """Helper function to find item by name in a list.""" items = [] for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover + assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) if item.name == name: items.append(item) if len(items) > 0: @@ -104,19 +113,22 @@ def find_by_name(name, item_list): else: return None - -def is_quantizable_type(data_type): - return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16] - - def get_qmin_qmax_for_qType(qType, 
reduce_range=False, sym=False):  # noqa: N802
-    """Get qmin, qmax for qType."""
+    """Get qmin, qmax for qType.
+
+    Args:
+        qType (int or str): an int for an ONNX-defined type, a str for a type ONNX does not define
+        reduce_range (bool, optional): whether to use the reduced 7-bit range for 8-bit quantization
+        sym (bool, optional): whether to use symmetric quantization. Defaults to False.
+    """
     if qType == onnx.TensorProto.FLOAT8E4M3FN:
         raise NotImplementedError("This function is not implemented for float 8 as not needed.")
     qrange = None
-    if reduce_range:
+    if isinstance(qType, str):
+        qrange = ONNX_STR_TYPE_RANGE.get(qType)
+    elif reduce_range:
         qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType)
     elif sym and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE:
         qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType]
@@ -124,14 +136,137 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802
         qrange = ONNX_INT_TYPE_RANGE.get(qType)
     if not qrange:
-        raise ValueError(f"Unexpected data type {qType} requested. Only INT8 and UINT8 are supported.")
+        raise ValueError(f"Unexpected data type {qType} requested.")
     return qrange
+
+def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None):
+    """Quantize numpy array."""
+    q_weight = np.empty_like(np.asarray(arr), dtype=scale.dtype)
+    np.divide(arr, scale, out=q_weight)
+    np.add(q_weight, zero_point, out=q_weight)
+    np.round(q_weight, out=q_weight)
+    if low is not None and high is not None:
+        np.clip(q_weight, low, high, out=q_weight)
+    return q_weight.astype(dtype)
+
+def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False):
+    """Quantize tensor per-channel."""
+    quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    rmin = None
+    rmax = None
+    for i in range(len(data.shape)):
+        if i != axis:
+            rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True)
+            rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True)
+    rmin = np.minimum(rmin, 0)
+    rmax = np.maximum(rmax, 0)
+    scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range)
+
+    dtype = _qType_to_np_type(qType)
+    quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1])
+    return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data

-def dtype_to_name(dtype_mapping, dtype):
-    """Map data type and its string representation."""
-    return list(dtype_mapping.keys())[list(dtype_mapping.values()).index(dtype)]

+def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value):  # pragma: no cover
+    """Dequantize tensor with scale and zero point."""
+    return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value
+
+def dequantize_data(tensor_value, scale_value, zo_value, axis=0):  # pragma: no cover
+    """Dequantize tensor."""
+    if not isinstance(scale_value, np.ndarray):
+        return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value)
+    else:
+        channel_count = tensor_value.shape[axis]  # TBD, default from axis 0
+        new_per_channel_tensor_values = []
+        for i in range(channel_count):
+            per_channel_tensor_value = tensor_value.take(i, axis)
+            per_channel_scale_value = scale_value.take(i)
+            per_channel_zero_value = zo_value.take(i)
+            new_per_channel_tensor_values.append(
+                dequantize_data_with_scale_zero(
+                    per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value
+                )
+            )
+        # combine per_channel_data into one
+        reshape_dims = list(tensor_value.shape)  # deep copy
+        reshape_dims[axis] = 1  # only one per channel for reshape
+        new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims)
+        for i in range(1, channel_count):
+            new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims)
+            new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis)
+        return new_tensor_value
+
+def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False):
+    """Calculate scale and zero point."""
+    qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    dtype = _qType_to_np_type(qType)
+    if isinstance(rmax, np.ndarray):
+        if sym:
+            max_range = np.maximum(abs(rmin), abs(rmax))
+            rmin = -max_range
+            rmax = max_range
+        scale = (rmax - rmin) / (qmax - qmin)
+        scale[scale < np.finfo(rmax.dtype).tiny] = 1
+        zero_point = (
+            np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype)
+            if sym
+            else np.round(qmin - rmin / scale).astype(dtype)
+        )
+    else:
+        if sym:
+            max_range = max(abs(rmin), abs(rmax))
+            scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1
+        else:
+            scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1
+        zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype)
+    return np.float32(scale), zero_point
+
+def quantize_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None):
+    """Quantize data.
+
+    To pack weights, we compute a linear transformation
+        - when data type is uint8, from [rmin, rmax] -> [0, 2^b - 1] and
+        - when data type is int8, from [-m, m] -> [-(2^{b-1} - 1), 2^{b-1} - 1] where
+            m = max(abs(rmin), abs(rmax))
+    and add necessary intermediate nodes to transform quantized weight to full weight
+    using the equation r = S(q - z), where
+        r: real original value
+        q: quantized value
+        S: scale
+        z: zero point
+
+    Args:
+        data (array): data to quantize
+        qType (int or str): data type to quantize to, e.g. onnx.TensorProto.UINT8/INT8 or a string type such as "uint4"
+        sym (bool): whether to use symmetric quantization.
+        reduce_range (bool): whether to use the reduced 7-bit range. Defaults to False
+        ratio (float, optional): percentile of clip. Defaults to 1.0
+        axis (int, optional): process data along a specific axis. Default is None (process the whole data)
+    """
+    quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=1, keepdims=True)
+    rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=1, keepdims=True)
+    rmin *= ratio
+    rmax *= ratio
+
+    scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range)
+    dtype = _qType_to_np_type(qType)
+    quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1])
+    return rmin, rmax, zero_point, scale, quantized_data
+
+def qdq_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None):
+    _, _, zero_point, scale, quantized_data = quantize_data(data, qType, sym, reduce_range, ratio, axis)
+    return scale * (quantized_data - zero_point)
+
+def is_B_transposed(node):
+    """Whether input B is transposed."""
+    transB = [attr for attr in node.attribute if attr.name == "transB"]
+    if len(transB):
+        return 0 < onnx.helper.get_attribute_value(transB[0])
+    return False
+
+def is_quantizable_type(data_type):
+    return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16]

 def _get_blob_size(group_size, has_zp):  # pragma: no cover
@@ -201,7 +336,7 @@ def make_matmul_weight_only_node(
         scale = np.reshape(scale, (-1, k_blocks))
         scale_tensor = onnx.helper.make_tensor(
             name=node.input[1] + "_scale",
-            data_type=dtype_mapping[str(scale.dtype)],
+            data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype),
             dims=scale.shape,
             vals=scale.tobytes(),
             raw=True,
@@ -348,147 +483,51 @@ def pad_tensor(weight, group_size, k_blocks):
     return weight

-def quant_tensor(
-    data: np.array,
-    num_bits: int = 4,
-    group_size: int = 32,
-    sym: bool = False,
-    dtype: str = "int",
-    ratio: float = 1.0,
-):
-    """Quantize tensor per group.
-
-    Args:
-        data (np.array): input weight
-        num_bits (int, optional): number of bits used to represent weights. Defaults to 4.
-        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
-        sym (bool, optional): _quantization scheme. Defaults to False.
-        dtype (str, optional): data type. Defaults to "int".
-        ratio (float, optional): percentile of clip. Defaults to 1.0.
- - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if not sym or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif sym: - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if sym: - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - scale = np.ones(rmax.shape) - scale[max_range > 0] = np.array( - [float(i) / (maxq - minq) for i in (max_range[max_range > 0] * 2.0).flatten().tolist()] - ) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - return np.clip((data / scale + zero_point).round(), minq, maxq), scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - sym (bool, optional): quantization scheme. Defaults to False. - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. 
- - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, sym, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False +def dump_woq_stats(model, quantize_config): + res = {} + dtype_set = set() + for node in model.graph.node: + if node.name.split("_Q")[0] not in quantize_config: + continue + if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: + optype = "MatMul" + else: + optype = node.op_type -def calculate_scale_zp(rmin, rmax, quantize_range, qType, sym): - """Calculate scale and zero point.""" - qmin, qmax = quantize_range - dtype = onnx.helper.tensor_dtype_to_np_dtype(qType) - if isinstance(rmax, np.ndarray): - if sym: - max_range = np.maximum(abs(rmin), abs(rmax)) - rmin = -max_range - rmax = max_range - scale = (rmax - rmin) / (qmax - qmin) - scale[scale < np.finfo(rmax.dtype).tiny] = 1 - zero_point = ( - np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) - if sym - else np.round(qmin - rmin / scale).astype(dtype) - ) - else: - if sym: - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1 + if optype not in res: + res[optype] = {} + if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): + search_out = re.search("_Q\d*", node.input[1]) + dtype = "A32W{}G{}".format( + node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] + ) else: - scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1 - zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype) - return np.float32(scale), zero_point + dtype = "FP32" + dtype_set.add(dtype) + if dtype in res[optype]: + res[optype][dtype] += 1 + else: + res[optype][dtype] = 1 -def quantize_data(data, quantize_range, qType, sym): - """Quantize data. + dtype_list = list(dtype_set) + for dtype in dtype_list: + for optype in res.keys(): + if dtype not in res[optype]: + res[optype][dtype] = 0 - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point + # update stats format for dump. + field_names = ["Op Type", "Total"] + field_names.extend(dtype_list) + output_data = [] + for op_type in res.keys(): + field_results = [op_type, sum(res[op_type].values())] + field_results.extend([res[op_type][dtype] for dtype in dtype_list]) + output_data.append(field_results) - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - sym (bool): whether use sym quantization. 
- """ - rmin = np.min(np.min(data), 0) - rmax = np.max(np.max(data), 0) + utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() - scale, zero_point = calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) - quantized_data = quantize_nparray(qType, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) - return rmin, rmax, zero_point, scale, quantized_data def get_node_original_name(node) -> str: @@ -502,16 +541,10 @@ def get_node_original_name(node) -> str: return node_name -class QuantType(enum.Enum): # pragma: no cover - """Represent QuantType value.""" - - QInt8 = 0 - QUInt8 = 1 - - def split_shared_bias(model): """Split shared tensor.""" - for input_name, node_list in model.input_name_to_nodes.items(): + input_name_to_nodes = model.input_name_to_nodes() + for input_name, node_list in input_name_to_nodes.items(): if len(node_list) > 1 and input_name in [i.name for i in model.model.graph.initializer]: for node in node_list[1:]: if node.op_type not in ["Conv", "FusedConv"]: @@ -541,68 +574,6 @@ def remove_init_from_model_input(model): inputs.remove(name_to_input[initializer.name]) -def quantize_data_per_channel(data, axis, quantize_range, qType, sym): - """Quantize tensor per-channel.""" - rmin = None - rmax = None - for i in range(len(data.shape)): - if i != axis: - rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True) - rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True) - rmin = np.minimum(rmin, 0) - rmax = np.maximum(rmax, 0) - scale, zero_point = calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) - quantized_data = quantize_nparray(qType, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) - return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data - - -def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value): # pragma: no cover - """Dequantize tensor with scale and zero point.""" - return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value - - -def dequantize_data(tensor_value, scale_value, zo_value, axis=0): # pragma: no cover - """Dequantize tensor.""" - if not isinstance(scale_value, np.ndarray): - return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value) - else: - channel_count = tensor_value.shape[axis] # TBD, default from axis 0 - new_per_channel_tensor_values = [] - for i in range(channel_count): - per_channel_tensor_value = tensor_value.take(i, axis) - per_channel_scale_value = scale_value.take(i) - per_channel_zero_value = zo_value.take(i) - new_per_channel_tensor_values.append( - dequantize_data_with_scale_zero( - per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value - ) - ) - # combine per_channel_data into one - reshape_dims = list(tensor_value.shape) # deep copy - reshape_dims[axis] = 1 # only one per channel for reshape - new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims) - for i in range(1, channel_count): - new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims) - new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis) - return new_tensor_value - - -class ValueInfo: # pragma: no cover - """Represents a casted tensor info.""" - - def __init__(self, tensor_name, dtype, new_dtype): - """Initialization. 
- - Args: - tensor_name (string): tensor name - dtype (int): original data type - new_dtype (int): target data type - """ - self.tensor_name = tensor_name - self.dtype = dtype - self.new_dtype = new_dtype - - class QuantizedValue: """Represents a linearly quantized value (input/output/initializer).""" @@ -612,9 +583,8 @@ def __init__( new_quantized_name, scale_name, zero_point_name, - quantized_value_type, axis=None, - qType=QuantType.QUInt8, + qType=1, ): """Initialization. @@ -623,15 +593,13 @@ def __init__( new_quantized_name (string): quantized tensor name scale_name (string): scale name zero_point_name (string): zero point name - quantized_value_type (QuantizedValueType): quantized value type axis (int, optional): quantized axis. Defaults to None. - qType (int, optional): quantized data type. Defaults to QuantType.QUInt8. + qType (int, optional): quantized data type. Defaults to 1 (uint8). """ self.name = name self.q_name = new_quantized_name self.scale_name = scale_name self.zp_name = zero_point_name - self.value_type = quantized_value_type self.axis = axis self.qType = qType @@ -650,7 +618,7 @@ def __init__( data=[], quantized_data=[], axis=None, - qType=QuantType.QUInt8, + qType=1, ): """Initialization. @@ -664,7 +632,7 @@ def __init__( data (list, optional): array version of the initializer. Defaults to []. quantized_data (list, optional): quantized data. Defaults to []. axis (int, optional): quantized axis. Defaults to None. - qType (int, optional): quantized data type. Defaults to QuantType.QUInt8. + qType (int, optional): quantized data type. Defaults to 1 (uint8). """ self.name = name self.initializer = initializer # TensorProto initializer in ONNX graph @@ -681,93 +649,6 @@ def __init__( self.qType = qType -class QuantizedValueType(enum.Enum): # pragma: no cover - """Represent QuantizedValueType value.""" - - Input = 0 - Initializer = 1 - - -def quantize_nparray(qtype, arr, scale, zero_point, low=None, high=None): - """Quantize numpy array.""" - dtype = onnx.helper.tensor_dtype_to_np_dtype(qtype) - arr_fp32 = np.asarray((np.asarray(arr).astype(np.float32) / scale).round() + zero_point) - if low is not None and high is not None: - np.clip(arr_fp32, low, high, out=arr_fp32) - return arr_fp32.astype(dtype) - - -def attribute_to_kwarg(attribute): - """Convert attribute to kwarg format for use with onnx.helper.make_node.""" - attribute_mapping = { - 1: attribute.f, - 2: attribute.i, - 3: attribute.s, - 4: attribute.t, - 5: attribute.g, - 6: attribute.floats, - 7: attribute.ints, - 8: attribute.strings, - 9: attribute.tensors, - 10: attribute.graphs, - } - if attribute.type in attribute_mapping: - value = attribute_mapping[attribute.type] - else: # pragma: no cover - raise ValueError( - "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) - ) - return {attribute.name: value} - - -def trt_env_setup(model): - """Set environment variable for Tensorrt Execution Provider.""" - is_int8 = False - for node in model.graph.node: - if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: - is_int8 = True - break - if is_int8: - os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" - else: - os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" - - -def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0, base_dir=""): - """Symbolic shape inference.""" - - class SymbolicShapeInference(symbolic_shape_infer.SymbolicShapeInference): - def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix="", 
base_dir=""): - super().__init__(int_max, auto_merge, guess_output_rank, verbose, prefix) - self.base_dir = base_dir - - def _get_value(self, node, idx): - name = node.input[idx] - assert name in self.sympy_data_ or name in self.initializers_ - return ( - self.sympy_data_[name] - if name in self.sympy_data_ - else onnx.numpy_helper.to_array(self.initializers_[name], base_dir=self.base_dir) - ) - - onnx_opset = symbolic_shape_infer.get_opset(in_mp) - if (not onnx_opset) or onnx_opset < 7: - logger.warning("Only support models of onnx opset 7 and above.") - return None - symbolic_shape_inference = SymbolicShapeInference( - int_max, auto_merge, guess_output_rank, verbose, base_dir=base_dir - ) - all_shapes_inferred = False - symbolic_shape_inference._preprocess(in_mp) - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl() - symbolic_shape_inference._update_output_from_vi() - if not all_shapes_inferred: - onnx.save_model(symbolic_shape_inference.out_mp_, "sym_shape_infer_temp.onnx", save_as_external_data=True) - raise Exception("Incomplete symbolic shape inference") - return symbolic_shape_inference.out_mp_ - - def dump_model_op_stats(model, quantize_config, fp32_op_list): qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"] res = {} @@ -809,49 +690,3 @@ def dump_model_op_stats(model, quantize_config, fp32_op_list): ] utility.Statistics(output_data, header="Quantization Statistics", field_names=field_names).print_stat() - - -def dump_woq_stats(model, quantize_config, fp32_op_list): - res = {} - for optype in fp32_op_list: - res[optype] = {} - - dtype_set = set() - for node in model.graph.node: - if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: - optype = "MatMul" - else: - optype = node.op_type - - if optype not in res: - continue - if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): - search_out = re.search("_Q\d*", node.input[1]) - dtype = "A32W{}G{}".format( - node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] - ) - else: - dtype = "FP32" - dtype_set.add(dtype) - - if dtype in res[optype]: - res[optype][dtype] += 1 - else: - res[optype][dtype] = 1 - - dtype_list = list(dtype_set) - for dtype in dtype_list: - for optype in res.keys(): - if dtype not in res[optype]: - res[optype][dtype] = 0 - - # update stats format for dump. 
- field_names = ["Op Type", "Total"] - field_names.extend(dtype_list) - output_data = [] - for op_type in res.keys(): - field_results = [op_type, sum(res[op_type].values())] - field_results.extend([res[op_type][dtype] for dtype in dtype_list]) - output_data.append(field_results) - - utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index b2db33dcb..9e07b45a6 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -24,7 +24,7 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model +from onnx_neural_compressor import constants, data_reader, logger, onnx_model from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.weight_only import rtn @@ -39,7 +39,7 @@ def _get_weight_scale(weight, group_size): return scale -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, sym): +def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): """Apply scale for salient weight.""" best_scales = {} new_init_tensors = [] @@ -48,6 +48,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes = [] base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + input_name_to_nodes = model.input_name_to_nodes() for parent, nodes in absorb_pairs.items(): if any([node.input[0] not in output_dicts for node in nodes]): logger.warning( @@ -61,13 +62,17 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, dtype = None weight = [] org_out = [] + + num_bits = weight_config[nodes[0].name].get("weight_bits", 4) + group_size = weight_config[nodes[0].name].get("weight_group_size", 32) + sym = weight_config[nodes[0].name].get("weight_sym", True) + accuracy_level = weight_config[nodes[0].name].get("accuracy_level", 0) + + # use same params for all children of one parent for node in nodes: - if node.name in weight_config and weight_config.get(node.name, "fp32") != "fp32": - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) - break + weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) + weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) + weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) # search scale best_error = float("inf") @@ -79,9 +84,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ratio = ratio * 1 / n_grid loss = 0 for node in nodes: - if weight_config.get((node.name, node.op_type), {}) == "fp32": - continue - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) if len(weight.shape) != 2: continue @@ -103,9 +105,17 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "uint") + q_weight = 
quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ).reshape(weight.shape) else: - q_weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int") + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ).reshape(weight.shape) q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) out = np.matmul(inp, q_weight) @@ -118,10 +128,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, best_scale = scales for node in nodes: - weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) - weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) - weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) - init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) @@ -131,7 +137,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, new_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scaled", - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=tensor.shape, vals=tensor.tobytes(), raw=True, @@ -147,7 +153,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, continue if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] + input_name_to_nodes[nodes[0].input[0]] ) == len(nodes): for idx in [1, 2]: tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) @@ -160,7 +166,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, elif ( parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) + and len(input_name_to_nodes[nodes[0].input[0]]) == len(nodes) ): # pragma: no cover for inp in parent.input: if model.get_initializer(inp) is not None: @@ -171,7 +177,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes.append(parent.name) output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( + elif parent.op_type in ["Conv", "FusedConv"] and len(input_name_to_nodes[nodes[0].input[0]]) == len( nodes ): # pragma: no cover tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) @@ -185,7 +191,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, # insert mul scale_tensor = onnx.helper.make_tensor( name=parent.output[0] + "_weight_only_scale", - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=best_scale.shape, vals=(1.0 / best_scale).flatten().tolist(), ) @@ -211,7 +217,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, return model, output_dicts -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, sym): +def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): """Apply clip for weight by checking mse.""" base_dir = os.path.dirname(model.model_path) 
if model.model_path is not None else "" ratios = {} @@ -227,11 +233,10 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) for node in nodes: - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) org_w_shape = org_weight.shape # ic, oc @@ -254,9 +259,19 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "uint", ratio) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) else: - weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int", ratio) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) @@ -272,12 +287,8 @@ def awq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, enable_auto_scale: bool = True, enable_mse_search: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -297,16 +308,10 @@ def awq_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. Defaults to True. enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. 
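Editor's aside: to illustrate the slimmer awq_quantize signature above, here is a short sketch of the per-node settings that now live in weight_config instead of the removed num_bits/group_size/sym/accuracy_level arguments. The node name is a made-up placeholder, and the commented call only indicates the remaining keyword arguments; it is not taken verbatim from this patch.

# Placeholder node name; real keys are the model's MatMul node names.
weight_config = {
    "/decoder/layer.0/MatMul": {
        "weight_bits": 4,
        "weight_group_size": 32,
        "weight_sym": True,
        "accuracy_level": 0,
    },
}

# Same .get(...) fallbacks the AWQ/RTN/GPTQ code paths use per node:
cfg = weight_config.get("/decoder/layer.0/MatMul", {})
num_bits = cfg.get("weight_bits", 4)
group_size = cfg.get("weight_group_size", 32)
sym = cfg.get("weight_sym", True)

# q_model = awq_quantize("model.onnx", data_reader=reader, weight_config=weight_config,
#                        enable_auto_scale=True, enable_mse_search=True)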
Returns: @@ -352,11 +357,13 @@ def awq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + output_name_to_node = model.output_name_to_node() + input_name_to_nodes = model.input_name_to_nodes() for input_name in output_names: - parent = model.output_name_to_node()[input_name] + parent = output_name_to_node[input_name] dump_pairs = {parent.name: []} - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" @@ -381,9 +388,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - sym, ) if enable_mse_search: ratios = _apply_awq_clip( @@ -391,9 +395,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - sym, ) del output_dicts del dump_pairs @@ -401,7 +402,7 @@ def awq_quantize( model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model = rtn.rtn_quantize(model, weight_config, num_bits, group_size, sym, full_ratio, accuracy_level, providers) + model = rtn.rtn_quantize(model, weight_config, full_ratio, providers) return model @@ -409,6 +410,9 @@ def apply_awq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. @@ -421,12 +425,11 @@ def apply_awq_on_model( onnx.ModelProto: quantized onnx model. """ # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in config.AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + kwargs = { + "enable_auto_scale": enable_auto_scale, + "enable_mse_search": enable_mse_search, + "providers": providers, + } + q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + quant_utils.dump_woq_stats(q_model, quant_config) + return q_model diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index c95c346f8..ae3813280 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -24,9 +24,10 @@ import onnxruntime as ort from packaging.version import Version -from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core +from onnx_neural_compressor.quantization import config from typing import List, Union # isort: skip @@ -178,15 +179,11 @@ def gptq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, percdamp: float = 0.01, block_size: int = 128, actorder: 
bool = False, mse: bool = False, perchannel: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -206,9 +203,6 @@ def gptq_quantize( 'weight_sym': True, 'accuracy_level': 0 }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability. Defaults to 0.01. block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. @@ -216,9 +210,6 @@ def gptq_quantize( quantization order. Defaults to False. mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True @@ -262,12 +253,14 @@ def gptq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + input_name_to_nodes = model.input_name_to_nodes() + for idx, input_name in enumerate(output_names): utility.simple_progress_bar(len(output_names), idx + 1) node_list = [] weights = [] - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" @@ -304,11 +297,10 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) group_size = group_size if group_size != -1 else weight.shape[0] dtype = weight.dtype @@ -341,7 +333,12 @@ def gptq_quantize( org_shape = weight.shape k_blocks = (org_shape[0] + group_size - 1) // group_size q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = quant_utils.quant_tensor(q_weight.T, num_bits, group_size, sym, "uint") + _, _, zp, scale, q_weight = quant_utils.quantize_data( + q_weight.T, + "uint" + str(num_bits), + sym, + axis=1, + ) q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_shape, @@ -360,7 +357,7 @@ def gptq_quantize( else: q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=q_weight.shape, vals=q_weight.astype(dtype).tobytes(), raw=True, @@ -390,6 
+387,13 @@ def apply_gptq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -401,18 +405,17 @@ def apply_gptq_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - if layer_wise: + quant_kwargs = { + "percdamp": percdamp, + "block_size": block_size, + "actorder": actorder, + "mse": mse, + "perchannel": perchannel, + "providers": providers, + } + + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=gptq_quantize, @@ -427,4 +430,5 @@ def apply_gptq_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 6856f378d..8837ad7ae 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -23,7 +23,7 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, onnx_model, utility +from onnx_neural_compressor import constants, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core @@ -33,11 +33,7 @@ def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, ratios: dict = {}, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -57,14 +53,7 @@ def rtn_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True @@ -97,11 +86,10 @@ def rtn_quantize( continue dtype = weight.dtype - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_w_shape = weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] @@ -123,8 +111,12 @@ def rtn_quantize( ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - q_weight, scale, zp = quant_utils.quant_tensor( - weight.T, num_bits, group_size, sym, "uint", ratios.get(node.input[1], 1) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, @@ -142,15 +134,18 @@ def rtn_quantize( remove_nodes.append(node) new_nodes.append(q_matmul_node) else: - q_weight = quant_utils.qdq_tensor( - weight.T, num_bits, group_size, sym, "int", ratios.get(node.input[1], 1) - ) + q_weight = quant_utils.qdq_data( + weight.T.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1) q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) q_weight = np.transpose(q_weight) q_weight = q_weight[: org_w_shape[0], :].astype(dtype) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=weight.shape, vals=q_weight.tobytes(), raw=True, @@ -175,7 +170,11 @@ def rtn_quantize( def apply_rtn_on_model( - model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict + model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], + quant_config: dict, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply RTN on onnx model. @@ -186,19 +185,12 @@ def apply_rtn_on_model( Returns: onnx.ModelProto: quantized onnx model. 
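Editor's aside: a self-contained numpy sketch of the per-group round-to-nearest quantize/dequantize that the qdq_data-style calls above perform on the non-MatMulNBits path. The helper below is illustrative only; it does not claim to match quant_utils' exact signature, padding, or rounding details.

import numpy as np

def fake_quant_per_group(weight, num_bits=4, group_size=32, sym=True):
    """Quantize then dequantize a 2-D weight (ic, oc) per group of input channels."""
    ic, oc = weight.shape
    pad = (-ic) % group_size                         # pad rows so ic is a multiple of group_size
    w = np.pad(weight, ((0, pad), (0, 0)))
    w = w.T.reshape(-1, group_size)                  # (oc * n_groups, group_size)
    if sym:
        maxq = 2 ** (num_bits - 1) - 1
        scale = np.maximum(np.abs(w).max(axis=1, keepdims=True), 1e-9) / maxq
        q = np.clip(np.round(w / scale), -maxq - 1, maxq)
        dq = q * scale
    else:
        qmax = 2 ** num_bits - 1
        wmin = w.min(axis=1, keepdims=True)
        wmax = w.max(axis=1, keepdims=True)
        scale = np.maximum(wmax - wmin, 1e-9) / qmax
        zp = np.round(-wmin / scale)
        q = np.clip(np.round(w / scale) + zp, 0, qmax)
        dq = (q - zp) * scale
    return dq.reshape(oc, -1).T[:ic, :]

w = np.random.randn(64, 16).astype(np.float32)
w_dq = fake_quant_per_group(w, num_bits=4, group_size=32, sym=True)
print(np.abs(w - w_dq).max())   # error stays within roughly half a quantization step per group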
""" - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.RTNConfig): - quant_config[op_name_type] = op_config.to_dict() + quant_kwargs = { + "ratios": ratios, + "providers": providers, + } - if layer_wise: + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs ) @@ -207,4 +199,5 @@ def apply_rtn_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/data_reader.py b/onnx_neural_compressor/data_reader.py index eacacd52a..7f76769f0 100644 --- a/onnx_neural_compressor/data_reader.py +++ b/onnx_neural_compressor/data_reader.py @@ -14,15 +14,25 @@ import abc -from onnxruntime import quantization +class CalibrationDataReader(metaclass=abc.ABCMeta): + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. + @abc.abstractmethod + def get_next(self) -> dict: + """generate the input data dict for ONNXinferenceSession run""" + raise NotImplementedError + + def __iter__(self): + return self - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ + def __next__(self): + result = self.get_next() + if result is None: + raise StopIteration + return result @abc.abstractmethod def rewind(self): diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index ed3df7f6a..20fcb95e3 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -21,12 +21,11 @@ import onnx import transformers -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger, utility -class ONNXModel(onnx_model.ONNXModel): +class ONNXModel: """Build ONNX model.""" def __init__(self, model, **kwargs): @@ -36,7 +35,6 @@ def __init__(self, model, **kwargs): model (str or ModelProto): path to onnx model or loaded ModelProto model object. 
""" self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) self._model_path = None if not isinstance(model, str) else model self.check_is_large_model() @@ -51,12 +49,57 @@ def __init__(self, model, **kwargs): if isinstance(model, str) and os.path.exists(pathlib.Path(model).parent.joinpath("config.json").as_posix()): self._config = transformers.PretrainedConfig.from_pretrained(pathlib.Path(model).parent.as_posix()) self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_output_name_to_node(self.model.graph.node) + self._get_input_name_to_nodes(self.model.graph.node) self._graph_info = {} self._get_graph_info() self._q_config = None + def output_name_to_node(self): + self._output_name_to_node = {} + self._get_output_name_to_node(self.model.graph.node) + return self._output_name_to_node + + def input_name_to_nodes(self): + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self.model.graph.node) + return self._input_name_to_nodes + + def _get_input_name_to_nodes(self, nodes): + """Get input names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_input_name_to_nodes(attr.g.node) + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in self._input_name_to_nodes: + self._input_name_to_nodes[input_name] = [node] + else: + self._input_name_to_nodes[input_name].append(node) + + def _get_output_name_to_node(self, nodes): + """Get output names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_output_name_to_node(attr.g.node) + for output_name in node.output: + if len(output_name.strip()) != 0: + self._output_name_to_node[output_name] = node + @property def model_path(self): """Return model path.""" @@ -99,6 +142,11 @@ def framework(self): """Return framework.""" return "onnxruntime" + def add_initializer(self, tensor): + """Add a initializer to model.""" + if tensor.name not in [i.name for i in self._model.graph.initializer]: + self._model.graph.initializer.append(tensor) + def add_initializers(self, tensors): """Add initializers to model.""" for tensor in tensors: @@ -127,6 +175,42 @@ def output(self): """Return output of model.""" return [i.name for i in self.model.graph.output] + @property + def model(self): + """Return model itself.""" + return self._model + + @model.setter + def model(self, model): + """Set model itself.""" + self._model = model + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + def nodes(self): + """Return model nodes.""" + return self._model.graph.node + + def initializer(self): + """Return model initializer.""" + return self._model.graph.initializer + + def graph(self): + """Return model graph.""" + return self._model.graph + + def ir_version(self): + """Return model ir_version.""" + return self._model.ir_version + + def opset_import(self): + """Return model opset_import.""" 
+ return self._model.opset_import + def update(self): """Update model info.""" self._graph_info = {} @@ -144,6 +228,10 @@ def _get_graph_info(self): for node in self.model.graph.node: self.graph_info.update({node.name: node.op_type}) + def is_graph_output(self, name): + """Check whether the tensor is the graph output.""" + return name in self.output() + def save(self, root): """Save ONNX model.""" if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): @@ -168,6 +256,53 @@ def save(self, root): output_config_file = pathlib.Path(root).parent.joinpath("config.json").as_posix() self._config.to_json_file(output_config_file, use_diff=False) + def remove_initializer(self, tensor): + """Remove an initializer from model.""" + if tensor in self._model.graph.initializer: + self._model.graph.initializer.remove(tensor) + + def remove_initializers(self, init_to_remove): + """Remove initializers from model.""" + for initializer in init_to_remove: + self.remove_initializer(initializer) + + def get_initializer(self, name): + """"Find the initializer with specified name.""" + for initializer in self.model.graph.initializer: + if initializer.name == name: + return initializer + return None + + def remove_node(self, node): + """Remove a node from model.""" + if node in self._model.graph.node: + self._model.graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + """Remove nodes from model.""" + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node): + """Add a node to model.""" + self._model.graph.node.extend([node]) + + def add_nodes(self, nodes_to_add): + """Add nodes to model.""" + self._model.graph.node.extend(nodes_to_add) + + def get_children(self, node, input_name_to_nodes=None): + """Get children nodes.""" + if input_name_to_nodes is None: + input_name_to_nodes = self._input_name_to_nodes + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for child in input_name_to_nodes[output]: + children.append(child) + return children + def get_initializer_share_num(self, name): """Get the number of shares of initializer.""" num = 0 @@ -186,6 +321,25 @@ def get_node(self, name): return node return None + def get_parent(self, node, idx, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + if len(node.input) <= idx: + return None + + input = node.input[idx] + return output_name_to_node.get(input, None) + + def get_parents(self, node, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + def get_node_by_weight(self, weight_name): """Get a node by its weight name.""" if len(self._input_name_to_nodes) == 0: @@ -277,6 +431,22 @@ def _searcher(tensor_name): assert zo_tensor, "missing zero point for tensor {}".format(tensor) return scale_tensor, zo_tensor + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + """Replace input of a node.""" + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + """Replace output of a node.""" + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in 
range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): """Replace inputs of all nodes.""" if len(white_optype) > 0: @@ -331,7 +501,7 @@ def remove_unused_nodes(self): unvalid_nodes = [ i for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) + if all(out not in self._input_name_to_nodes and out not in self.output() for out in i.output) ] while len(unvalid_nodes) > 0: self.remove_nodes(unvalid_nodes) @@ -339,12 +509,12 @@ def remove_unused_nodes(self): unvalid_nodes = [ i for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) + if all([out not in self._input_name_to_nodes and out not in self.output() for out in i.output]) ] ununsed_weights = [] for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: + if w.name not in self._input_name_to_nodes and w.name not in self.output(): ununsed_weights.append(w) # Remove from graph.input for graph_input in self.graph().input: diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 1dcd5e428..67e82f0fc 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -12,8 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. - -from onnxruntime.quantization import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType - +from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 1e42810e4..14284be66 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -18,13 +18,11 @@ import onnx import onnxruntime as ort -from onnxruntime import quantization - -from onnx_neural_compressor import config, constants, data_reader, logger, utility -from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor import constants, data_reader, logger, utility from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn +from onnx_neural_compressor.quantization import config ###################### RTN Algo Entry ################################## @@ -40,8 +38,9 @@ def rtn_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping - model = rtn.apply_rtn_on_model(model, config_mapping) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.RTNConfig.model_params_list} + model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) return model @@ -67,11 +66,12 @@ def gptq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in 
config.GPTQConfig.model_params_list} # regenerate to ensure data exists calibration_data_reader.rewind() - model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) return model @@ -97,11 +97,12 @@ def awq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.AWQConfig.model_params_list} # regenerate to ensure data exists calibration_data_reader.rewind() - model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) return model @@ -154,7 +155,6 @@ def static_quantize_entry( _quantizer.quantize_model() if model_output is not None: _quantizer.model.save(model_output) - quant_utils.dump_model_op_stats(_quantizer.model.model, config_mapping, quant_config.op_types_to_quantize) return _quantizer.model.model @@ -239,5 +239,4 @@ def dynamic_quantize_entry( _quantizer.quantize_model() if model_output is not None: _quantizer.model.save(model_output) - quant_utils.dump_model_op_stats(_quantizer.model.model, config_mapping, quant_config.op_types_to_quantize) return _quantizer.model.model diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/quantization/config.py similarity index 81% rename from onnx_neural_compressor/config.py rename to onnx_neural_compressor/quantization/config.py index 59d0ceb65..fb9cd220a 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -28,10 +28,10 @@ import numpy as np import onnx import pydantic -from onnxruntime import quantization from typing_extensions import Self -from onnx_neural_compressor import constants, data_reader, logger, utility +from onnxruntime import quantization as ort_quant +from onnx_neural_compressor import constants, data_reader, logger, quantization, utility from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -114,7 +114,7 @@ def is_tunable(self, value: Any) -> bool: return False def __str__(self) -> str: - return self.name + return "TuningParam(name={}, tunable_type={}, options={}).".format(self.name, str(self.tunable_type), str(self.options)) # Config registry to store all registered configs. @@ -421,22 +421,45 @@ def build_tuning_param(config: BaseConfig, param: str): def expand(self) -> List[BaseConfig]: """Expand the config. 
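Editor's aside: to make the expansion order described below concrete, a small standalone sketch using plain itertools (not the BaseConfig machinery) that reproduces the documented behavior: model-level params expand first, and within each group the first-listed parameter varies fastest, which is what the [::-1] reversals in expand() achieve.

import itertools

model_params = {"A": [1, 2], "B": [3, 4]}   # expanded first
op_params = {"C": [5, 6], "D": [7, 8]}      # expanded within each model-level combination

def combos(params):
    names = list(params)
    # Reverse before product so the first-listed param changes fastest.
    for values in itertools.product(*[params[n] for n in reversed(names)]):
        yield dict(zip(reversed(names), values))

expanded = [{**m, **o} for m in combos(model_params) for o in combos(op_params)]
print(len(expanded))   # 16 combinations: 4 model-level x 4 op-level
# First two: (A=1, B=3, C=5, D=7) then (A=1, B=3, C=6, D=7)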
- case 1 - {"model_params": { "reduce_range": [True, False]}} -> - {"model_params": { "reduce_range": True}}, {"model_params": { "reduce_range": False}} - - case 2: iterate op_params first (for this case, Add op only supports per_tensor) - {"model_params": { "reduce_range": [True, False]}, "op_params": {"per_channel": [True, False]}} -> - {"model_params": { "reduce_range": True}, "op_params": {"per_channel": True}} - {"model_params": { "reduce_range": True}, "op_params": {"per_channel": False}} - {"model_params": { "reduce_range": False}, "op_params": {"per_channel": True}} - {"model_params": { "reduce_range": False}, "op_params": {"per_channel": False}} - - {"model_params": { "reduce_range": [True, False]}, "op_params": {"Conv": {"per_channel": [True, False], , "Add": {"per_channel": [True, False]}}}} -> - {"model_params": { "reduce_range": True}, "op_params": {"Conv": {"per_channel": True}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": True}, "op_params": {"Conv": {"per_channel": False}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": False}, "op_params": {"Conv": {"per_channel": True}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": False}, "op_params": {"Conv": {"per_channel": False}, "Add": {"per_channel": False}}}, + Expand rule is: + 1. Expand model_params_list first, then expand params_list + 2. Expand model_params_list/params_list following the order of param order in model_params_list/params_list + + model_params_list=[A, B] params_list=[C,D] + A=[1,2], B=[3,4] C=[5,6], D=[7,8] + + Expanded results: + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 1 ---- + (A=1, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 2 ---- + (A=2, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 3 ---- + (A=1, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 4 ---- + (A=2, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) """ config = self # set model level params @@ -455,9 +478,9 @@ def expand(self) -> List[BaseConfig]: model_level_config_lst = [config] else: tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] - for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list]): + for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list[::-1]]): new_config = copy.deepcopy(self) - for param_name, param_value in zip(tuning_param_name_lst, params_values): + for param_name, param_value in zip(tuning_param_name_lst[::-1], params_values): setattr(new_config, param_name, param_value) logger.debug(new_config.to_dict()) model_level_config_lst.append(new_config) @@ -471,7 +494,7 @@ def expand(self) -> List[BaseConfig]: tuning_param = self.build_tuning_param(config, param) param_val = getattr(config, tuning_param.name) if param_val is not None: - if tuning_param.is_tunable(param_val): + if tuning_param.is_tunable(param_val) and len(param_val) > 0: tuning_param.options = param_val op_tuning_param_list.append(tuning_param) @@ -480,9 +503,9 @@ def expand(self) -> List[BaseConfig]: else: 
tuning_param_name_lst = [tuning_param.name for tuning_param in op_tuning_param_list] tuning_param_val_lst = list( - itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list]) + itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list[::-1]]) ) - tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val[::-1])) for val in tuning_param_val_lst] + tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val)) for val in tuning_param_val_lst] for model_level_config in model_level_config_lst: for tuning_param_pair in tuning_param_pair_lst: @@ -514,11 +537,8 @@ def to_config_mapping( if config_list is None: config_list = [self] for config in config_list: - global_config = config.global_config op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: - self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_name_config_dict[op_type] for op_name_pattern in op_name_config_dict: @@ -628,22 +648,17 @@ def register_supported_configs(): @dataclasses.dataclass class OperatorConfig: - - def __init__( - self, - weight_type, - activation_type, - per_channel, - weight_sym, - activation_sym, - calibrate_method=quantization.CalibrationMethod.MinMax, - ): - self.weight_type = getattr(weight_type, "tensor_type", weight_type) - self.activation_type = getattr(activation_type, "tensor_type", activation_type) - self.per_channel = per_channel - self.weight_sym = weight_sym - self.activation_sym = activation_sym - self.calibrate_method = calibrate_method + weight_type: quantization.QuantType + activation_type: quantization.QuantType + per_channel: bool + weight_sym: bool + activation_sym: bool + calibrate_method: quantization.CalibrationMethod=quantization.CalibrationMethod.MinMax + + def __post_init__(self): + self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type) + self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type) + self.calibrate_method = getattr(self.calibrate_method, "value", self.calibrate_method) def __getitem__(self, key): return getattr(self, key) @@ -765,6 +780,19 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + + def _post_init(self): + if self.white_list == constants.RTN_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -793,21 +821,23 @@ def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: li self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = 
op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -926,6 +956,18 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -957,21 +999,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -1077,6 +1121,18 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -1107,21 +1163,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - 
self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -1205,8 +1263,242 @@ def __init__( self.SmoothQuantScalesPerOp = SmoothQuantScalesPerOp +def static_basic_check(config, optype, execution_provider, quant_format): + if getattr(quant_format, "value", quant_format) == 0: + if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + elif getattr(quant_format, "value", quant_format) == 1: + if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + else: + raise ValueError( + "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) + ) + return config + + +def static_cpu_check(config, optype, execution_provider, quant_format): + if execution_provider != "CPUExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.UINT8) + return config + + +def static_cuda_check(config, optype, execution_provider, quant_format): + if execution_provider != "CUDAExecutionProvider": + return config + + # only support per-tensor + if optype in [ + 
"EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "weight_type", onnx.TensorProto.INT8) + return config + + +def static_dml_check(config, optype, execution_provider, quant_format): + if execution_provider != "DmlExecutionProvider": + return config + + # only support per-tensor + if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: + setattr(config, "per_channel", False) + return config + + +def static_dnnl_check(config, optype, execution_provider, quant_format): + if execution_provider != "DnnlExecutionProvider": + return config + + # current configurations are same as CPU EP + return static_cpu_check(config, optype, execution_provider, quant_format) + + +def static_trt_check(config, optype, execution_provider, quant_format): + if execution_provider != "TensorrtExecutionProvider": + return config + + # only support S8S8 + if optype in ["Conv", "MatMul", "Gather", "Gemm"]: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + setattr(config, "per_channel", [False, True]) + else: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + return config + + +STATIC_CHECK_FUNC_LIST = [ + static_basic_check, + static_cpu_check, + static_cuda_check, + static_dml_check, + static_dnnl_check, + static_trt_check, +] + + +def dynamic_basic_check(config, optype, execution_provider, quant_format=None): + if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) + ) + ) + + supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + return config + + +def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CPUExecutionProvider": + return config + # TODO: add constraints for other EP + if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: + setattr(config, "per_channel", False) + return config + + +def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CUDAExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_dml_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DmlExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): + if execution_provider != 
"DnnlExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_trt_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "TensorrtExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +DYNAMIC_CHECK_FUNC_LIST = [ + dynamic_basic_check, + dynamic_cpu_check, + dynamic_cuda_check, + dynamic_dml_check, + dynamic_dnnl_check, + dynamic_trt_check, +] + @register_config(algo_name=constants.STATIC_QUANT, priority=constants.PRIORITY_STATIC_QUANT) -class StaticQuantConfig(BaseConfig, quantization.StaticQuantConfig): +class StaticQuantConfig(BaseConfig, ort_quant.StaticQuantConfig): supported_configs: List[_OperatorConfig] = [] params_list: List[str] = [ @@ -1284,7 +1576,7 @@ def __init__( logger.warning( "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." ) - quantization.StaticQuantConfig.__init__( + ort_quant.StaticQuantConfig.__init__( self, calibration_data_reader=calibration_data_reader, calibrate_method=calibrate_method, @@ -1347,7 +1639,7 @@ def _post_init(self): params = self.get_params_dict() op_config = OperatorConfig(**params) - for valid_func in utility.STATIC_CHECK_FUNC_LIST: + for valid_func in STATIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) self.set_local(op_name_or_type, op_config) if isinstance(self.white_list, list) and len(self.white_list) > 0: @@ -1368,6 +1660,8 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name if ( isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 @@ -1388,8 +1682,6 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] - if op_type == "MatMul": - last_matmul = op_name for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] @@ -1468,7 +1760,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["GatherND", "GatherElements", "Gather"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1486,7 +1778,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["EmbedLayerNormalization"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1504,7 +1796,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["Conv", "MatMul", "Gemm", "FusedConv"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1553,7 +1845,7 @@ def register_supported_configs(cls) -> None: "Mul", "ArgMax", ], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) cls.supported_configs = supported_configs @@ -1687,7 +1979,7 @@ def get_default_sq_config() -> SmoothQuantConfig: @register_config(algo_name=constants.DYNAMIC_QUANT, 
priority=constants.PRIORITY_DYNAMIC_QUANT) -class DynamicQuantConfig(BaseConfig, quantization.DynamicQuantConfig): +class DynamicQuantConfig(BaseConfig, ort_quant.DynamicQuantConfig): """This is a class for dynamic Quant Configuration. Inherit from DynamicQuantConfig: @@ -1732,7 +2024,7 @@ def __init__( logger.warning( "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." ) - quantization.DynamicQuantConfig.__init__( + ort_quant.DynamicQuantConfig.__init__( self, weight_type=weight_type, op_types_to_quantize=op_types_to_quantize, @@ -1776,7 +2068,7 @@ def _post_init(self): for op_name_or_type in self.op_types_to_quantize: params = self.get_params_dict() op_config = OperatorConfig(**params) - for valid_func in utility.DYNAMIC_CHECK_FUNC_LIST: + for valid_func in DYNAMIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider) self.set_local(op_name_or_type, op_config) if isinstance(self.white_list, list) and len(self.white_list) > 0: @@ -1793,10 +2085,11 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - self._config_mapping.update(config.get_model_params_dict()) # update node level setting - global_config = config.global_config op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name if ( isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 @@ -1817,8 +2110,6 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] - if op_type == "MatMul": - last_matmul = op_name for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] @@ -1888,7 +2179,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["FusedConv", "Conv", "EmbedLayerNormalization"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1901,7 +2192,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["MatMul"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1914,7 +2205,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["Gather", "Attention", "LSTM"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) cls.supported_configs = supported_configs diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index 62a671fba..87051221b 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,7 +15,6 @@ from typing import List, Union # isort: skip import onnx -from onnxruntime.quantization import matmul_4bits_quantizer from onnx_neural_compressor.quantization import matmul_nbits_quantizer @@ -33,7 +32,7 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude=None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, providers: List[str] = 
["CPUExecutionProvider"], ): super().__init__( diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index b41c56270..80cf892c5 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -19,20 +19,36 @@ import onnx import onnxruntime as ort -from onnxruntime.quantization import matmul_4bits_quantizer -from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config -class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + + Args: + algorithm: + weight only quantize algorithm name. + """ + self.algorithm = algorithm + + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__(self, ratios=None, layer_wise_quant=False): - super().__init__(ratios=ratios) + super().__init__( + algorithm="RTN", + ) + if ratios is None: + ratios = {} + self.ratios = ratios self.layer_wise_quant = layer_wise_quant -class GPTQWeightOnlyQuantConfig(matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig): +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -45,17 +61,17 @@ def __init__( layer_wise_quant=False, ): super().__init__( - calibration_data_reader=calibration_data_reader, - percdamp=percdamp, - block_size=block_size, - actorder=actorder, - mse=mse, - perchannel=perchannel, + algorithm="GPTQ", ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel self.layer_wise_quant = layer_wise_quant - -class AWQWeightOnlyQuantConfig(matmul_4bits_quantizer.WeightOnlyQuantConfig): +class AWQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -85,7 +101,7 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude: List[str] = None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: WeightOnlyQuantConfig = None, n_bits: int = 4, providers: List[str] = ["CPUExecutionProvider"], optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, diff --git a/onnx_neural_compressor/quantization/quant_utils.py b/onnx_neural_compressor/quantization/quant_utils.py new file mode 100644 index 000000000..ca6612f80 --- /dev/null +++ b/onnx_neural_compressor/quantization/quant_utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import enum +import onnx + +class QuantType(enum.Enum): # pragma: no cover + """Represent QuantType value.""" + + QInt8 = 0 + QUInt8 = 1 + + @property + def tensor_type(self): + if self == QuantType.QInt8: + return onnx.TensorProto.INT8 + if self == QuantType.QUInt8: + return onnx.TensorProto.UINT8 + raise ValueError(f"Unexpected value qtype={self!r}.") + +class QuantFormat(enum.Enum): + QOperator = 0 + QDQ = 1 + +class CalibrationMethod(enum.Enum): + MinMax = 0 + Entropy = 1 + Percentile = 2 + Distribution = 3 diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index c90e16d38..d245145c2 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -20,7 +20,7 @@ import onnxruntime as ort from onnxruntime.quantization.quantize import QuantConfig -from onnx_neural_compressor import config +from onnx_neural_compressor.quantization import config from onnx_neural_compressor.quantization import algorithm_entry as algos @@ -28,7 +28,7 @@ def quantize( model_input: Union[str, pathlib.Path, onnx.ModelProto], model_output: Union[str, pathlib.Path], - quant_config: QuantConfig, + quant_config: config.BaseConfig, optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index a5caa4c35..100d8c3b3 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -24,7 +24,8 @@ import onnxruntime as ort from onnx import external_data_helper -from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index f1cf126d2..f92b8707a 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -25,7 +25,6 @@ import onnxruntime as ort import prettytable as pt import psutil -from onnxruntime import quantization from onnx_neural_compressor import constants, logger @@ -314,236 +313,15 @@ def auto_detect_ep(): return "CPUExecutionProvider" -def static_basic_check(config, optype, execution_provider, quant_format): - if quant_format == quantization.QuantFormat.QOperator: - if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) - ) - ) - supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) - ) - elif quant_format == quantization.QuantFormat.QDQ: - if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) - ) - ) - supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support 
{}.".format(optype, execution_provider, supported_optype) - ) +def trt_env_setup(model): + """Set environment variable for Tensorrt Execution Provider.""" + is_int8 = False + for node in model.graph.node: + if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: + is_int8 = True + break + if is_int8: + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" else: - raise ValueError( - "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) - ) - return config - - -def static_cpu_check(config, optype, execution_provider, quant_format): - if execution_provider != "CPUExecutionProvider": - return config - - # only support per-tensor - if optype in [ - "EmbedLayerNormalization", - "Relu", - "Clip", - "LeakyRelu", - "Sigmoid", - "MaxPool", - "GlobalAveragePool", - "Pad", - "Split", - "Squeeze", - "Reshape", - "Concat", - "AveragePool", - "Tile", - "Unsqueeze", - "Transpose", - "Resize", - "Abs", - "Shrink", - "Sign", - "Attention", - "Flatten", - "Expand", - "Slice", - "Mod", - "ReduceMax", - "ReduceMin", - "CenterCropPad", - "Add", - "Mul", - "ArgMax", - ]: - setattr(config, "per_channel", False) - - if optype in ["Attention"]: - setattr(config, "activation_type", onnx.TensorProto.UINT8) - return config - - -def static_cuda_check(config, optype, execution_provider, quant_format): - if execution_provider != "CUDAExecutionProvider": - return config - - # only support per-tensor - if optype in [ - "EmbedLayerNormalization", - "Relu", - "Clip", - "LeakyRelu", - "Sigmoid", - "MaxPool", - "GlobalAveragePool", - "Pad", - "Split", - "Squeeze", - "Reshape", - "Concat", - "AveragePool", - "Tile", - "Unsqueeze", - "Transpose", - "Resize", - "Abs", - "Shrink", - "Sign", - "Attention", - "Flatten", - "Expand", - "Slice", - "Mod", - "ReduceMax", - "ReduceMin", - "CenterCropPad", - "Add", - "Mul", - "ArgMax", - ]: - setattr(config, "per_channel", False) - - if optype in ["Attention"]: - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "weight_type", onnx.TensorProto.INT8) - return config - - -def static_dml_check(config, optype, execution_provider, quant_format): - if execution_provider != "DmlExecutionProvider": - return config - - # only support per-tensor - if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: - setattr(config, "per_channel", False) - return config - - -def static_dnnl_check(config, optype, execution_provider, quant_format): - if execution_provider != "DnnlExecutionProvider": - return config - - # current configurations are same as CPU EP - return static_cpu_check(config, optype, execution_provider, quant_format) - - -def static_trt_check(config, optype, execution_provider, quant_format): - if execution_provider != "TensorrtExecutionProvider": - return config - - # only support S8S8 - if optype in ["Conv", "MatMul", "Gather", "Gemm"]: - setattr(config, "weight_type", onnx.TensorProto.INT8) - setattr(config, "weight_sym", True) - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "activation_sym", True) - setattr(config, "per_channel", [False, True]) - else: - setattr(config, "weight_type", onnx.TensorProto.INT8) - setattr(config, "weight_sym", True) - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "activation_sym", True) - return config - - -STATIC_CHECK_FUNC_LIST = [ - static_basic_check, - static_cpu_check, - static_cuda_check, - static_dml_check, - static_dnnl_check, - static_trt_check, -] - - -def dynamic_basic_check(config, optype, 
execution_provider, quant_format=None): - if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) - ) - ) - - supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) - ) - return config - - -def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "CPUExecutionProvider": - return config - # TODO: add constraints for other EP - if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: - setattr(config, "per_channel", False) - return config - - -def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "CUDAExecutionProvider": - return config - # current configurations are same as CPU EP - return dynamic_cpu_check(config, optype, execution_provider, quant_format) - - -def dynamic_dml_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "DmlExecutionProvider": - return config - - # don't support dynamic quantization - return None - - -def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "DnnlExecutionProvider": - return config - # current configurations are same as CPU EP - return dynamic_cpu_check(config, optype, execution_provider, quant_format) - - -def dynamic_trt_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "TensorrtExecutionProvider": - return config - - # don't support dynamic quantization - return None - + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" -DYNAMIC_CHECK_FUNC_LIST = [ - dynamic_basic_check, - dynamic_cpu_check, - dynamic_cuda_check, - dynamic_dml_check, - dynamic_dnnl_check, - dynamic_trt_check, -] diff --git a/requirements.txt b/requirements.txt index 216af6eff..7e4911f78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ py-cpuinfo pydantic transformers prettytable -scipy \ No newline at end of file +scipy diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index af0bca3e4..7e14d83d7 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -10,9 +10,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/post_training_quant/test_post_training_quant.py b/test/quantization/post_training_quant/test_post_training_quant.py new file mode 100644 index 000000000..b6de12bf2 --- /dev/null +++ b/test/quantization/post_training_quant/test_post_training_quant.py @@ -0,0 +1,204 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import glob +import os +import shutil +import unittest +from unittest import mock + +import numpy as np +import onnx +import onnxruntime as ort +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config + +from typing import Callable, Dict, List, Optional, Union # isort: skip + + +def fake_eval(model, eval_result_lst): + acc = eval_result_lst.pop(0) + return acc + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self, model): + model = onnx.load(model) + batch_size = 1 + sequence_length = 1 + self.data = { + "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), + "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), + } + for inp in model.graph.input: + if inp.name in self.data: + continue + if inp.name == "position_ids": + # model is exported with optimum >= 1.14.0 with new input 'position_ids' + self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") + + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter([self.data]) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def _count_op_num(model, optype): + num = 0 + for node in model.graph.node: + if node.op_type == optype: + num += 1 + return num + + +class TestStaticQuant(unittest.TestCase): + + @classmethod + def setUpClass(self): + main_export( + "hf-internal-testing/tiny-random-gptj", + output="model", + ) + self.model = glob.glob(os.path.join("./model", "*.onnx"))[0] + self.data_reader = DataReader(self.model) + + @classmethod + def tearDownClass(self): + shutil.rmtree("./model", ignore_errors=True) + os.remove("quant.onnx") + + def test_static_quant(self): + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=True, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + qmatmul_num_enable_last = _count_op_num(q_model, "QLinearMatMul") + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + node_num_basic = len(q_model.graph.node) + qmatmul_num_disable_last = _count_op_num(q_model, "QLinearMatMul") + + # check quant_last_matmul work + self.assertEqual(qmatmul_num_enable_last, qmatmul_num_disable_last + 1) + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": 
False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + q_model = onnx.load("quant.onnx") + node_num_extended = len(q_model.graph.node) + + + # check graph optimization work + self.assertGreater(node_num_basic, node_num_extended) + + + # check op_types_to_quantize work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearAdd"), 0) + self.assertGreater(_count_op_num(q_model, "QLinearMatMul"), 0) + + # check nodes_to_quantize work + quantizable_matmuls = [i.name.split("_quant")[0] for i in q_model.graph.node if i.op_type == "QLinearMatMul"] + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_quantize=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), 1) + + # check nodes_to_exclude work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_exclude=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), qmatmul_num_disable_last - 1) + + + def test_dynamic_quant(self): + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 051b6cd73..dd6ddf0db 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -24,8 +24,8 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning from typing import Callable, Dict, List, Optional, Union # isort: 
skip @@ -418,6 +418,7 @@ def test_static_default_auto_tune(self): def test_static_custom_auto_tune(self): partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + custom_tune_config = tuning.TuningConfig( config_set=config.StaticQuantConfig( per_channel=[True, False], diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index ec9411b45..cf38c55de 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -7,9 +7,9 @@ import onnx from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger, quantization, utility +from onnx_neural_compressor import logger, quantization, utility from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor.quantization import config, tuning def find_onnx_file(folder_path): @@ -179,12 +179,12 @@ def test_static_quant_config(self): elif idx in [1, 5]: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) if idx < 4: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) else: self.assertFalse("add" in configs_mapping) if idx in [0, 1]: self.assertEqual( - configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + configs_mapping["Matmul"]["calibrate_method"], 0 ) self.assertLess(idx, 16) @@ -218,10 +218,10 @@ def test_static_quant_config(self): if "Matmul" in configs_mapping: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) self.assertEqual( - configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + configs_mapping["Matmul"]["calibrate_method"], 0 ) if "add" in configs_mapping: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertLess(idx, 16) for execution_provider in ["TensorrtExecutionProvider"]: @@ -240,8 +240,8 @@ def test_static_quant_config(self): elif idx in [1, 5]: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) if "add" in configs_mapping: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertTrue(configs_mapping["add"]["weight_sym"]) self.assertTrue(configs_mapping["add"]["activation_sym"]) if "Matmul" in configs_mapping: @@ -265,7 +265,7 @@ def test_static_custom_quant_config(self): self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertLess(idx, 2) @@ -299,7 +299,7 @@ def test_static_custom_quant_config(self): model_info = quant_config.get_model_info(model=self.simple_onnx_model) configs_mapping = quant_config.to_config_mapping(model_info=model_info) self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) 
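# --- Editorial note, not part of the patch: the literal 0 that replaces
# quantization.CalibrationMethod.MinMax in the assertions above and below appears to
# correspond to the MinMax member of the CalibrationMethod enum added in
# onnx_neural_compressor/quantization/quant_utils.py earlier in this patch.
# A small self-contained illustration of that mapping:
import enum

class CalibrationMethod(enum.Enum):  # mirrors the new quant_utils definition
    MinMax = 0
    Entropy = 1
    Percentile = 2
    Distribution = 3

assert CalibrationMethod.MinMax.value == 0  # the value these tests now assert against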
self.assertLess(idx, 4) for execution_provider in ["TensorrtExecutionProvider"]: @@ -318,7 +318,7 @@ def test_static_custom_quant_config(self): self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertTrue(configs_mapping["add"]["weight_sym"]) self.assertTrue(configs_mapping["add"]["activation_sym"]) self.assertTrue(configs_mapping["Matmul"]["weight_sym"]) diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 52f4bd8b3..73b5fa84b 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -22,10 +22,9 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader -from onnx_neural_compressor.quantization import QuantType +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.quantization import config, quantize, QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import quantize class DataReader(data_reader.CalibrationDataReader): diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 2d918cc61..e1c23d495 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -8,9 +8,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index 133e11fd1..1e674b7dd 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -8,9 +8,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py index 86b3c49a3..aa3672d0c 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -6,9 +6,9 @@ from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger +from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git 
a/test/utils/test_general.py b/test/utils/test_general.py index 32cb80087..47e863561 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -2,8 +2,8 @@ import unittest -from onnx_neural_compressor import config, constants, logger -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import constants, logger +from onnx_neural_compressor.quantization import config, tuning from typing import Any, Callable, List, Optional, Tuple, Union # isort: skip @@ -217,6 +217,94 @@ def test_mixed_two_algos(self): self.assertIn(OP1_NAME, config_mapping) self.assertIn(OP2_NAME, config_mapping) + def test_config_expand(self) -> None: + cfg = config.RTNConfig(weight_bits=[4,8], weight_sym=[True, False], layer_wise_quant=[True, False], providers=[["CPU"], ["CUDA"]]) + expand_cfgs = cfg.expand() + self.assertEqual(expand_cfgs[0].weight_bits, 4) + self.assertEqual(expand_cfgs[0].weight_sym, True) + self.assertEqual(expand_cfgs[0].layer_wise_quant, True) + self.assertEqual(expand_cfgs[0].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[1].weight_bits, 8) + self.assertEqual(expand_cfgs[1].weight_sym, True) + self.assertEqual(expand_cfgs[1].layer_wise_quant, True) + self.assertEqual(expand_cfgs[1].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[2].weight_bits, 4) + self.assertEqual(expand_cfgs[2].weight_sym, False) + self.assertEqual(expand_cfgs[2].layer_wise_quant, True) + self.assertEqual(expand_cfgs[2].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[3].weight_bits, 8) + self.assertEqual(expand_cfgs[3].weight_sym, False) + self.assertEqual(expand_cfgs[3].layer_wise_quant, True) + self.assertEqual(expand_cfgs[3].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[4].weight_bits, 4) + self.assertEqual(expand_cfgs[4].weight_sym, True) + self.assertEqual(expand_cfgs[4].layer_wise_quant, True) + self.assertEqual(expand_cfgs[4].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[5].weight_bits, 8) + self.assertEqual(expand_cfgs[5].weight_sym, True) + self.assertEqual(expand_cfgs[5].layer_wise_quant, True) + self.assertEqual(expand_cfgs[5].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[6].weight_bits, 4) + self.assertEqual(expand_cfgs[6].weight_sym, False) + self.assertEqual(expand_cfgs[6].layer_wise_quant, True) + self.assertEqual(expand_cfgs[6].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[7].weight_bits, 8) + self.assertEqual(expand_cfgs[7].weight_sym, False) + self.assertEqual(expand_cfgs[7].layer_wise_quant, True) + self.assertEqual(expand_cfgs[7].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[8].weight_bits, 4) + self.assertEqual(expand_cfgs[8].weight_sym, True) + self.assertEqual(expand_cfgs[8].layer_wise_quant, False) + self.assertEqual(expand_cfgs[8].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[9].weight_bits, 8) + self.assertEqual(expand_cfgs[9].weight_sym, True) + self.assertEqual(expand_cfgs[9].layer_wise_quant, False) + self.assertEqual(expand_cfgs[9].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[10].weight_bits, 4) + self.assertEqual(expand_cfgs[10].weight_sym, False) + self.assertEqual(expand_cfgs[10].layer_wise_quant, False) + self.assertEqual(expand_cfgs[10].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[11].weight_bits, 8) + self.assertEqual(expand_cfgs[11].weight_sym, False) + self.assertEqual(expand_cfgs[11].layer_wise_quant, False) + self.assertEqual(expand_cfgs[11].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[12].weight_bits, 4) + self.assertEqual(expand_cfgs[12].weight_sym, 
True) + self.assertEqual(expand_cfgs[12].layer_wise_quant, False) + self.assertEqual(expand_cfgs[12].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[13].weight_bits, 8) + self.assertEqual(expand_cfgs[13].weight_sym, True) + self.assertEqual(expand_cfgs[13].layer_wise_quant, False) + self.assertEqual(expand_cfgs[13].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[14].weight_bits, 4) + self.assertEqual(expand_cfgs[14].weight_sym, False) + self.assertEqual(expand_cfgs[14].layer_wise_quant, False) + self.assertEqual(expand_cfgs[14].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[15].weight_bits, 8) + self.assertEqual(expand_cfgs[15].weight_sym, False) + self.assertEqual(expand_cfgs[15].layer_wise_quant, False) + self.assertEqual(expand_cfgs[15].providers, ["CUDA"]) + + + def test_config_expand_with_empty_options(self): + configs = FakeAlgoConfig(weight_dtype=["int", "float32"], weight_bits=[]) + configs_list = configs.expand() + self.assertEqual(len(configs_list), 2) class TestConfigSet(unittest.TestCase): @@ -259,5 +347,6 @@ def test_config_loader_skip_verified_config(self) -> None: self.assertEqual(config_count, 2) + if __name__ == "__main__": unittest.main() diff --git a/test/utils/test_param.py b/test/utils/test_param.py index fd8b7d3d3..5c04ccffc 100644 --- a/test/utils/test_param.py +++ b/test/utils/test_param.py @@ -3,7 +3,7 @@ import unittest from typing import List -from onnx_neural_compressor import config +from onnx_neural_compressor.quantization import config class TestTuningParam(unittest.TestCase): @@ -20,6 +20,7 @@ def test_is_tunable_recursive(self): self.assertTrue(param.is_tunable([[5, 6], [7, 8]])) # TODO: double check if this is the expected behavior self.assertTrue(param.is_tunable([[5, 6], [7, "8"]])) + self.assertEqual(str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None).") if __name__ == "__main__":