
Commit: refine code
Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 committed Jun 25, 2024
1 parent 7cc43a9 commit 7b03794
Showing 39 changed files with 1,430 additions and 1,017 deletions.
@@ -30,8 +30,8 @@
 from PIL import Image
 from sklearn import metrics

-from onnx_neural_compressor import config, data_reader, quantization
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import data_reader, quantization
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
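The same import refactor recurs in each example script below: the config module has moved from the package root into the quantization subpackage. A minimal sketch of the new import pattern, taken verbatim from the hunks in this commit:

    from onnx_neural_compressor import data_reader, quantization
    from onnx_neural_compressor.quantization import config, tuning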
3 changes: 1 addition & 2 deletions examples/nlp/bert/quantization/ptq_dynamic/main.py
@@ -34,8 +34,7 @@
 from onnxruntime.transformers.fusion_options import FusionOptions
 from torch.utils import data

-from onnx_neural_compressor import config
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
4 changes: 2 additions & 2 deletions examples/nlp/bert/quantization/ptq_static/main.py
@@ -34,8 +34,8 @@
 from onnxruntime.transformers.fusion_options import FusionOptions
 from torch.utils import data

-from onnx_neural_compressor import config, data_reader, quantization
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import data_reader, quantization
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
@@ -33,8 +33,8 @@
 from torch.nn import functional
 from torch.utils import data

-from onnx_neural_compressor import config, data_reader, logger, utility
-from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning
+from onnx_neural_compressor import data_reader, logger, utility
+from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning

 logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN
13 changes: 11 additions & 2 deletions onnx_neural_compressor/algorithms/layer_wise/core.py
@@ -23,7 +23,6 @@
 import onnxruntime as ort

 from onnx_neural_compressor import data_reader, logger, onnx_model
-from onnx_neural_compressor.algorithms import utility as quant_utils

 from typing import Callable, List, Union  # isort: skip
@@ -48,7 +47,7 @@ def layer_wise_quant(
         _type_: _description_
     """
     # check whether model shape is inferred
-    if not quant_utils.check_model_with_infer_shapes(model):
+    if not _check_model_with_infer_shapes(model):
        logger.error(
            "Before applying layer-wise quantization, please make sure to "
            "run symbolic shape inference on your model like follows:\n"
@@ -276,3 +275,13 @@ def _prepare_data_reader_for_next_split_model(
         inputs.update({name: value for name, value in zip(output_names, out)})
         data_reader_for_next_split_model.append(inputs)
     return DataReader(data_reader_for_next_split_model)
+
+def _check_model_with_infer_shapes(model):
+    """Check if the model has been shape inferred."""
+    if isinstance(model, (pathlib.Path, str)):
+        model = onnx.load(model, load_external_data=False)
+    elif isinstance(model, onnx_model.ONNXModel):
+        model = model.model
+    if len(model.graph.value_info) > 0:
+        return True
+    return False
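The error path above tells users to run symbolic shape inference first, and the new module-private _check_model_with_infer_shapes helper (replacing the one imported from quant_utils) only tests that graph.value_info is non-empty. A minimal sketch of one way to satisfy that check with onnxruntime's shape-inference tool; the file names are placeholders:

    import onnx
    from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

    # populate graph.value_info, which _check_model_with_infer_shapes tests for
    model = onnx.load("model.onnx")
    inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
    onnx.save(inferred, "model_infer_shape.onnx")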
20 changes: 8 additions & 12 deletions onnx_neural_compressor/algorithms/post_training_quant/calibrate.py
@@ -28,7 +28,6 @@
 import numpy as np
 import onnx
 import onnxruntime
-from onnxruntime import quantization as ort_quant
 from packaging import version

 from onnx_neural_compressor import logger, onnx_model
@@ -279,9 +278,9 @@ def _collect_data(inputs):
                 node_name = name_to_node[node_output_names[output_idx]]
                 if node_output_names[output_idx] not in name_to_calibrator:
                     calib_method = (
-                        q_config[node_name]["calibrate_method"].name
+                        q_config[node_name]["calibrate_method"]
                         if q_config and node_name in q_config
-                        else ort_quant.CalibrationMethod.MinMax.name
+                        else 0
                     )
                     assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format(
                         calib_method
@@ -294,7 +293,7 @@ def _collect_data(inputs):
                     # the calibration method is minmax, otherwise the tensor data is collected.
                     # TODO: for entropy and percentile method, need to support range collection
                     # per iteration in the future.
-                    if _calibrator.method_name == ort_quant.CalibrationMethod.MinMax.name:
+                    if _calibrator.method_name == "MinMax":
                         _calibrator.collect(output)
                         activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)]
                         name_to_calibrator[node_output_names[output_idx]] = _calibrator
@@ -325,9 +324,9 @@ def _collect_data(inputs):
             if any([data.dtype in [bool] for data in datas]):  # output type of some ops is bool, skip
                 continue
             calib_method = (
-                q_config[node_name]["calibrate_method"].name
+                q_config[node_name]["calibrate_method"]
                 if q_config and node_name in q_config
-                else ort_quant.CalibrationMethod.MinMax.name
+                else 0
             )
             _calibrator = calibrator.CALIBRATOR[calib_method]()
             _calibrator.collect(datas)
@@ -396,9 +395,7 @@ def get_weight_tensors_calib_range(self):
                     os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else ""
                 ),
             )
-            _calibrator = calibrator.CALIBRATOR[
-                ort_quant.CalibrationMethod.MinMax.name
-            ]()  # use minmax method to calibrate initializer tensors
+            _calibrator = calibrator.CALIBRATOR[0]()  # use minmax method to calibrate initializer tensors
             if initializer_tensor.flatten().size > 0:
                 _calibrator.collect(initializer_tensor)
                 weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)]
@@ -598,13 +595,12 @@ def calculate_quantization_params(self, q_config, quantization_thresholds):
                 node_thresholds[1],
                 sym,
                 qType,
-                quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym),
             )
             quantization_params[tensor_name] = node_params

         return quantization_params

-    def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range):
+    def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType):
         """Given the source and destination node of tensor, return calculated zero point and scales."""
         zp_and_scale = []
         # adjust rmin and rmax such that 0 is included in the range. This is required
@@ -640,7 +636,7 @@ def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range):
             rmin = min(rmin, clip_params[0], clip_params[1])
             rmax = max(rmax, clip_params[0], clip_params[1])

-        scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, quantize_range, qType, sym)
+        scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range)
         zp_and_scale.append(zp)
         zp_and_scale.append(scale)
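With quantize_range dropped from the signatures above, calculate_scale_zp is now assumed to derive the integer range from qType and reduce_range internally rather than receiving it from the caller. A minimal sketch of the standard min-max affine mapping such a helper computes; qmin and qmax stand in for the range derived from qType, and the function name is a placeholder, not the library's exact code:

    # sketch of min-max scale/zero-point selection
    def calculate_scale_zp_sketch(rmin, rmax, qmin, qmax, sym):
        if sym:
            # symmetric: scale from the larger magnitude, zero point mid-range
            max_range = max(abs(rmin), abs(rmax))
            scale = (max_range * 2) / (qmax - qmin) if max_range > 0 else 1.0
            zp = (qmax + qmin) // 2
        else:
            scale = (rmax - rmin) / (qmax - qmin) if rmax != rmin else 1.0
            zp = round(qmin - rmin / scale)
        return scale, zp

    # e.g. uint8 asymmetric (reduce_range would shrink qmin/qmax)
    print(calculate_scale_zp_sketch(-1.0, 1.0, 0, 255, sym=False))  # (~0.00784, 128)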
@@ -36,7 +36,7 @@ def decorator_calib(cls):
         ), "The name of subclass of Calibrator should end with 'Calibrator' substring."
         if cls.__name__[: -len("Calibrator")] in CALIBRATOR:  # pragma: no cover
             raise ValueError("Cannot have two operators with the same name.")
-        CALIBRATOR[calib_method.strip()] = cls
+        CALIBRATOR[calib_method] = cls
         return cls

     return decorator_calib
@@ -69,7 +69,7 @@ def calib_range(self):
         return self._calib_min, self._calib_max


-@calib_registry(calib_method="MinMax")
+@calib_registry(calib_method=0)
 class MinMaxCalibrator(CalibratorBase):
     """MinMax calibrator class."""
@@ -109,7 +109,7 @@ def method_name(self):
         return "MinMax"


-@calib_registry(calib_method="Percentile")
+@calib_registry(calib_method=2)
 class PercentileCalibrator(CalibratorBase):
     """Percentile calibrator class.
@@ -163,7 +163,7 @@ def method_name(self):
         return "Percentile"


-@calib_registry(calib_method="Entropy")
+@calib_registry(calib_method=1)
 class EntropyCalibrator(CalibratorBase):
     """Entropy calibrator class.
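Taken together, the decorator changes above re-key the CALIBRATOR registry by integer method codes instead of stripped strings: 0 for MinMax, 1 for Entropy, 2 for Percentile. A self-contained sketch of the resulting registration and lookup behavior (the class body is a stand-in for the real calibrator):

    CALIBRATOR = {}

    def calib_registry(calib_method):
        def decorator_calib(cls):
            CALIBRATOR[calib_method] = cls
            return cls
        return decorator_calib

    @calib_registry(calib_method=0)
    class MinMaxCalibrator:
        pass

    _calibrator = CALIBRATOR[0]()  # integer lookup replaces CALIBRATOR["MinMax"]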
@@ -13,9 +13,7 @@
 # limitations under the License.
 """Base Operator."""

-from onnxruntime import quantization
-
-from onnx_neural_compressor import constants
+from onnx_neural_compressor import constants, quantization

 OPERATORS = {
     "dynamic_quant": {},
@@ -56,7 +54,7 @@ def __init__(self, onnx_quantizer, onnx_node):
             True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False
         )
         self.per_channel = False
-        self.calibrate_method = quantization.CalibrationMethod.MinMax
+        self.calibrate_method = 0  # minmax
         self.weight_sym = True
         self.weight_dtype = None
         self.activation_dtype = None
@@ -57,27 +57,35 @@ def convert_check(self):

     def convert(self):
         """Convert to QOperator format."""
+        # DQ-Gather-Q-DQ-op
         node = self.node

         parents = self.quantizer.model.get_parents(node)
         children = self.quantizer.model.get_children(node)

         if any([i.op_type == "DequantizeLinear" for i in parents]):
-            from onnx import numpy_helper

             inputs = []
             inputs.append(parents[0].input[0])
             inputs.append(node.input[1])

-            gather_new_output = node.output[0] + "_quantized"
+            out_scale = 1.0
+            out_zp = 0
+            gather_new_output = node.output[0] + "_quantized"  # dynamic quant output name
+            for child in children:
+                if child.op_type == "QuantizeLinear":
+                    out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1]))
+                    out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2]))
+                    gather_new_output = children[0].output[0]  # static quant output name
+                    self.quantizer.remove_nodes.append(child)

             kwargs = {}
             for attribute in node.attribute:  # pragma: no cover
                 kwargs.update(quant_utils.attribute_to_kwarg(attribute))

             gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs)
             self.quantizer.new_nodes.append(gather_node)
-            if any([i.op_type != "QuantizeLinear" for i in children]):  # pragma: no cover
+            if any([i.op_type != "QuantizeLinear" for i in children]):
                 dq_inputs = []
                 dq_inputs.append(gather_new_output)
                 dq_inputs.extend(parents[0].input[1:])
@@ -86,25 +94,15 @@ def convert(self):
             )
             self.quantizer.new_nodes.append(dq_node)

-            out_scale = 1.0
-            out_zp = 0
-            for child in children:
-                if child.op_type == "QuantizeLinear":
-                    out_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[1]))
-                    out_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[2]))
-                    self.quantizer.remove_nodes.append(child)
-                    for n in self.quantizer.model.get_children(child):
-                        self.quantizer.model.replace_node_input(n, child.output[0], gather_new_output)
-
             # int8 weight will be recalculated for the first time
             if (
                 any([child.op_type == "QuantizeLinear" for child in children])
                 and self.quantizer.model.get_initializer(parents[0].input[0]) is not None
                 and parents[0].input[0] not in self.quantizer.recalculate_quantized_value
             ):
-                int8_tensor = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0]))
-                in_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1]))
-                in_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2]))
+                int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0]))
+                in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1]))
+                in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2]))
                 new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp
                 self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype))
                 self.quantizer.recalculate_quantized_value.append(parents[0].input[0])
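The weight-requantization line kept at the end of this hunk maps an int8 tensor from the incoming (scale, zero point) pair of the DequantizeLinear parent to the outgoing pair of the removed QuantizeLinear child: dequantize, then requantize. A small worked example of the same arithmetic with made-up values:

    import numpy as np

    int8_tensor = np.array([10, 20, 30], dtype=np.int8)
    in_scale, in_zp = 0.5, 0      # parameters of the incoming DequantizeLinear
    out_scale, out_zp = 0.25, 0   # parameters of the removed QuantizeLinear

    # same formula as the hunk: real values preserved at the new scale
    new_int8 = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp
    print(new_int8.astype(np.int8))  # [20 40 60]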
@@ -76,7 +76,10 @@ def convert(self):
             scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
             padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
             quantized_padding_constant_array = quant_utils.quantize_nparray(
-                self.weight_dtype, padding_constant_array, scale_value, zp_value
+                onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype),
+                padding_constant_array,
+                scale_value,
+                zp_value,
             )
             quantized_padding_constant_name = node.input[2] + "_quantized"
             quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
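The call above now passes a numpy dtype (via onnx.helper.tensor_dtype_to_np_dtype) rather than an ONNX tensor dtype enum. A minimal sketch of the affine quantization quant_utils.quantize_nparray is assumed to perform on the padding constant; the function name is a placeholder:

    import numpy as np

    def quantize_nparray_sketch(np_dtype, arr, scale, zero_point):
        # round to the integer grid, shift by the zero point, clip to dtype range
        info = np.iinfo(np_dtype)
        q = np.round(arr / scale) + zero_point
        return np.clip(q, info.min, info.max).astype(np_dtype)

    pad_const = np.array([0.5], dtype=np.float32)
    print(quantize_nparray_sketch(np.uint8, pad_const, scale=0.1, zero_point=128))  # [133]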
@@ -71,9 +71,10 @@ def convert(self):
         if len(node.input) > 1:  # pragma: no cover
             quantized_input_names.extend(node.input[1:])
         outputs = []
+        input_name_to_nodes = self.quantizer.model.input_name_to_nodes()
         for output in node.output:
-            if output in self.quantizer.model.input_name_to_nodes():
-                child = self.quantizer.model.input_name_to_nodes()[output][0]
+            if output in input_name_to_nodes:
+                child = input_name_to_nodes[output][0]
                 if child.op_type == "QuantizeLinear":
                     self.quantizer.remove_nodes.append(child)
                     outputs.append(child.output[0])
