
Commit: refine code
Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 committed Jun 25, 2024
1 parent 7cc43a9 commit 7b03794
Showing 39 changed files with 1,430 additions and 1,017 deletions.
@@ -30,8 +30,8 @@
 from PIL import Image
 from sklearn import metrics

-from onnx_neural_compressor import config, data_reader, quantization
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import data_reader, quantization
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
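The same import refactor recurs in each example script below: the config module has moved from the package root into the quantization subpackage. A minimal sketch of the new import pattern, taken verbatim from the hunks in this commit:

    from onnx_neural_compressor import data_reader, quantization
    from onnx_neural_compressor.quantization import config, tuning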
3 changes: 1 addition & 2 deletions examples/nlp/bert/quantization/ptq_dynamic/main.py
@@ -34,8 +34,7 @@
 from onnxruntime.transformers.fusion_options import FusionOptions
 from torch.utils import data

-from onnx_neural_compressor import config
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
4 changes: 2 additions & 2 deletions examples/nlp/bert/quantization/ptq_static/main.py
@@ -34,8 +34,8 @@
 from onnxruntime.transformers.fusion_options import FusionOptions
 from torch.utils import data

-from onnx_neural_compressor import config, data_reader, quantization
-from onnx_neural_compressor.quantization import tuning
+from onnx_neural_compressor import data_reader, quantization
+from onnx_neural_compressor.quantization import config, tuning

 logger = logging.getLogger(__name__)
 logging.basicConfig(
@@ -33,8 +33,8 @@
 from torch.nn import functional
 from torch.utils import data

-from onnx_neural_compressor import config, data_reader, logger, utility
-from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning
+from onnx_neural_compressor import data_reader, logger, utility
+from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning

 logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN
13 changes: 11 additions & 2 deletions onnx_neural_compressor/algorithms/layer_wise/core.py
@@ -23,7 +23,6 @@
 import onnxruntime as ort

 from onnx_neural_compressor import data_reader, logger, onnx_model
-from onnx_neural_compressor.algorithms import utility as quant_utils

 from typing import Callable, List, Union  # isort: skip
@@ -48,7 +47,7 @@ def layer_wise_quant(
         _type_: _description_
     """
     # check whether model shape is inferred
-    if not quant_utils.check_model_with_infer_shapes(model):
+    if not _check_model_with_infer_shapes(model):
        logger.error(
            "Before applying layer-wise quantization, please make sure to "
            "run symbolic shape inference on your model like follows:\n"
@@ -276,3 +275,13 @@ def _prepare_data_reader_for_next_split_model(
         inputs.update({name: value for name, value in zip(output_names, out)})
         data_reader_for_next_split_model.append(inputs)
     return DataReader(data_reader_for_next_split_model)
+
+def _check_model_with_infer_shapes(model):
+    """Check if the model has been shape inferred."""
+    if isinstance(model, (pathlib.Path, str)):
+        model = onnx.load(model, load_external_data=False)
+    elif isinstance(model, onnx_model.ONNXModel):
+        model = model.model
+    if len(model.graph.value_info) > 0:
+        return True
+    return False
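The error path above tells users to run symbolic shape inference first, and the new module-private _check_model_with_infer_shapes helper (replacing the one imported from quant_utils) only tests that graph.value_info is non-empty. A minimal sketch of one way to satisfy that check with onnxruntime's shape-inference tool; the file names are placeholders:

    import onnx
    from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

    # populate graph.value_info, which _check_model_with_infer_shapes tests for
    model = onnx.load("model.onnx")
    inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
    onnx.save(inferred, "model_infer_shape.onnx")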
20 changes: 8 additions & 12 deletions onnx_neural_compressor/algorithms/post_training_quant/calibrate.py
@@ -28,7 +28,6 @@
 import numpy as np
 import onnx
 import onnxruntime
-from onnxruntime import quantization as ort_quant
 from packaging import version

 from onnx_neural_compressor import logger, onnx_model
@@ -279,9 +278,9 @@ def _collect_data(inputs):
                 node_name = name_to_node[node_output_names[output_idx]]
                 if node_output_names[output_idx] not in name_to_calibrator:
                     calib_method = (
-                        q_config[node_name]["calibrate_method"].name
+                        q_config[node_name]["calibrate_method"]
                         if q_config and node_name in q_config
-                        else ort_quant.CalibrationMethod.MinMax.name
+                        else 0
                     )
                     assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format(
                         calib_method
@@ -294,7 +293,7 @@ def _collect_data(inputs):
                     # the calibration method is minmax, otherwise the tensor data is collected.
                     # TODO: for entropy and percentile method, need to support range collection
                     # per iteration in the future.
-                    if _calibrator.method_name == ort_quant.CalibrationMethod.MinMax.name:
+                    if _calibrator.method_name == "MinMax":
                         _calibrator.collect(output)
                         activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)]
                         name_to_calibrator[node_output_names[output_idx]] = _calibrator
@@ -325,9 +324,9 @@ def _collect_data(inputs):
             if any([data.dtype in [bool] for data in datas]):  # output type of some ops is bool, skip
                 continue
             calib_method = (
-                q_config[node_name]["calibrate_method"].name
+                q_config[node_name]["calibrate_method"]
                 if q_config and node_name in q_config
-                else ort_quant.CalibrationMethod.MinMax.name
+                else 0
             )
             _calibrator = calibrator.CALIBRATOR[calib_method]()
             _calibrator.collect(datas)
@@ -396,9 +395,7 @@ def get_weight_tensors_calib_range(self):
                     os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else ""
                 ),
             )
-            _calibrator = calibrator.CALIBRATOR[
-                ort_quant.CalibrationMethod.MinMax.name
-            ]()  # use minmax method to calibrate initializer tensors
+            _calibrator = calibrator.CALIBRATOR[0]()  # use minmax method to calibrate initializer tensors
             if initializer_tensor.flatten().size > 0:
                 _calibrator.collect(initializer_tensor)
                 weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)]
@@ -598,13 +595,12 @@ def calculate_quantization_params(self, q_config, quantization_thresholds):
                 node_thresholds[1],
                 sym,
                 qType,
-                quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym),
             )
             quantization_params[tensor_name] = node_params

         return quantization_params

-    def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range):
+    def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType):
         """Given the source and destination node of tensor, return calculated zero point and scales."""
         zp_and_scale = []
         # adjust rmin and rmax such that 0 is included in the range. This is required
@@ -640,7 +636,7 @@ def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range):
             rmin = min(rmin, clip_params[0], clip_params[1])
             rmax = max(rmax, clip_params[0], clip_params[1])

-        scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, quantize_range, qType, sym)
+        scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range)
         zp_and_scale.append(zp)
         zp_and_scale.append(scale)
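With quantize_range dropped from the signatures above, calculate_scale_zp is now assumed to derive the integer range from qType and reduce_range internally rather than receiving it from the caller. A minimal sketch of the standard min-max affine mapping such a helper computes; qmin and qmax stand in for the range derived from qType, and the function name is a placeholder, not the library's exact code:

    # sketch of min-max scale/zero-point selection
    def calculate_scale_zp_sketch(rmin, rmax, qmin, qmax, sym):
        if sym:
            # symmetric: scale from the larger magnitude, zero point mid-range
            max_range = max(abs(rmin), abs(rmax))
            scale = (max_range * 2) / (qmax - qmin) if max_range > 0 else 1.0
            zp = (qmax + qmin) // 2
        else:
            scale = (rmax - rmin) / (qmax - qmin) if rmax != rmin else 1.0
            zp = round(qmin - rmin / scale)
        return scale, zp

    # e.g. uint8 asymmetric (reduce_range would shrink qmin/qmax)
    print(calculate_scale_zp_sketch(-1.0, 1.0, 0, 255, sym=False))  # (~0.00784, 128)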
@@ -36,7 +36,7 @@ def decorator_calib(cls):
         ), "The name of subclass of Calibrator should end with 'Calibrator' substring."
         if cls.__name__[: -len("Calibrator")] in CALIBRATOR:  # pragma: no cover
             raise ValueError("Cannot have two operators with the same name.")
-        CALIBRATOR[calib_method.strip()] = cls
+        CALIBRATOR[calib_method] = cls
         return cls

     return decorator_calib
@@ -69,7 +69,7 @@ def calib_range(self):
         return self._calib_min, self._calib_max


-@calib_registry(calib_method="MinMax")
+@calib_registry(calib_method=0)
 class MinMaxCalibrator(CalibratorBase):
     """MinMax calibrator class."""
@@ -109,7 +109,7 @@ def method_name(self):
         return "MinMax"


-@calib_registry(calib_method="Percentile")
+@calib_registry(calib_method=2)
 class PercentileCalibrator(CalibratorBase):
     """Percentile calibrator class.
@@ -163,7 +163,7 @@ def method_name(self):
         return "Percentile"


-@calib_registry(calib_method="Entropy")
+@calib_registry(calib_method=1)
 class EntropyCalibrator(CalibratorBase):
     """Entropy calibrator class.
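Taken together, the decorator changes above re-key the CALIBRATOR registry by integer method codes instead of stripped strings: 0 for MinMax, 1 for Entropy, 2 for Percentile. A self-contained sketch of the resulting registration and lookup behavior (the class body is a stand-in for the real calibrator):

    CALIBRATOR = {}

    def calib_registry(calib_method):
        def decorator_calib(cls):
            CALIBRATOR[calib_method] = cls
            return cls
        return decorator_calib

    @calib_registry(calib_method=0)
    class MinMaxCalibrator:
        pass

    _calibrator = CALIBRATOR[0]()  # integer lookup replaces CALIBRATOR["MinMax"]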
@@ -13,9 +13,7 @@
 # limitations under the License.
 """Base Operator."""

-from onnxruntime import quantization
-
-from onnx_neural_compressor import constants
+from onnx_neural_compressor import constants, quantization

 OPERATORS = {
     "dynamic_quant": {},
@@ -56,7 +54,7 @@ def __init__(self, onnx_quantizer, onnx_node):
             True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False
         )
         self.per_channel = False
-        self.calibrate_method = quantization.CalibrationMethod.MinMax
+        self.calibrate_method = 0  # minmax
         self.weight_sym = True
         self.weight_dtype = None
         self.activation_dtype = None
@@ -57,27 +57,35 @@ def convert_check(self):

     def convert(self):
         """Convert to QOperator format."""
+        # DQ-Gather-Q-DQ-op
         node = self.node

         parents = self.quantizer.model.get_parents(node)
         children = self.quantizer.model.get_children(node)

         if any([i.op_type == "DequantizeLinear" for i in parents]):
-            from onnx import numpy_helper

             inputs = []
             inputs.append(parents[0].input[0])
             inputs.append(node.input[1])

-            gather_new_output = node.output[0] + "_quantized"
+            out_scale = 1.0
+            out_zp = 0
+            gather_new_output = node.output[0] + "_quantized"  # dynamic quant output name
+            for child in children:
+                if child.op_type == "QuantizeLinear":
+                    out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1]))
+                    out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2]))
+                    gather_new_output = children[0].output[0]  # static quant output name
+                    self.quantizer.remove_nodes.append(child)

             kwargs = {}
             for attribute in node.attribute:  # pragma: no cover
                 kwargs.update(quant_utils.attribute_to_kwarg(attribute))

             gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs)
             self.quantizer.new_nodes.append(gather_node)
-            if any([i.op_type != "QuantizeLinear" for i in children]):  # pragma: no cover
+            if any([i.op_type != "QuantizeLinear" for i in children]):
                 dq_inputs = []
                 dq_inputs.append(gather_new_output)
                 dq_inputs.extend(parents[0].input[1:])
@@ -86,25 +94,15 @@ def convert(self):
             )
             self.quantizer.new_nodes.append(dq_node)

-            out_scale = 1.0
-            out_zp = 0
-            for child in children:
-                if child.op_type == "QuantizeLinear":
-                    out_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[1]))
-                    out_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[2]))
-                    self.quantizer.remove_nodes.append(child)
-                    for n in self.quantizer.model.get_children(child):
-                        self.quantizer.model.replace_node_input(n, child.output[0], gather_new_output)
-
             # int8 weight will be recalculated for the first time
             if (
                 any([child.op_type == "QuantizeLinear" for child in children])
                 and self.quantizer.model.get_initializer(parents[0].input[0]) is not None
                 and parents[0].input[0] not in self.quantizer.recalculate_quantized_value
             ):
-                int8_tensor = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0]))
-                in_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1]))
-                in_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2]))
+                int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0]))
+                in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1]))
+                in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2]))
                 new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp
                 self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype))
                 self.quantizer.recalculate_quantized_value.append(parents[0].input[0])
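The weight-requantization line kept at the end of this hunk maps an int8 tensor from the incoming (scale, zero point) pair of the DequantizeLinear parent to the outgoing pair of the removed QuantizeLinear child: dequantize, then requantize. A small worked example of the same arithmetic with made-up values:

    import numpy as np

    int8_tensor = np.array([10, 20, 30], dtype=np.int8)
    in_scale, in_zp = 0.5, 0      # parameters of the incoming DequantizeLinear
    out_scale, out_zp = 0.25, 0   # parameters of the removed QuantizeLinear

    # same formula as the hunk: real values preserved at the new scale
    new_int8 = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp
    print(new_int8.astype(np.int8))  # [20 40 60]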
@@ -76,7 +76,10 @@ def convert(self):
             scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
             padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
             quantized_padding_constant_array = quant_utils.quantize_nparray(
-                self.weight_dtype, padding_constant_array, scale_value, zp_value
+                onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype),
+                padding_constant_array,
+                scale_value,
+                zp_value,
             )
             quantized_padding_constant_name = node.input[2] + "_quantized"
             quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
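The call above now passes a numpy dtype (via onnx.helper.tensor_dtype_to_np_dtype) rather than an ONNX tensor dtype enum. A minimal sketch of the affine quantization quant_utils.quantize_nparray is assumed to perform on the padding constant; the function name is a placeholder:

    import numpy as np

    def quantize_nparray_sketch(np_dtype, arr, scale, zero_point):
        # round to the integer grid, shift by the zero point, clip to dtype range
        info = np.iinfo(np_dtype)
        q = np.round(arr / scale) + zero_point
        return np.clip(q, info.min, info.max).astype(np_dtype)

    pad_const = np.array([0.5], dtype=np.float32)
    print(quantize_nparray_sketch(np.uint8, pad_const, scale=0.1, zero_point=128))  # [133]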
@@ -71,9 +71,10 @@ def convert(self):
         if len(node.input) > 1:  # pragma: no cover
             quantized_input_names.extend(node.input[1:])
         outputs = []
+        input_name_to_nodes = self.quantizer.model.input_name_to_nodes()
         for output in node.output:
-            if output in self.quantizer.model.input_name_to_nodes():
-                child = self.quantizer.model.input_name_to_nodes()[output][0]
+            if output in input_name_to_nodes:
+                child = input_name_to_nodes[output][0]
                 if child.op_type == "QuantizeLinear":
                     self.quantizer.remove_nodes.append(child)
                     outputs.append(child.output[0])
