diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/main.py b/examples/image_recognition/resnet50/quantization/ptq_static/main.py index cc82d49b4..182c426a0 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/main.py +++ b/examples/image_recognition/resnet50/quantization/ptq_static/main.py @@ -30,8 +30,8 @@ from PIL import Image from sklearn import metrics -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/bert/quantization/ptq_dynamic/main.py b/examples/nlp/bert/quantization/ptq_dynamic/main.py index 2a2b97817..17cb3a5b2 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/main.py +++ b/examples/nlp/bert/quantization/ptq_dynamic/main.py @@ -34,8 +34,7 @@ from onnxruntime.transformers.fusion_options import FusionOptions from torch.utils import data -from onnx_neural_compressor import config -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/bert/quantization/ptq_static/main.py b/examples/nlp/bert/quantization/ptq_static/main.py index d1bb34b77..1984bed85 100644 --- a/examples/nlp/bert/quantization/ptq_static/main.py +++ b/examples/nlp/bert/quantization/ptq_static/main.py @@ -34,8 +34,8 @@ from onnxruntime.transformers.fusion_options import FusionOptions from torch.utils import data -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 9cafe62d3..7c18b5f35 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -33,8 +33,8 @@ from torch.nn import functional from torch.utils import data -from onnx_neural_compressor import config, data_reader, logger, utility -from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 1130981e7..aa71dd3aa 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -23,7 +23,6 @@ import onnxruntime as ort from onnx_neural_compressor import data_reader, logger, onnx_model -from onnx_neural_compressor.algorithms import utility as quant_utils from typing import Callable, List, Union # isort: skip @@ -48,7 +47,7 @@ def layer_wise_quant( _type_: _description_ """ # check whether model shape is inferred - if not 
quant_utils.check_model_with_infer_shapes(model): + if not _check_model_with_infer_shapes(model): logger.error( "Before applying layer-wise quantization, please make sure to " "run symbolic shape inference on your model like follows:\n" @@ -276,3 +275,13 @@ def _prepare_data_reader_for_next_split_model( inputs.update({name: value for name, value in zip(output_names, out)}) data_reader_for_next_split_model.append(inputs) return DataReader(data_reader_for_next_split_model) + +def _check_model_with_infer_shapes(model): + """Check if the model has been shape inferred.""" + if isinstance(model, (pathlib.Path, str)): + model = onnx.load(model, load_external_data=False) + elif isinstance(model, onnx_model.ONNXModel): + model = model.model + if len(model.graph.value_info) > 0: + return True + return False diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py index 40e3b9645..af14c3562 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py @@ -28,7 +28,6 @@ import numpy as np import onnx import onnxruntime -from onnxruntime import quantization as ort_quant from packaging import version from onnx_neural_compressor import logger, onnx_model @@ -279,9 +278,9 @@ def _collect_data(inputs): node_name = name_to_node[node_output_names[output_idx]] if node_output_names[output_idx] not in name_to_calibrator: calib_method = ( - q_config[node_name]["calibrate_method"].name + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config - else ort_quant.CalibrationMethod.MinMax.name + else 0 ) assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format( calib_method @@ -294,7 +293,7 @@ def _collect_data(inputs): # the calibration method is minmax, otherwise the tensor data is collected. # TODO: for entropy and percentile method, need to support range collection # per iteration in the future. 
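For context: the calibrate.py hunks above, and the calibrator.py hunks that follow, drop onnxruntime's CalibrationMethod enum in favor of plain integer codes as registry keys (0 for MinMax, 1 for Entropy, 2 for Percentile, per the calib_registry decorators below). A minimal sketch of how an integer-keyed lookup against such a registry behaves; the helper name resolve_calibrator and the standalone dict are illustrative only, not part of the patch:

# Integer calibration-method codes used by this patch.
CALIB_METHOD_NAMES = {0: "MinMax", 1: "Entropy", 2: "Percentile"}

def resolve_calibrator(registry, calib_method=0):
    # Instantiate the calibrator class registered under the integer code,
    # mirroring the "is not registered" check used in calibrate.py above.
    if calib_method not in registry:
        raise KeyError("Calibration method {} is not registered.".format(calib_method))
    return registry[calib_method]()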
- if _calibrator.method_name == ort_quant.CalibrationMethod.MinMax.name: + if _calibrator.method_name == "MinMax": _calibrator.collect(output) activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = _calibrator @@ -325,9 +324,9 @@ def _collect_data(inputs): if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip continue calib_method = ( - q_config[node_name]["calibrate_method"].name + q_config[node_name]["calibrate_method"] if q_config and node_name in q_config - else ort_quant.CalibrationMethod.MinMax.name + else 0 ) _calibrator = calibrator.CALIBRATOR[calib_method]() _calibrator.collect(datas) @@ -396,9 +395,7 @@ def get_weight_tensors_calib_range(self): os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else "" ), ) - _calibrator = calibrator.CALIBRATOR[ - ort_quant.CalibrationMethod.MinMax.name - ]() # use minmax method to calibrate initializer tensors + _calibrator = calibrator.CALIBRATOR[0]() # use minmax method to calibrate initializer tensors if initializer_tensor.flatten().size > 0: _calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)] @@ -598,13 +595,12 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): node_thresholds[1], sym, qType, - quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym), ) quantization_params[tensor_name] = node_params return quantization_params - def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType, quantize_range): + def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType): """Given the source and destination node of tensor, return calculated zero point and scales.""" zp_and_scale = [] # adjust rmin and rmax such that 0 is included in the range. This is required @@ -640,7 +636,7 @@ def calculate_scale_zeropoint(self, last_node, next_node, rmin, rmax, sym, qType rmin = min(rmin, clip_params[0], clip_params[1]) rmax = max(rmax, clip_params[0], clip_params[1]) - scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) + scale, zp = quant_utils.calculate_scale_zp(rmin, rmax, qType, sym, self.reduce_range) zp_and_scale.append(zp) zp_and_scale.append(scale) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py index 042518092..97506b0d2 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py @@ -36,7 +36,7 @@ def decorator_calib(cls): ), "The name of subclass of Calibrator should end with 'Calibrator' substring." if cls.__name__[: -len("Calibrator")] in CALIBRATOR: # pragma: no cover raise ValueError("Cannot have two operators with the same name.") - CALIBRATOR[calib_method.strip()] = cls + CALIBRATOR[calib_method] = cls return cls return decorator_calib @@ -69,7 +69,7 @@ def calib_range(self): return self._calib_min, self._calib_max -@calib_registry(calib_method="MinMax") +@calib_registry(calib_method=0) class MinMaxCalibrator(CalibratorBase): """MinMax calibrator class.""" @@ -109,7 +109,7 @@ def method_name(self): return "MinMax" -@calib_registry(calib_method="Percentile") +@calib_registry(calib_method=2) class PercentileCalibrator(CalibratorBase): """Percentile calibrator class. 
@@ -163,7 +163,7 @@ def method_name(self): return "Percentile" -@calib_registry(calib_method="Entropy") +@calib_registry(calib_method=1) class EntropyCalibrator(CalibratorBase): """Entropy calibrator class. diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py index c3c97617a..4efcfd71a 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py @@ -13,9 +13,7 @@ # limitations under the License. """Base Operator.""" -from onnxruntime import quantization - -from onnx_neural_compressor import constants +from onnx_neural_compressor import constants, quantization OPERATORS = { "dynamic_quant": {}, @@ -56,7 +54,7 @@ def __init__(self, onnx_quantizer, onnx_node): True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False ) self.per_channel = False - self.calibrate_method = quantization.CalibrationMethod.MinMax + self.calibrate_method = 0 # minmax self.weight_sym = True self.weight_dtype = None self.activation_dtype = None diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py index fd851885f..d18833adc 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py @@ -57,19 +57,27 @@ def convert_check(self): def convert(self): """Convert to QOperator format.""" + # DQ-Gather-Q-DQ-op node = self.node parents = self.quantizer.model.get_parents(node) children = self.quantizer.model.get_children(node) if any([i.op_type == "DequantizeLinear" for i in parents]): - from onnx import numpy_helper inputs = [] inputs.append(parents[0].input[0]) inputs.append(node.input[1]) - gather_new_output = node.output[0] + "_quantized" + out_scale = 1.0 + out_zp = 0 + gather_new_output = node.output[0] + "_quantized" # dynamic quant output name + for child in children: + if child.op_type == "QuantizeLinear": + out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) + out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) + gather_new_output = children[0].output[0] # static quant output name + self.quantizer.remove_nodes.append(child) kwargs = {} for attribute in node.attribute: # pragma: no cover @@ -77,7 +85,7 @@ def convert(self): gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) self.quantizer.new_nodes.append(gather_node) - if any([i.op_type != "QuantizeLinear" for i in children]): # pragma: no cover + if any([i.op_type != "QuantizeLinear" for i in children]): dq_inputs = [] dq_inputs.append(gather_new_output) dq_inputs.extend(parents[0].input[1:]) @@ -86,25 +94,15 @@ def convert(self): ) self.quantizer.new_nodes.append(dq_node) - out_scale = 1.0 - out_zp = 0 - for child in children: - if child.op_type == "QuantizeLinear": - out_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[1])) - out_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(child.input[2])) - self.quantizer.remove_nodes.append(child) - for n in self.quantizer.model.get_children(child): - self.quantizer.model.replace_node_input(n, child.output[0], gather_new_output) - # int8 weight will be recalculated for the first 
time if ( any([child.op_type == "QuantizeLinear" for child in children]) and self.quantizer.model.get_initializer(parents[0].input[0]) is not None and parents[0].input[0] not in self.quantizer.recalculate_quantized_value ): - int8_tensor = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) - in_scale = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) - in_zp = numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) + int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) + in_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) + in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py index 61f7efd9e..6ffe742b5 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py @@ -76,7 +76,10 @@ def convert(self): scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) quantized_padding_constant_array = quant_utils.quantize_nparray( - self.weight_dtype, padding_constant_array, scale_value, zp_value + onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), + padding_constant_array, + scale_value, + zp_value, ) quantized_padding_constant_name = node.input[2] + "_quantized" quantized_padding_constant_initializer = onnx.numpy_helper.from_array( diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py index 97bded14f..3192b51d1 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py @@ -71,9 +71,10 @@ def convert(self): if len(node.input) > 1: # pragma: no cover quantized_input_names.extend(node.input[1:]) outputs = [] + input_name_to_nodes = self.quantizer.model.input_name_to_nodes() for output in node.output: - if output in self.quantizer.model.input_name_to_nodes(): - child = self.quantizer.model.input_name_to_nodes()[output][0] + if output in input_name_to_nodes: + child = input_name_to_nodes[output][0] if child.op_type == "QuantizeLinear": self.quantizer.remove_nodes.append(child) outputs.append(child.output[0]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py index 4e8b815e5..2596e8791 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py @@ -21,7 +21,7 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import logger, onnx_model +from onnx_neural_compressor import logger, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 
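The split.py hunk above (and similar AWQ changes later in this patch) hoists model.input_name_to_nodes() out of the per-output loop so the tensor-name-to-consumers map is built once and reused. A small illustrative sketch of the pattern, assuming a model wrapper that exposes input_name_to_nodes() the way onnx_model.ONNXModel does; the helper name is hypothetical:

def consumers_of_outputs(model, node):
    # Build the map once instead of calling input_name_to_nodes() per output.
    input_name_to_nodes = model.input_name_to_nodes()
    children = {}
    for output in node.output:
        # Consumers of each output; empty list when the output feeds nothing.
        children[output] = input_name_to_nodes.get(output, [])
    return children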
@@ -150,17 +150,24 @@ def should_convert(self, node): def _postprocess(self): if "TensorrtExecutionProvider" in self.execution_provider: - quant_utils.trt_env_setup(self.model.model) + utility.trt_env_setup(self.model.model) self.merge_dedicated_qdq_pair() self.model.remove_unused_nodes() self.model.model.producer_name = quant_utils.__producer__ self.model.model.producer_version = quant_utils.__version__ + def _preprocess(self): + quant_utils.remove_init_from_model_input(self.model) + quant_utils.split_shared_bias(self.model) + def quantize_model(self): """Quantize onnx model.""" + self._preprocess() + # step 1: insert q-dq pairs self.insert_qdq() + self.remove_duplicate_qdq_paris() # step 2: convert q-node-dq to qoperator format if needed @@ -168,6 +175,7 @@ def quantize_model(self): self.convert_qdq_to_operator_oriented() self._postprocess() + quant_utils.dump_model_op_stats(self.model.model, self.config, self.op_types_to_quantize) return self.model.model def merge_dedicated_qdq_pair(self): @@ -430,25 +438,11 @@ def quantize_bias(self, bias_name, input_name, weight_name, beta=1.0): packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name) self.model.initializer().extend([packed_bias_zp_initializer]) - # log entries for this quantized bias value - quantized_bias_entry = quant_utils.QuantizedInitializer( - bias_name, - bias_initializer, - [0], - [0], - [0], - [bias_scale], - bias_data, - quantized_data, - qType=onnx.TensorProto.INT32, - ) - quantized_value = quant_utils.QuantizedValue( bias_name, quantized_bias_name, quantized_bias_scale_name, quantized_bias_zp_name, - quant_utils.QuantizedValueType.Initializer, None, onnx.TensorProto.INT32, ) @@ -476,9 +470,9 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_ax rmin, rmax, zero_point, scale, quantized_weights = quant_utils.quantize_data_per_channel( weights, channel_axis, - quant_utils.get_qmin_qmax_for_qType(weight_qType, self.reduce_range, sym), weight_qType, sym, + self.reduce_range, ) weight = quant_utils.QuantizedInitializer( @@ -500,7 +494,6 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_ax weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", - quant_utils.QuantizedValueType.Initializer, None, weight_qType, ) @@ -579,7 +572,7 @@ def tensor_proto_to_array(initializer, base_dir=""): raise ValueError( "Only float type quantization is supported. 
\ Weights {} is {}.".format( - initializer.name, quant_utils.dtype_to_name(quant_utils.dtype_mapping, initializer.data_type) + initializer.name, str(onnx.helper.tensor_dtype_to_np_dtype(initializer.data_type)), ) ) return weights @@ -636,9 +629,9 @@ def _get_quantized_weight(self, initializer, qType, sym): ) rmin, rmax, zero_point, scale, quantized_weights_data = quant_utils.quantize_data( weights_data.flatten().tolist(), - quant_utils.get_qmin_qmax_for_qType(qType, self.reduce_range, sym), qType, sym, + self.reduce_range, ) weight = quant_utils.QuantizedInitializer( name, @@ -752,7 +745,7 @@ def quantize_outputs(self, node, initializer_use_weight_qType=True, direct_int8= self.replace_input.append([child, tensor_name, dequant_node.output[0]]) if tensor_name not in self.quantized_value_map: quantized_value = quant_utils.QuantizedValue( - tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input + tensor_name, dq_output, scale_name, zp_name ) self.quantized_value_map[tensor_name] = quantized_value @@ -807,7 +800,6 @@ def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, q_weight_name, scale_name, zp_name, - quant_utils.QuantizedValueType.Initializer, None, dtype, ) @@ -999,7 +991,7 @@ def _quantize_activation(self, node, tensor_name, direct_int8=False): if tensor_name not in self.quantized_value_map: quantized_value = quant_utils.QuantizedValue( - tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input + tensor_name, dq_output, scale_name, zp_name, ) self.quantized_value_map[tensor_name] = quantized_value @@ -1041,7 +1033,11 @@ def __init__( def _quantize_activation(self, node, tensor_name, direct_int8=False): """Quantize node activation.""" - qlinear_node = self.model.find_node_by_name(tensor_name + "_QuantizeLinear", self.new_nodes, self.model.graph()) + qlinear_node = None + if quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.model.nodes()) + elif quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) is not None: + qlinear_node = quant_utils.find_by_name(tensor_name + "_QuantizeLinear", self.new_nodes) if qlinear_node is None: if ( self.fuse_dynamic_quant diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index ab902de07..bcf830f1a 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -28,17 +28,6 @@ from typing import List, Union # isort: skip -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - def _get_quant_dequant_output(model, input_data, output_data, providers): """Get loss between fp32 output and QDQ output. 
@@ -48,7 +37,7 @@ def _get_quant_dequant_output(model, input_data, output_data, providers): output_data (numpy.ndarray): fp32 output providers (list): execution provider """ - input_data = _quant_dequant_data(input_data, 2, "asym") + input_data = quant_utils.qdq_data(input_data, 2, False) sess = ort.InferenceSession(model.SerializeToString(), providers=providers) preds = sess.run(None, {model.graph.input[0].name: input_data}) loss = np.sum(np.abs(output_data - preds) ** 2) @@ -66,31 +55,22 @@ def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): opset (object): opset of the model ir_version (object): ir_version of the model """ - input = onnx.helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = onnx.helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) + input = onnx.helper.make_tensor_value_info( + node.input[0], + onnx.helper.np_dtype_to_tensor_dtype(input_data.dtype), + input_data.shape, + ) + output = onnx.helper.make_tensor_value_info( + node.output[0], + onnx.helper.np_dtype_to_tensor_dtype(output_data.dtype), + output_data.shape, + ) graph = onnx.helper.make_graph([node], "sub_graph", [input], [output], inits) model = onnx.helper.make_model(graph, opset_imports=opset) model.ir_version = ir_version return model -def _quant_dequant_data(data, qType=3, sym=True): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - sym (bool): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = quant_utils.quantize_data( - data.flatten().tolist(), - quant_utils.get_qmin_qmax_for_qType(qType, False, sym), - qType, - sym, - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - class Smoother: """Fake input channel quantization. @@ -386,7 +366,7 @@ def _get_output_loss(self, node_name, scale, calib_iter): ) base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) + weight_q = quant_utils.qdq_data(weight, 3, True) self.model.set_initializer(node.input[1], weight_q) inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 44496664f..f6a85c598 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -15,56 +15,49 @@ # See the License for the specific language governing permissions and # limitations under the License. 
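The smoother hunks above swap the hand-rolled _dtype_map and _quant_dequant_data helpers for onnx.helper's dtype converters and the shared quant_utils.qdq_data, and the utility.py changes that follow lean on the same converters. A minimal sketch of the two conversion directions (assumes an onnx release that ships these helpers, roughly 1.13+):

import numpy as np
import onnx

# numpy dtype -> TensorProto enum, replacing the removed _dtype_map
assert onnx.helper.np_dtype_to_tensor_dtype(np.dtype("float32")) == onnx.TensorProto.FLOAT

# TensorProto enum -> numpy dtype, as used when quantizing pad constants
assert onnx.helper.tensor_dtype_to_np_dtype(onnx.TensorProto.UINT8) == np.dtype("uint8")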
-import enum -import os -import pathlib +import numpy as np +from packaging import version import re import struct import sys from importlib import util - -import numpy as np -from onnxruntime.quantization import onnx_model -from packaging import version - -from onnx_neural_compressor import constants, logger, utility +from onnx_neural_compressor import constants, utility if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions -torch = utility.LazyImport("torch") -symbolic_shape_infer = utility.LazyImport("onnxruntime.tools.symbolic_shape_infer") onnx = utility.LazyImport("onnx") ort = utility.LazyImport("onnxruntime") - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - -QUANT_OP_NAME_SUFFIX = "_quant" __producer__ = "onnx.quantize" __version__ = "0.1.0" onnx_domain = "ai.onnx" ms_domain = "com.microsoft" +QUANT_OP_NAME_SUFFIX = "_quant" + + +def attribute_to_kwarg(attribute): + """Convert attribute to kwarg format for use with onnx.helper.make_node.""" + attribute_mapping = { + 1: attribute.f, + 2: attribute.i, + 3: attribute.s, + 4: attribute.t, + 5: attribute.g, + 6: attribute.floats, + 7: attribute.ints, + 8: attribute.strings, + 9: attribute.tensors, + 10: attribute.graphs, + } + if attribute.type in attribute_mapping: + value = attribute_mapping[attribute.type] + else: # pragma: no cover + raise ValueError( + "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) + ) + return {attribute.name: value} + ONNX_INT_TYPE_RANGE = { onnx.TensorProto.UINT8: (0, 255), @@ -80,23 +73,39 @@ onnx.TensorProto.INT8: (-64, 64), } +ONNX_STR_TYPE_RANGE = { + "int1": (-1, 0), + "int2": (-2, 1), + "int3": (-4, 3), + "int4": (-8, 7), # onnx >= 1.16.0 defines TensorProto.INT4 + "int5": (-16, 15), + "int6": (-32, 31), + "int7": (-64, 63), + "int8": (-128, 127), + "uint1": (0, 1), + "uint2": (0, 3), + "uint3": (0, 7), + "uint4": (0, 15), # onnx >= 1.16.0 defines TensorProto.UINT4 + "uint5": (0, 31), + "uint6": (0, 63), + "uint7": (0, 127), + "uint8": (0, 255), +} -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - if isinstance(model, (pathlib.Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, onnx_model.ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False +def _qType_to_np_type(qType): + if isinstance(qType, int): + return onnx.helper.tensor_dtype_to_np_dtype(qType) + elif isinstance(qType, str) and "uint" in qType: + return np.dtype("uint8") + else: + return np.dtype("int8") def find_by_name(name, item_list): """Helper function to find item by name in a list.""" items = [] for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover + assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) if item.name == name: items.append(item) if len(items) > 0: @@ -104,19 +113,22 @@ def find_by_name(name, item_list): else: return None - -def is_quantizable_type(data_type): - return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16] - - def get_qmin_qmax_for_qType(qType, 
reduce_range=False, sym=False):  # noqa: N802
-    """Get qmin, qmax for qType."""
+    """Get qmin, qmax for qType.
+
+    Args:
+        qType (int or str): an int for an ONNX-defined type, a str for a type ONNX does not define
+        reduce_range (bool, optional): whether to use the reduced 7-bit range for 8-bit quantization
+        sym (bool, optional): whether to use symmetric quantization. Defaults to False.
+    """
     if qType == onnx.TensorProto.FLOAT8E4M3FN:
         raise NotImplementedError("This function is not implemented for float 8 as not needed.")
     qrange = None
-    if reduce_range:
+    if isinstance(qType, str):
+        qrange = ONNX_STR_TYPE_RANGE.get(qType)
+    elif reduce_range:
         qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType)
     elif sym and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE:
         qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType]
@@ -124,14 +136,137 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802
         qrange = ONNX_INT_TYPE_RANGE.get(qType)
     if not qrange:
-        raise ValueError(f"Unexpected data type {qType} requested. Only INT8 and UINT8 are supported.")
+        raise ValueError(f"Unexpected data type {qType} requested.")
     return qrange
+
+def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None):
+    """Quantize numpy array."""
+    q_weight = np.empty_like(np.asarray(arr), dtype=scale.dtype)
+    np.divide(arr, scale, out=q_weight)
+    np.add(q_weight, zero_point, out=q_weight)
+    np.round(q_weight, out=q_weight)
+    if low is not None and high is not None:
+        np.clip(q_weight, low, high, out=q_weight)
+    return q_weight.astype(dtype)
+
+def quantize_data_per_channel(data, axis, qType, sym, reduce_range=False):
+    """Quantize tensor per-channel."""
+    quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    rmin = None
+    rmax = None
+    for i in range(len(data.shape)):
+        if i != axis:
+            rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True)
+            rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True)
+    rmin = np.minimum(rmin, 0)
+    rmax = np.maximum(rmax, 0)
+    scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range)
+
+    dtype = _qType_to_np_type(qType)
+    quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1])
+    return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data

-def dtype_to_name(dtype_mapping, dtype):
-    """Map data type and its string representation."""
-    return list(dtype_mapping.keys())[list(dtype_mapping.values()).index(dtype)]

+def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value):  # pragma: no cover
+    """Dequantize tensor with scale and zero point."""
+    return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value
+
+def dequantize_data(tensor_value, scale_value, zo_value, axis=0):  # pragma: no cover
+    """Dequantize tensor."""
+    if not isinstance(scale_value, np.ndarray):
+        return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value)
+    else:
+        channel_count = tensor_value.shape[axis]  # TBD, default from axis 0
+        new_per_channel_tensor_values = []
+        for i in range(channel_count):
+            per_channel_tensor_value = tensor_value.take(i, axis)
+            per_channel_scale_value = scale_value.take(i)
+            per_channel_zero_value = zo_value.take(i)
+            new_per_channel_tensor_values.append(
+                dequantize_data_with_scale_zero(
+                    per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value
+                )
+            )
+        # combine per_channel_data into one
+        reshape_dims = list(tensor_value.shape)  # deep copy
+        reshape_dims[axis] = 1  # only one per channel for reshape
+        new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims)
+        for i in range(1, channel_count):
+            new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims)
+            new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis)
+        return new_tensor_value
+
+def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False):
+    """Calculate scale and zero point."""
+    qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    dtype = _qType_to_np_type(qType)
+    if isinstance(rmax, np.ndarray):
+        if sym:
+            max_range = np.maximum(abs(rmin), abs(rmax))
+            rmin = -max_range
+            rmax = max_range
+        scale = (rmax - rmin) / (qmax - qmin)
+        scale[scale < np.finfo(rmax.dtype).tiny] = 1
+        zero_point = (
+            np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype)
+            if sym
+            else np.round(qmin - rmin / scale).astype(dtype)
+        )
+    else:
+        if sym:
+            max_range = max(abs(rmin), abs(rmax))
+            scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1
+        else:
+            scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1
+        zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype)
+    return np.float32(scale), zero_point
+
+def quantize_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None):
+    """Quantize data.
+
+    To pack weights, we compute a linear transformation
+        - when data type is uint8, from [rmin, rmax] -> [0, 2^b - 1] and
+        - when data type is int8, from [-m, m] -> [-(2^{b-1} - 1), 2^{b-1} - 1] where
+            m = max(abs(rmin), abs(rmax))
+    and add necessary intermediate nodes to transform quantized weight to full weight
+    using the equation r = S(q - z), where
+        r: real original value
+        q: quantized value
+        S: scale
+        z: zero point
+
+    Args:
+        data (array): data to quantize
+        qType (int or str): data type to quantize to, e.g. onnx.TensorProto.UINT8/INT8 or a string type such as "uint4"
+        sym (bool): whether to use symmetric quantization.
+        reduce_range (bool): whether to use the reduced 7-bit range. Defaults to False
+        ratio (float, optional): percentile of clip. Defaults to 1.0
+        axis (int, optional): process data along a specific axis. Default is None (process the whole data)
+    """
+    quantize_range = get_qmin_qmax_for_qType(qType, reduce_range, sym)
+    rmin = np.min(np.min(data), 0) if axis is None else np.min(data, axis=1, keepdims=True)
+    rmax = np.max(np.max(data), 0) if axis is None else np.max(data, axis=1, keepdims=True)
+    rmin *= ratio
+    rmax *= ratio
+
+    scale, zero_point = calculate_scale_zp(rmin, rmax, qType, sym, reduce_range)
+    dtype = _qType_to_np_type(qType)
+    quantized_data = quantize_nparray(dtype, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1])
+    return rmin, rmax, zero_point, scale, quantized_data
+
+def qdq_data(data, qType, sym, reduce_range=False, ratio=1.0, axis=None):
+    _, _, zero_point, scale, quantized_data = quantize_data(data, qType, sym, reduce_range, ratio, axis)
+    return scale * (quantized_data - zero_point)
+
+def is_B_transposed(node):
+    """Whether input B is transposed."""
+    transB = [attr for attr in node.attribute if attr.name == "transB"]
+    if len(transB):
+        return 0 < onnx.helper.get_attribute_value(transB[0])
+    return False
+
+def is_quantizable_type(data_type):
+    return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16]

 def _get_blob_size(group_size, has_zp):  # pragma: no cover
@@ -201,7 +336,7 @@ def make_matmul_weight_only_node(
         scale = np.reshape(scale, (-1, k_blocks))
         scale_tensor = onnx.helper.make_tensor(
             name=node.input[1] + "_scale",
-            data_type=dtype_mapping[str(scale.dtype)],
+            data_type=onnx.helper.np_dtype_to_tensor_dtype(scale.dtype),
             dims=scale.shape,
             vals=scale.tobytes(),
             raw=True,
@@ -348,147 +483,51 @@ def pad_tensor(weight, group_size, k_blocks):
     return weight

-def quant_tensor(
-    data: np.array,
-    num_bits: int = 4,
-    group_size: int = 32,
-    sym: bool = False,
-    dtype: str = "int",
-    ratio: float = 1.0,
-):
-    """Quantize tensor per group.
-
-    Args:
-        data (np.array): input weight
-        num_bits (int, optional): number of bits used to represent weights. Defaults to 4.
-        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
-        sym (bool, optional): _quantization scheme. Defaults to False.
-        dtype (str, optional): data type. Defaults to "int".
-        ratio (float, optional): percentile of clip. Defaults to 1.0.
- - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if not sym or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif sym: - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if sym: - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - scale = np.ones(rmax.shape) - scale[max_range > 0] = np.array( - [float(i) / (maxq - minq) for i in (max_range[max_range > 0] * 2.0).flatten().tolist()] - ) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - return np.clip((data / scale + zero_point).round(), minq, maxq), scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - sym (bool, optional): quantization scheme. Defaults to False. - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. 
- - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, sym, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False +def dump_woq_stats(model, quantize_config): + res = {} + dtype_set = set() + for node in model.graph.node: + if node.name.split("_Q")[0] not in quantize_config: + continue + if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: + optype = "MatMul" + else: + optype = node.op_type -def calculate_scale_zp(rmin, rmax, quantize_range, qType, sym): - """Calculate scale and zero point.""" - qmin, qmax = quantize_range - dtype = onnx.helper.tensor_dtype_to_np_dtype(qType) - if isinstance(rmax, np.ndarray): - if sym: - max_range = np.maximum(abs(rmin), abs(rmax)) - rmin = -max_range - rmax = max_range - scale = (rmax - rmin) / (qmax - qmin) - scale[scale < np.finfo(rmax.dtype).tiny] = 1 - zero_point = ( - np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) - if sym - else np.round(qmin - rmin / scale).astype(dtype) - ) - else: - if sym: - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1 + if optype not in res: + res[optype] = {} + if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): + search_out = re.search("_Q\d*", node.input[1]) + dtype = "A32W{}G{}".format( + node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] + ) else: - scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1 - zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype) - return np.float32(scale), zero_point + dtype = "FP32" + dtype_set.add(dtype) + if dtype in res[optype]: + res[optype][dtype] += 1 + else: + res[optype][dtype] = 1 -def quantize_data(data, quantize_range, qType, sym): - """Quantize data. + dtype_list = list(dtype_set) + for dtype in dtype_list: + for optype in res.keys(): + if dtype not in res[optype]: + res[optype][dtype] = 0 - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point + # update stats format for dump. + field_names = ["Op Type", "Total"] + field_names.extend(dtype_list) + output_data = [] + for op_type in res.keys(): + field_results = [op_type, sum(res[op_type].values())] + field_results.extend([res[op_type][dtype] for dtype in dtype_list]) + output_data.append(field_results) - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - sym (bool): whether use sym quantization. 
- """ - rmin = np.min(np.min(data), 0) - rmax = np.max(np.max(data), 0) + utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() - scale, zero_point = calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) - quantized_data = quantize_nparray(qType, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) - return rmin, rmax, zero_point, scale, quantized_data def get_node_original_name(node) -> str: @@ -502,16 +541,10 @@ def get_node_original_name(node) -> str: return node_name -class QuantType(enum.Enum): # pragma: no cover - """Represent QuantType value.""" - - QInt8 = 0 - QUInt8 = 1 - - def split_shared_bias(model): """Split shared tensor.""" - for input_name, node_list in model.input_name_to_nodes.items(): + input_name_to_nodes = model.input_name_to_nodes() + for input_name, node_list in input_name_to_nodes.items(): if len(node_list) > 1 and input_name in [i.name for i in model.model.graph.initializer]: for node in node_list[1:]: if node.op_type not in ["Conv", "FusedConv"]: @@ -541,68 +574,6 @@ def remove_init_from_model_input(model): inputs.remove(name_to_input[initializer.name]) -def quantize_data_per_channel(data, axis, quantize_range, qType, sym): - """Quantize tensor per-channel.""" - rmin = None - rmax = None - for i in range(len(data.shape)): - if i != axis: - rmin = np.min(data, axis=i, keepdims=True) if rmin is None else np.min(rmin, axis=i, keepdims=True) - rmax = np.max(data, axis=i, keepdims=True) if rmax is None else np.max(rmax, axis=i, keepdims=True) - rmin = np.minimum(rmin, 0) - rmax = np.maximum(rmax, 0) - scale, zero_point = calculate_scale_zp(rmin, rmax, quantize_range, qType, sym) - quantized_data = quantize_nparray(qType, data, scale, zero_point, low=quantize_range[0], high=quantize_range[1]) - return rmin.reshape(-1, 1), rmax.reshape(-1, 1), zero_point.reshape(-1, 1), scale.reshape(-1, 1), quantized_data - - -def dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value): # pragma: no cover - """Dequantize tensor with scale and zero point.""" - return (tensor_value.astype(scale_value.dtype) - zo_value.astype(scale_value.dtype)) * scale_value - - -def dequantize_data(tensor_value, scale_value, zo_value, axis=0): # pragma: no cover - """Dequantize tensor.""" - if not isinstance(scale_value, np.ndarray): - return dequantize_data_with_scale_zero(tensor_value, scale_value, zo_value) - else: - channel_count = tensor_value.shape[axis] # TBD, default from axis 0 - new_per_channel_tensor_values = [] - for i in range(channel_count): - per_channel_tensor_value = tensor_value.take(i, axis) - per_channel_scale_value = scale_value.take(i) - per_channel_zero_value = zo_value.take(i) - new_per_channel_tensor_values.append( - dequantize_data_with_scale_zero( - per_channel_tensor_value, per_channel_scale_value, per_channel_zero_value - ) - ) - # combine per_channel_data into one - reshape_dims = list(tensor_value.shape) # deep copy - reshape_dims[axis] = 1 # only one per channel for reshape - new_tensor_value = new_per_channel_tensor_values[0].reshape(reshape_dims) - for i in range(1, channel_count): - new_per_channel_tensor_value = new_per_channel_tensor_values[i].reshape(reshape_dims) - new_tensor_value = np.concatenate((new_tensor_value, new_per_channel_tensor_value), axis) - return new_tensor_value - - -class ValueInfo: # pragma: no cover - """Represents a casted tensor info.""" - - def __init__(self, tensor_name, dtype, new_dtype): - """Initialization. 
- - Args: - tensor_name (string): tensor name - dtype (int): original data type - new_dtype (int): target data type - """ - self.tensor_name = tensor_name - self.dtype = dtype - self.new_dtype = new_dtype - - class QuantizedValue: """Represents a linearly quantized value (input/output/initializer).""" @@ -612,9 +583,8 @@ def __init__( new_quantized_name, scale_name, zero_point_name, - quantized_value_type, axis=None, - qType=QuantType.QUInt8, + qType=1, ): """Initialization. @@ -623,15 +593,13 @@ def __init__( new_quantized_name (string): quantized tensor name scale_name (string): scale name zero_point_name (string): zero point name - quantized_value_type (QuantizedValueType): quantized value type axis (int, optional): quantized axis. Defaults to None. - qType (int, optional): quantized data type. Defaults to QuantType.QUInt8. + qType (int, optional): quantized data type. Defaults to 1 (uint8). """ self.name = name self.q_name = new_quantized_name self.scale_name = scale_name self.zp_name = zero_point_name - self.value_type = quantized_value_type self.axis = axis self.qType = qType @@ -650,7 +618,7 @@ def __init__( data=[], quantized_data=[], axis=None, - qType=QuantType.QUInt8, + qType=1, ): """Initialization. @@ -664,7 +632,7 @@ def __init__( data (list, optional): array version of the initializer. Defaults to []. quantized_data (list, optional): quantized data. Defaults to []. axis (int, optional): quantized axis. Defaults to None. - qType (int, optional): quantized data type. Defaults to QuantType.QUInt8. + qType (int, optional): quantized data type. Defaults to 1 (uint8). """ self.name = name self.initializer = initializer # TensorProto initializer in ONNX graph @@ -681,93 +649,6 @@ def __init__( self.qType = qType -class QuantizedValueType(enum.Enum): # pragma: no cover - """Represent QuantizedValueType value.""" - - Input = 0 - Initializer = 1 - - -def quantize_nparray(qtype, arr, scale, zero_point, low=None, high=None): - """Quantize numpy array.""" - dtype = onnx.helper.tensor_dtype_to_np_dtype(qtype) - arr_fp32 = np.asarray((np.asarray(arr).astype(np.float32) / scale).round() + zero_point) - if low is not None and high is not None: - np.clip(arr_fp32, low, high, out=arr_fp32) - return arr_fp32.astype(dtype) - - -def attribute_to_kwarg(attribute): - """Convert attribute to kwarg format for use with onnx.helper.make_node.""" - attribute_mapping = { - 1: attribute.f, - 2: attribute.i, - 3: attribute.s, - 4: attribute.t, - 5: attribute.g, - 6: attribute.floats, - 7: attribute.ints, - 8: attribute.strings, - 9: attribute.tensors, - 10: attribute.graphs, - } - if attribute.type in attribute_mapping: - value = attribute_mapping[attribute.type] - else: # pragma: no cover - raise ValueError( - "attribute {} has no type specified " "or unsupported type {}.".format(attribute.name, attribute.type) - ) - return {attribute.name: value} - - -def trt_env_setup(model): - """Set environment variable for Tensorrt Execution Provider.""" - is_int8 = False - for node in model.graph.node: - if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: - is_int8 = True - break - if is_int8: - os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" - else: - os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" - - -def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0, base_dir=""): - """Symbolic shape inference.""" - - class SymbolicShapeInference(symbolic_shape_infer.SymbolicShapeInference): - def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix="", 
base_dir=""): - super().__init__(int_max, auto_merge, guess_output_rank, verbose, prefix) - self.base_dir = base_dir - - def _get_value(self, node, idx): - name = node.input[idx] - assert name in self.sympy_data_ or name in self.initializers_ - return ( - self.sympy_data_[name] - if name in self.sympy_data_ - else onnx.numpy_helper.to_array(self.initializers_[name], base_dir=self.base_dir) - ) - - onnx_opset = symbolic_shape_infer.get_opset(in_mp) - if (not onnx_opset) or onnx_opset < 7: - logger.warning("Only support models of onnx opset 7 and above.") - return None - symbolic_shape_inference = SymbolicShapeInference( - int_max, auto_merge, guess_output_rank, verbose, base_dir=base_dir - ) - all_shapes_inferred = False - symbolic_shape_inference._preprocess(in_mp) - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl() - symbolic_shape_inference._update_output_from_vi() - if not all_shapes_inferred: - onnx.save_model(symbolic_shape_inference.out_mp_, "sym_shape_infer_temp.onnx", save_as_external_data=True) - raise Exception("Incomplete symbolic shape inference") - return symbolic_shape_inference.out_mp_ - - def dump_model_op_stats(model, quantize_config, fp32_op_list): qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"] res = {} @@ -809,49 +690,3 @@ def dump_model_op_stats(model, quantize_config, fp32_op_list): ] utility.Statistics(output_data, header="Quantization Statistics", field_names=field_names).print_stat() - - -def dump_woq_stats(model, quantize_config, fp32_op_list): - res = {} - for optype in fp32_op_list: - res[optype] = {} - - dtype_set = set() - for node in model.graph.node: - if node.op_type in ["MatMulFpQ4", "MatMulNBits"]: - optype = "MatMul" - else: - optype = node.op_type - - if optype not in res: - continue - if re.fullmatch("^.*_Q\d*G\d*", node.input[1]): - search_out = re.search("_Q\d*", node.input[1]) - dtype = "A32W{}G{}".format( - node.input[1][search_out.start() + 2 : search_out.end()], node.input[1][search_out.end() + 1 :] - ) - else: - dtype = "FP32" - dtype_set.add(dtype) - - if dtype in res[optype]: - res[optype][dtype] += 1 - else: - res[optype][dtype] = 1 - - dtype_list = list(dtype_set) - for dtype in dtype_list: - for optype in res.keys(): - if dtype not in res[optype]: - res[optype][dtype] = 0 - - # update stats format for dump. 
- field_names = ["Op Type", "Total"] - field_names.extend(dtype_list) - output_data = [] - for op_type in res.keys(): - field_results = [op_type, sum(res[op_type].values())] - field_results.extend([res[op_type][dtype] for dtype in dtype_list]) - output_data.append(field_results) - - utility.Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat() diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index b2db33dcb..9e07b45a6 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -24,7 +24,7 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model +from onnx_neural_compressor import constants, data_reader, logger, onnx_model from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.weight_only import rtn @@ -39,7 +39,7 @@ def _get_weight_scale(weight, group_size): return scale -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, sym): +def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts): """Apply scale for salient weight.""" best_scales = {} new_init_tensors = [] @@ -48,6 +48,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes = [] base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + input_name_to_nodes = model.input_name_to_nodes() for parent, nodes in absorb_pairs.items(): if any([node.input[0] not in output_dicts for node in nodes]): logger.warning( @@ -61,13 +62,17 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, dtype = None weight = [] org_out = [] + + num_bits = weight_config[nodes[0].name].get("weight_bits", 4) + group_size = weight_config[nodes[0].name].get("weight_group_size", 32) + sym = weight_config[nodes[0].name].get("weight_sym", True) + accuracy_level = weight_config[nodes[0].name].get("accuracy_level", 0) + + # use same params for all children of one parent for node in nodes: - if node.name in weight_config and weight_config.get(node.name, "fp32") != "fp32": - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) - break + weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) + weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) + weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) # search scale best_error = float("inf") @@ -79,9 +84,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ratio = ratio * 1 / n_grid loss = 0 for node in nodes: - if weight_config.get((node.name, node.op_type), {}) == "fp32": - continue - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) if len(weight.shape) != 2: continue @@ -103,9 +105,17 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "uint") + q_weight = 
quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ).reshape(weight.shape) else: - q_weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int") + q_weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ).reshape(weight.shape) q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) out = np.matmul(inp, q_weight) @@ -118,10 +128,6 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, best_scale = scales for node in nodes: - weight_config.setdefault(node.name, {}).update({"weight_bits": num_bits}) - weight_config.setdefault(node.name, {}).update({"weight_group_size": group_size}) - weight_config.setdefault(node.name, {}).update({"weight_sym": sym}) - init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) @@ -131,7 +137,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, new_tensor = onnx.helper.make_tensor( name=node.input[1] + "_scaled", - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=tensor.shape, vals=tensor.tobytes(), raw=True, @@ -147,7 +153,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, continue if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] + input_name_to_nodes[nodes[0].input[0]] ) == len(nodes): for idx in [1, 2]: tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) @@ -160,7 +166,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, elif ( parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) + and len(input_name_to_nodes[nodes[0].input[0]]) == len(nodes) ): # pragma: no cover for inp in parent.input: if model.get_initializer(inp) is not None: @@ -171,7 +177,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, updated_nodes.append(parent.name) output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( + elif parent.op_type in ["Conv", "FusedConv"] and len(input_name_to_nodes[nodes[0].input[0]]) == len( nodes ): # pragma: no cover tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) @@ -185,7 +191,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, # insert mul scale_tensor = onnx.helper.make_tensor( name=parent.output[0] + "_weight_only_scale", - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=best_scale.shape, vals=(1.0 / best_scale).flatten().tolist(), ) @@ -211,7 +217,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, return model, output_dicts -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, sym): +def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts): """Apply clip for weight by checking mse.""" base_dir = os.path.dirname(model.model_path) 
if model.model_path is not None else "" ratios = {} @@ -227,11 +233,10 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) for node in nodes: - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) org_w_shape = org_weight.shape # ic, oc @@ -254,9 +259,19 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "uint", ratio) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) else: - weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int", ratio) + weight = quant_utils.qdq_data( + weight.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratio, + ).reshape(org_weight.shape) cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) @@ -272,12 +287,8 @@ def awq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, enable_auto_scale: bool = True, enable_mse_search: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Quant the model with Activation-aware Weight quantization(AWQ) method. @@ -297,16 +308,10 @@ def awq_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. Defaults to True. enable_mse_search (bool, optional): whether to search for the best clip range from range [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. 
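Editor's aside: to illustrate the slimmer awq_quantize signature above, here is a short sketch of the per-node settings that now live in weight_config instead of the removed num_bits/group_size/sym/accuracy_level arguments. The node name is a made-up placeholder, and the commented call only indicates the remaining keyword arguments; it is not taken verbatim from this patch.

# Placeholder node name; real keys are the model's MatMul node names.
weight_config = {
    "/decoder/layer.0/MatMul": {
        "weight_bits": 4,
        "weight_group_size": 32,
        "weight_sym": True,
        "accuracy_level": 0,
    },
}

# Same .get(...) fallbacks the AWQ/RTN/GPTQ code paths use per node:
cfg = weight_config.get("/decoder/layer.0/MatMul", {})
num_bits = cfg.get("weight_bits", 4)
group_size = cfg.get("weight_group_size", 32)
sym = cfg.get("weight_sym", True)

# q_model = awq_quantize("model.onnx", data_reader=reader, weight_config=weight_config,
#                        enable_auto_scale=True, enable_mse_search=True)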
Returns: @@ -352,11 +357,13 @@ def awq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + output_name_to_node = model.output_name_to_node() + input_name_to_nodes = model.input_name_to_nodes() for input_name in output_names: - parent = model.output_name_to_node()[input_name] + parent = output_name_to_node[input_name] dump_pairs = {parent.name: []} - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" @@ -381,9 +388,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - sym, ) if enable_mse_search: ratios = _apply_awq_clip( @@ -391,9 +395,6 @@ def awq_quantize( weight_config, dump_pairs, output_dicts, - num_bits, - group_size, - sym, ) del output_dicts del dump_pairs @@ -401,7 +402,7 @@ def awq_quantize( model.remove_tensors_from_outputs(output_names) model.model.graph.output.MergeFrom(org_output) - model = rtn.rtn_quantize(model, weight_config, num_bits, group_size, sym, full_ratio, accuracy_level, providers) + model = rtn.rtn_quantize(model, weight_config, full_ratio, providers) return model @@ -409,6 +410,9 @@ def apply_awq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + enable_auto_scale: bool = True, + enable_mse_search: bool = True, + providers: List[str] = ["CPUExecutionProvider"], ) -> onnx.ModelProto: """Apply Activation-aware Weight quantization(AWQ) on onnx model. @@ -421,12 +425,11 @@ def apply_awq_on_model( onnx.ModelProto: quantized onnx model. """ # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in config.AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + kwargs = { + "enable_auto_scale": enable_auto_scale, + "enable_mse_search": enable_mse_search, + "providers": providers, + } + q_model = awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) + quant_utils.dump_woq_stats(q_model, quant_config) + return q_model diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index c95c346f8..ae3813280 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -24,9 +24,10 @@ import onnxruntime as ort from packaging.version import Version -from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility +from onnx_neural_compressor import constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core +from onnx_neural_compressor.quantization import config from typing import List, Union # isort: skip @@ -178,15 +179,11 @@ def gptq_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], data_reader: data_reader.CalibrationDataReader, weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, percdamp: float = 0.01, block_size: int = 128, actorder: 
bool = False, mse: bool = False, perchannel: bool = True, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -206,9 +203,6 @@ def gptq_quantize( 'weight_sym': True, 'accuracy_level': 0 }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability. Defaults to 0.01. block_size (int, optional): execute GPTQ quantization per block. Defaults to 128. @@ -216,9 +210,6 @@ def gptq_quantize( quantization order. Defaults to False. mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. Default to True @@ -262,12 +253,14 @@ def gptq_quantize( else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) ) + input_name_to_nodes = model.input_name_to_nodes() + for idx, input_name in enumerate(output_names): utility.simple_progress_bar(len(output_names), idx + 1) node_list = [] weights = [] - for node in model.input_name_to_nodes()[input_name]: + for node in input_name_to_nodes[input_name]: # check op_type of node is MatMul # check dim 1 of input is weight tensor # check weight_type is not "fp32" @@ -304,11 +297,10 @@ def gptq_quantize( weight, H, ) in zip(node_list, weights, Hs): - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) group_size = group_size if group_size != -1 else weight.shape[0] dtype = weight.dtype @@ -341,7 +333,12 @@ def gptq_quantize( org_shape = weight.shape k_blocks = (org_shape[0] + group_size - 1) // group_size q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = quant_utils.quant_tensor(q_weight.T, num_bits, group_size, sym, "uint") + _, _, zp, scale, q_weight = quant_utils.quantize_data( + q_weight.T, + "uint" + str(num_bits), + sym, + axis=1, + ) q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, weight_shape=org_shape, @@ -360,7 +357,7 @@ def gptq_quantize( else: q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=q_weight.shape, vals=q_weight.astype(dtype).tobytes(), raw=True, @@ -390,6 
+387,13 @@ def apply_gptq_on_model( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict, calibration_data_reader: data_reader.CalibrationDataReader, + percdamp: float = 0.01, + block_size: int = 128, + actorder: bool = False, + mse: bool = False, + perchannel: bool = True, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply GPTQ on onnx model. @@ -401,18 +405,17 @@ def apply_gptq_on_model( Returns: onnx.ModelProto: quantized onnx model. """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - if layer_wise: + quant_kwargs = { + "percdamp": percdamp, + "block_size": block_size, + "actorder": actorder, + "mse": mse, + "perchannel": perchannel, + "providers": providers, + } + + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=gptq_quantize, @@ -427,4 +430,5 @@ def apply_gptq_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 6856f378d..8837ad7ae 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -23,7 +23,7 @@ import onnxruntime as ort from packaging import version -from onnx_neural_compressor import config, constants, onnx_model, utility +from onnx_neural_compressor import constants, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.layer_wise import core @@ -33,11 +33,7 @@ def rtn_quantize( model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - sym: bool = False, ratios: dict = {}, - accuracy_level: int = 0, providers: List[str] = ["CPUExecutionProvider"], return_modelproto: bool = True, ): @@ -57,14 +53,7 @@ def rtn_quantize( 'accuracy_level': 0 } }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - sym (bool, optional): indicates whether weights are symmetric. Defaults to False. ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. 
Default to True @@ -97,11 +86,10 @@ def rtn_quantize( continue dtype = weight.dtype - if node.name in weight_config: - num_bits = weight_config[node.name].get("weight_bits", 4) - group_size = weight_config[node.name].get("weight_group_size", 32) - sym = weight_config[node.name].get("weight_sym", True) - accuracy_level = weight_config[node.name].get("accuracy_level", 0) + num_bits = weight_config[node.name].get("weight_bits", 4) + group_size = weight_config[node.name].get("weight_group_size", 32) + sym = weight_config[node.name].get("weight_sym", True) + accuracy_level = weight_config[node.name].get("accuracy_level", 0) org_w_shape = weight.shape # ic, oc group_size = group_size if group_size != -1 else org_w_shape[0] @@ -123,8 +111,12 @@ def rtn_quantize( ): # pragma: no cover # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - q_weight, scale, zp = quant_utils.quant_tensor( - weight.T, num_bits, group_size, sym, "uint", ratios.get(node.input[1], 1) + _, _, zp, scale, q_weight = quant_utils.quantize_data( + weight.T.reshape((-1, group_size)), + "uint" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1, ) q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node( node=node, @@ -142,15 +134,18 @@ def rtn_quantize( remove_nodes.append(node) new_nodes.append(q_matmul_node) else: - q_weight = quant_utils.qdq_tensor( - weight.T, num_bits, group_size, sym, "int", ratios.get(node.input[1], 1) - ) + q_weight = quant_utils.qdq_data( + weight.T.reshape((-1, group_size)), + "int" + str(num_bits), + sym, + ratio=ratios.get(node.input[1], 1), + axis=1) q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) q_weight = np.transpose(q_weight) q_weight = q_weight[: org_w_shape[0], :].astype(dtype) q_weight_tensor = onnx.helper.make_tensor( name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=quant_utils.dtype_mapping[str(dtype)], + data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype), dims=weight.shape, vals=q_weight.tobytes(), raw=True, @@ -175,7 +170,11 @@ def rtn_quantize( def apply_rtn_on_model( - model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], quant_config: dict + model: Union[onnx.ModelProto, onnx_model.ONNXModel, pathlib.Path, str], + quant_config: dict, + ratios: dict = {}, + providers: List[str] = ["CPUExecutionProvider"], + layer_wise_quant: bool = False, ) -> onnx.ModelProto: """Apply RTN on onnx model. @@ -186,19 +185,12 @@ def apply_rtn_on_model( Returns: onnx.ModelProto: quantized onnx model. 
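Editor's aside: a self-contained numpy sketch of the per-group round-to-nearest quantize/dequantize that the qdq_data-style calls above perform on the non-MatMulNBits path. The helper below is illustrative only; it does not claim to match quant_utils' exact signature, padding, or rounding details.

import numpy as np

def fake_quant_per_group(weight, num_bits=4, group_size=32, sym=True):
    """Quantize then dequantize a 2-D weight (ic, oc) per group of input channels."""
    ic, oc = weight.shape
    pad = (-ic) % group_size                         # pad rows so ic is a multiple of group_size
    w = np.pad(weight, ((0, pad), (0, 0)))
    w = w.T.reshape(-1, group_size)                  # (oc * n_groups, group_size)
    if sym:
        maxq = 2 ** (num_bits - 1) - 1
        scale = np.maximum(np.abs(w).max(axis=1, keepdims=True), 1e-9) / maxq
        q = np.clip(np.round(w / scale), -maxq - 1, maxq)
        dq = q * scale
    else:
        qmax = 2 ** num_bits - 1
        wmin = w.min(axis=1, keepdims=True)
        wmax = w.max(axis=1, keepdims=True)
        scale = np.maximum(wmax - wmin, 1e-9) / qmax
        zp = np.round(-wmin / scale)
        q = np.clip(np.round(w / scale) + zp, 0, qmax)
        dq = (q - zp) * scale
    return dq.reshape(oc, -1).T[:ic, :]

w = np.random.randn(64, 16).astype(np.float32)
w_dq = fake_quant_per_group(w, num_bits=4, group_size=32, sym=True)
print(np.abs(w - w_dq).max())   # error stays within roughly half a quantization step per group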
""" - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in config.RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, config.RTNConfig): - quant_config[op_name_type] = op_config.to_dict() + quant_kwargs = { + "ratios": ratios, + "providers": providers, + } - if layer_wise: + if layer_wise_quant: quantized_model = core.layer_wise_quant( model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs ) @@ -207,4 +199,5 @@ def apply_rtn_on_model( if isinstance(quantized_model, onnx_model.ONNXModel): quantized_model = quantized_model.model + quant_utils.dump_woq_stats(quantized_model, quant_config) return quantized_model diff --git a/onnx_neural_compressor/data_reader.py b/onnx_neural_compressor/data_reader.py index eacacd52a..7f76769f0 100644 --- a/onnx_neural_compressor/data_reader.py +++ b/onnx_neural_compressor/data_reader.py @@ -14,15 +14,25 @@ import abc -from onnxruntime import quantization +class CalibrationDataReader(metaclass=abc.ABCMeta): + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented -class CalibrationDataReader(quantization.CalibrationDataReader): - """Get data for calibration. + @abc.abstractmethod + def get_next(self) -> dict: + """generate the input data dict for ONNXinferenceSession run""" + raise NotImplementedError + + def __iter__(self): + return self - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ + def __next__(self): + result = self.get_next() + if result is None: + raise StopIteration + return result @abc.abstractmethod def rewind(self): diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index ed3df7f6a..20fcb95e3 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -21,12 +21,11 @@ import onnx import transformers -from onnxruntime.quantization import onnx_model from onnx_neural_compressor import constants, logger, utility -class ONNXModel(onnx_model.ONNXModel): +class ONNXModel: """Build ONNX model.""" def __init__(self, model, **kwargs): @@ -36,7 +35,6 @@ def __init__(self, model, **kwargs): model (str or ModelProto): path to onnx model or loaded ModelProto model object. 
""" self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) self._model_path = None if not isinstance(model, str) else model self.check_is_large_model() @@ -51,12 +49,57 @@ def __init__(self, model, **kwargs): if isinstance(model, str) and os.path.exists(pathlib.Path(model).parent.joinpath("config.json").as_posix()): self._config = transformers.PretrainedConfig.from_pretrained(pathlib.Path(model).parent.as_posix()) self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_output_name_to_node(self.model.graph.node) + self._get_input_name_to_nodes(self.model.graph.node) self._graph_info = {} self._get_graph_info() self._q_config = None + def output_name_to_node(self): + self._output_name_to_node = {} + self._get_output_name_to_node(self.model.graph.node) + return self._output_name_to_node + + def input_name_to_nodes(self): + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self.model.graph.node) + return self._input_name_to_nodes + + def _get_input_name_to_nodes(self, nodes): + """Get input names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_input_name_to_nodes(attr.g.node) + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in self._input_name_to_nodes: + self._input_name_to_nodes[input_name] = [node] + else: + self._input_name_to_nodes[input_name].append(node) + + def _get_output_name_to_node(self, nodes): + """Get output names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_output_name_to_node(attr.g.node) + for output_name in node.output: + if len(output_name.strip()) != 0: + self._output_name_to_node[output_name] = node + @property def model_path(self): """Return model path.""" @@ -99,6 +142,11 @@ def framework(self): """Return framework.""" return "onnxruntime" + def add_initializer(self, tensor): + """Add a initializer to model.""" + if tensor.name not in [i.name for i in self._model.graph.initializer]: + self._model.graph.initializer.append(tensor) + def add_initializers(self, tensors): """Add initializers to model.""" for tensor in tensors: @@ -127,6 +175,42 @@ def output(self): """Return output of model.""" return [i.name for i in self.model.graph.output] + @property + def model(self): + """Return model itself.""" + return self._model + + @model.setter + def model(self, model): + """Set model itself.""" + self._model = model + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + def nodes(self): + """Return model nodes.""" + return self._model.graph.node + + def initializer(self): + """Return model initializer.""" + return self._model.graph.initializer + + def graph(self): + """Return model graph.""" + return self._model.graph + + def ir_version(self): + """Return model ir_version.""" + return self._model.ir_version + + def opset_import(self): + """Return model opset_import.""" 
+ return self._model.opset_import + def update(self): """Update model info.""" self._graph_info = {} @@ -144,6 +228,10 @@ def _get_graph_info(self): for node in self.model.graph.node: self.graph_info.update({node.name: node.op_type}) + def is_graph_output(self, name): + """Check whether the tensor is the graph output.""" + return name in self.output() + def save(self, root): """Save ONNX model.""" if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): @@ -168,6 +256,53 @@ def save(self, root): output_config_file = pathlib.Path(root).parent.joinpath("config.json").as_posix() self._config.to_json_file(output_config_file, use_diff=False) + def remove_initializer(self, tensor): + """Remove an initializer from model.""" + if tensor in self._model.graph.initializer: + self._model.graph.initializer.remove(tensor) + + def remove_initializers(self, init_to_remove): + """Remove initializers from model.""" + for initializer in init_to_remove: + self.remove_initializer(initializer) + + def get_initializer(self, name): + """"Find the initializer with specified name.""" + for initializer in self.model.graph.initializer: + if initializer.name == name: + return initializer + return None + + def remove_node(self, node): + """Remove a node from model.""" + if node in self._model.graph.node: + self._model.graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + """Remove nodes from model.""" + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node): + """Add a node to model.""" + self._model.graph.node.extend([node]) + + def add_nodes(self, nodes_to_add): + """Add nodes to model.""" + self._model.graph.node.extend(nodes_to_add) + + def get_children(self, node, input_name_to_nodes=None): + """Get children nodes.""" + if input_name_to_nodes is None: + input_name_to_nodes = self._input_name_to_nodes + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for child in input_name_to_nodes[output]: + children.append(child) + return children + def get_initializer_share_num(self, name): """Get the number of shares of initializer.""" num = 0 @@ -186,6 +321,25 @@ def get_node(self, name): return node return None + def get_parent(self, node, idx, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + if len(node.input) <= idx: + return None + + input = node.input[idx] + return output_name_to_node.get(input, None) + + def get_parents(self, node, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + def get_node_by_weight(self, weight_name): """Get a node by its weight name.""" if len(self._input_name_to_nodes) == 0: @@ -277,6 +431,22 @@ def _searcher(tensor_name): assert zo_tensor, "missing zero point for tensor {}".format(tensor) return scale_tensor, zo_tensor + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + """Replace input of a node.""" + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + """Replace output of a node.""" + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in 
range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): """Replace inputs of all nodes.""" if len(white_optype) > 0: @@ -331,7 +501,7 @@ def remove_unused_nodes(self): unvalid_nodes = [ i for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) + if all(out not in self._input_name_to_nodes and out not in self.output() for out in i.output) ] while len(unvalid_nodes) > 0: self.remove_nodes(unvalid_nodes) @@ -339,12 +509,12 @@ def remove_unused_nodes(self): unvalid_nodes = [ i for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) + if all([out not in self._input_name_to_nodes and out not in self.output() for out in i.output]) ] ununsed_weights = [] for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: + if w.name not in self._input_name_to_nodes and w.name not in self.output(): ununsed_weights.append(w) # Remove from graph.input for graph_input in self.graph().input: diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 1dcd5e428..67e82f0fc 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -12,8 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. - -from onnxruntime.quantization import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType - +from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 1e42810e4..14284be66 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -18,13 +18,11 @@ import onnx import onnxruntime as ort -from onnxruntime import quantization - -from onnx_neural_compressor import config, constants, data_reader, logger, utility -from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor import constants, data_reader, logger, utility from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn +from onnx_neural_compressor.quantization import config ###################### RTN Algo Entry ################################## @@ -40,8 +38,9 @@ def rtn_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping - model = rtn.apply_rtn_on_model(model, config_mapping) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.RTNConfig.model_params_list} + model = rtn.apply_rtn_on_model(model, config_mapping, **quant_kwargs) return model @@ -67,11 +66,12 @@ def gptq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in 
config.GPTQConfig.model_params_list} # regenerate to ensure data exists calibration_data_reader.rewind() - model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + model = gptq.apply_gptq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) return model @@ -97,11 +97,12 @@ def awq_quantize_entry( logger.debug(config_mapping) else: config_mapping = quant_config.config_mapping + quant_kwargs = {} + quant_kwargs = {key: getattr(quant_config, key) for key in config.AWQConfig.model_params_list} # regenerate to ensure data exists calibration_data_reader.rewind() - model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader) - quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) + model = awq.apply_awq_on_model(model, config_mapping, calibration_data_reader, **quant_kwargs) return model @@ -154,7 +155,6 @@ def static_quantize_entry( _quantizer.quantize_model() if model_output is not None: _quantizer.model.save(model_output) - quant_utils.dump_model_op_stats(_quantizer.model.model, config_mapping, quant_config.op_types_to_quantize) return _quantizer.model.model @@ -239,5 +239,4 @@ def dynamic_quantize_entry( _quantizer.quantize_model() if model_output is not None: _quantizer.model.save(model_output) - quant_utils.dump_model_op_stats(_quantizer.model.model, config_mapping, quant_config.op_types_to_quantize) return _quantizer.model.model diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/quantization/config.py similarity index 81% rename from onnx_neural_compressor/config.py rename to onnx_neural_compressor/quantization/config.py index 59d0ceb65..fb9cd220a 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/quantization/config.py @@ -28,10 +28,10 @@ import numpy as np import onnx import pydantic -from onnxruntime import quantization from typing_extensions import Self -from onnx_neural_compressor import constants, data_reader, logger, utility +from onnxruntime import quantization as ort_quant +from onnx_neural_compressor import constants, data_reader, logger, quantization, utility from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -114,7 +114,7 @@ def is_tunable(self, value: Any) -> bool: return False def __str__(self) -> str: - return self.name + return "TuningParam(name={}, tunable_type={}, options={}).".format(self.name, str(self.tunable_type), str(self.options)) # Config registry to store all registered configs. @@ -421,22 +421,45 @@ def build_tuning_param(config: BaseConfig, param: str): def expand(self) -> List[BaseConfig]: """Expand the config. 
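Editor's aside: to make the expansion order described below concrete, a small standalone sketch using plain itertools (not the BaseConfig machinery) that reproduces the documented behavior: model-level params expand first, and within each group the first-listed parameter varies fastest, which is what the [::-1] reversals in expand() achieve.

import itertools

model_params = {"A": [1, 2], "B": [3, 4]}   # expanded first
op_params = {"C": [5, 6], "D": [7, 8]}      # expanded within each model-level combination

def combos(params):
    names = list(params)
    # Reverse before product so the first-listed param changes fastest.
    for values in itertools.product(*[params[n] for n in reversed(names)]):
        yield dict(zip(reversed(names), values))

expanded = [{**m, **o} for m in combos(model_params) for o in combos(op_params)]
print(len(expanded))   # 16 combinations: 4 model-level x 4 op-level
# First two: (A=1, B=3, C=5, D=7) then (A=1, B=3, C=6, D=7)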
- case 1 - {"model_params": { "reduce_range": [True, False]}} -> - {"model_params": { "reduce_range": True}}, {"model_params": { "reduce_range": False}} - - case 2: iterate op_params first (for this case, Add op only supports per_tensor) - {"model_params": { "reduce_range": [True, False]}, "op_params": {"per_channel": [True, False]}} -> - {"model_params": { "reduce_range": True}, "op_params": {"per_channel": True}} - {"model_params": { "reduce_range": True}, "op_params": {"per_channel": False}} - {"model_params": { "reduce_range": False}, "op_params": {"per_channel": True}} - {"model_params": { "reduce_range": False}, "op_params": {"per_channel": False}} - - {"model_params": { "reduce_range": [True, False]}, "op_params": {"Conv": {"per_channel": [True, False], , "Add": {"per_channel": [True, False]}}}} -> - {"model_params": { "reduce_range": True}, "op_params": {"Conv": {"per_channel": True}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": True}, "op_params": {"Conv": {"per_channel": False}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": False}, "op_params": {"Conv": {"per_channel": True}, "Add": {"per_channel": False}}}, - {"model_params": { "reduce_range": False}, "op_params": {"Conv": {"per_channel": False}, "Add": {"per_channel": False}}}, + Expand rule is: + 1. Expand model_params_list first, then expand params_list + 2. Expand model_params_list/params_list following the order of param order in model_params_list/params_list + + model_params_list=[A, B] params_list=[C,D] + A=[1,2], B=[3,4] C=[5,6], D=[7,8] + + Expanded results: + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 1 ---- + (A=1, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 2 ---- + (A=2, B=3) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 3 ---- + (A=1, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) + + -------- Combination 1 (C=5, D=7) + / + / -------- Combination 2 (C=6, D=7) + Combination 4 ---- + (A=2, B=4) \ -------- Combination 3 (C=5, D=8) + \ + -------- Combination 4 (C=6, D=8) """ config = self # set model level params @@ -455,9 +478,9 @@ def expand(self) -> List[BaseConfig]: model_level_config_lst = [config] else: tuning_param_name_lst = [tuning_param.name for tuning_param in tuning_param_list] - for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list]): + for params_values in itertools.product(*[tuning_param.options for tuning_param in tuning_param_list[::-1]]): new_config = copy.deepcopy(self) - for param_name, param_value in zip(tuning_param_name_lst, params_values): + for param_name, param_value in zip(tuning_param_name_lst[::-1], params_values): setattr(new_config, param_name, param_value) logger.debug(new_config.to_dict()) model_level_config_lst.append(new_config) @@ -471,7 +494,7 @@ def expand(self) -> List[BaseConfig]: tuning_param = self.build_tuning_param(config, param) param_val = getattr(config, tuning_param.name) if param_val is not None: - if tuning_param.is_tunable(param_val): + if tuning_param.is_tunable(param_val) and len(param_val) > 0: tuning_param.options = param_val op_tuning_param_list.append(tuning_param) @@ -480,9 +503,9 @@ def expand(self) -> List[BaseConfig]: else: 
tuning_param_name_lst = [tuning_param.name for tuning_param in op_tuning_param_list] tuning_param_val_lst = list( - itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list]) + itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list[::-1]]) ) - tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val[::-1])) for val in tuning_param_val_lst] + tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val)) for val in tuning_param_val_lst] for model_level_config in model_level_config_lst: for tuning_param_pair in tuning_param_pair_lst: @@ -514,11 +537,8 @@ def to_config_mapping( if config_list is None: config_list = [self] for config in config_list: - global_config = config.global_config op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: - self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_name_config_dict[op_type] for op_name_pattern in op_name_config_dict: @@ -628,22 +648,17 @@ def register_supported_configs(): @dataclasses.dataclass class OperatorConfig: - - def __init__( - self, - weight_type, - activation_type, - per_channel, - weight_sym, - activation_sym, - calibrate_method=quantization.CalibrationMethod.MinMax, - ): - self.weight_type = getattr(weight_type, "tensor_type", weight_type) - self.activation_type = getattr(activation_type, "tensor_type", activation_type) - self.per_channel = per_channel - self.weight_sym = weight_sym - self.activation_sym = activation_sym - self.calibrate_method = calibrate_method + weight_type: quantization.QuantType + activation_type: quantization.QuantType + per_channel: bool + weight_sym: bool + activation_sym: bool + calibrate_method: quantization.CalibrationMethod=quantization.CalibrationMethod.MinMax + + def __post_init__(self): + self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type) + self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type) + self.calibrate_method = getattr(self.calibrate_method, "value", self.calibrate_method) def __getitem__(self, key): return getattr(self, key) @@ -765,6 +780,19 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + + def _post_init(self): + if self.white_list == constants.RTN_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -793,21 +821,23 @@ def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: li self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = 
op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -926,6 +956,18 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -957,21 +999,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -1077,6 +1121,18 @@ def __init__( self.quant_last_matmul = quant_last_matmul self._post_init() + def _post_init(self): + if self.white_list == constants.GPTQ_OP_LIST: + global_config = self.get_init_args() + self._global_config = self.__class__(**global_config, white_list=None) + elif isinstance(self.white_list, list) and len(self.white_list) > 0: + for op_name_or_type in self.white_list: + global_config = self.get_init_args() + tmp_config = self.__class__(**global_config, white_list=None) + self.set_local(op_name_or_type, tmp_config) + elif self.white_list == constants.EMPTY_WHITE_LIST: + return + def get_model_params_dict(self): result = dict() for param in self.model_params_list: @@ -1107,21 +1163,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - 
self._config_mapping.update(config.get_model_params_dict()) # update node level setting + last_matmul = None global_config = config.get_params_dict() op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - if self.global_config is not None: + if op_type == "MatMul": + last_matmul = op_name + if global_config is not None: self._config_mapping[op_name] = global_config if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] - if not self.quant_last_matmul: - self._config_mapping[model_info[-1][0]] = { - "weight": {"dtype": "fp32"}, - "activation": {"dtype": "fp32", "quant_mode": "fp32"}, - } + if op_name in self._config_mapping and hasattr(self._config_mapping[op_name], "to_dict"): + self._config_mapping[op_name] = self._config_mapping[op_name].to_dict() + if not self.quant_last_matmul and last_matmul is not None and last_matmul in self._config_mapping: + del self._config_mapping[last_matmul] return self._config_mapping @staticmethod @@ -1205,8 +1263,242 @@ def __init__( self.SmoothQuantScalesPerOp = SmoothQuantScalesPerOp +def static_basic_check(config, optype, execution_provider, quant_format): + if getattr(quant_format, "value", quant_format) == 0: + if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + elif getattr(quant_format, "value", quant_format) == 1: + if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) + ) + ) + supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + else: + raise ValueError( + "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) + ) + return config + + +def static_cpu_check(config, optype, execution_provider, quant_format): + if execution_provider != "CPUExecutionProvider": + return config + + # only support per-tensor + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.UINT8) + return config + + +def static_cuda_check(config, optype, execution_provider, quant_format): + if execution_provider != "CUDAExecutionProvider": + return config + + # only support per-tensor + if optype in [ + 
"EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: + setattr(config, "per_channel", False) + + if optype in ["Attention"]: + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "weight_type", onnx.TensorProto.INT8) + return config + + +def static_dml_check(config, optype, execution_provider, quant_format): + if execution_provider != "DmlExecutionProvider": + return config + + # only support per-tensor + if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: + setattr(config, "per_channel", False) + return config + + +def static_dnnl_check(config, optype, execution_provider, quant_format): + if execution_provider != "DnnlExecutionProvider": + return config + + # current configurations are same as CPU EP + return static_cpu_check(config, optype, execution_provider, quant_format) + + +def static_trt_check(config, optype, execution_provider, quant_format): + if execution_provider != "TensorrtExecutionProvider": + return config + + # only support S8S8 + if optype in ["Conv", "MatMul", "Gather", "Gemm"]: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + setattr(config, "per_channel", [False, True]) + else: + setattr(config, "weight_type", onnx.TensorProto.INT8) + setattr(config, "weight_sym", True) + setattr(config, "activation_type", onnx.TensorProto.INT8) + setattr(config, "activation_sym", True) + return config + + +STATIC_CHECK_FUNC_LIST = [ + static_basic_check, + static_cpu_check, + static_cuda_check, + static_dml_check, + static_dnnl_check, + static_trt_check, +] + + +def dynamic_basic_check(config, optype, execution_provider, quant_format=None): + if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) + ) + ) + + supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] + if optype not in supported_optype: + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) + return config + + +def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CPUExecutionProvider": + return config + # TODO: add constraints for other EP + if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: + setattr(config, "per_channel", False) + return config + + +def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "CUDAExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_dml_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "DmlExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): + if execution_provider != 
"DnnlExecutionProvider": + return config + # current configurations are same as CPU EP + return dynamic_cpu_check(config, optype, execution_provider, quant_format) + + +def dynamic_trt_check(config, optype, execution_provider, quant_format=None): + if execution_provider != "TensorrtExecutionProvider": + return config + + # don't support dynamic quantization + return None + + +DYNAMIC_CHECK_FUNC_LIST = [ + dynamic_basic_check, + dynamic_cpu_check, + dynamic_cuda_check, + dynamic_dml_check, + dynamic_dnnl_check, + dynamic_trt_check, +] + @register_config(algo_name=constants.STATIC_QUANT, priority=constants.PRIORITY_STATIC_QUANT) -class StaticQuantConfig(BaseConfig, quantization.StaticQuantConfig): +class StaticQuantConfig(BaseConfig, ort_quant.StaticQuantConfig): supported_configs: List[_OperatorConfig] = [] params_list: List[str] = [ @@ -1284,7 +1576,7 @@ def __init__( logger.warning( "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." ) - quantization.StaticQuantConfig.__init__( + ort_quant.StaticQuantConfig.__init__( self, calibration_data_reader=calibration_data_reader, calibrate_method=calibrate_method, @@ -1347,7 +1639,7 @@ def _post_init(self): params = self.get_params_dict() op_config = OperatorConfig(**params) - for valid_func in utility.STATIC_CHECK_FUNC_LIST: + for valid_func in STATIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider, self.quant_format) self.set_local(op_name_or_type, op_config) if isinstance(self.white_list, list) and len(self.white_list) > 0: @@ -1368,6 +1660,8 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name if ( isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 @@ -1388,8 +1682,6 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] - if op_type == "MatMul": - last_matmul = op_name for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] @@ -1468,7 +1760,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["GatherND", "GatherElements", "Gather"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1486,7 +1778,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["EmbedLayerNormalization"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1504,7 +1796,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["Conv", "MatMul", "Gemm", "FusedConv"], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1553,7 +1845,7 @@ def register_supported_configs(cls) -> None: "Mul", "ArgMax", ], - valid_func_list=utility.STATIC_CHECK_FUNC_LIST, + valid_func_list=STATIC_CHECK_FUNC_LIST, ) ) cls.supported_configs = supported_configs @@ -1687,7 +1979,7 @@ def get_default_sq_config() -> SmoothQuantConfig: @register_config(algo_name=constants.DYNAMIC_QUANT, 
priority=constants.PRIORITY_DYNAMIC_QUANT) -class DynamicQuantConfig(BaseConfig, quantization.DynamicQuantConfig): +class DynamicQuantConfig(BaseConfig, ort_quant.DynamicQuantConfig): """This is a class for dynamic Quant Configuration. Inherit from DynamicQuantConfig: @@ -1732,7 +2024,7 @@ def __init__( logger.warning( "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." ) - quantization.DynamicQuantConfig.__init__( + ort_quant.DynamicQuantConfig.__init__( self, weight_type=weight_type, op_types_to_quantize=op_types_to_quantize, @@ -1776,7 +2068,7 @@ def _post_init(self): for op_name_or_type in self.op_types_to_quantize: params = self.get_params_dict() op_config = OperatorConfig(**params) - for valid_func in utility.DYNAMIC_CHECK_FUNC_LIST: + for valid_func in DYNAMIC_CHECK_FUNC_LIST: op_config = valid_func(op_config, op_name_or_type, self.execution_provider) self.set_local(op_name_or_type, op_config) if isinstance(self.white_list, list) and len(self.white_list) > 0: @@ -1793,10 +2085,11 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - self._config_mapping.update(config.get_model_params_dict()) # update node level setting - global_config = config.global_config op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: + if op_type == "MatMul": + last_matmul = op_name if ( isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 @@ -1817,8 +2110,6 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] - if op_type == "MatMul": - last_matmul = op_name for op_name_pattern in op_name_config_dict: if re.match(op_name_pattern, op_name): self._config_mapping[op_name] = op_name_config_dict[op_name_pattern] @@ -1888,7 +2179,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["FusedConv", "Conv", "EmbedLayerNormalization"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1901,7 +2192,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["MatMul"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) supported_configs.append( @@ -1914,7 +2205,7 @@ def register_supported_configs(cls) -> None: activation_sym=False, ), operators=["Gather", "Attention", "LSTM"], - valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, + valid_func_list=DYNAMIC_CHECK_FUNC_LIST, ) ) cls.supported_configs = supported_configs diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index 62a671fba..87051221b 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,7 +15,6 @@ from typing import List, Union # isort: skip import onnx -from onnxruntime.quantization import matmul_4bits_quantizer from onnx_neural_compressor.quantization import matmul_nbits_quantizer @@ -33,7 +32,7 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude=None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, providers: List[str] = 
["CPUExecutionProvider"], ): super().__init__( diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index b41c56270..80cf892c5 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -19,20 +19,36 @@ import onnx import onnxruntime as ort -from onnxruntime.quantization import matmul_4bits_quantizer -from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import config -class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + + Args: + algorithm: + weight only quantize algorithm name. + """ + self.algorithm = algorithm + + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__(self, ratios=None, layer_wise_quant=False): - super().__init__(ratios=ratios) + super().__init__( + algorithm="RTN", + ) + if ratios is None: + ratios = {} + self.ratios = ratios self.layer_wise_quant = layer_wise_quant -class GPTQWeightOnlyQuantConfig(matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig): +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -45,17 +61,17 @@ def __init__( layer_wise_quant=False, ): super().__init__( - calibration_data_reader=calibration_data_reader, - percdamp=percdamp, - block_size=block_size, - actorder=actorder, - mse=mse, - perchannel=perchannel, + algorithm="GPTQ", ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.block_size = block_size + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel self.layer_wise_quant = layer_wise_quant - -class AWQWeightOnlyQuantConfig(matmul_4bits_quantizer.WeightOnlyQuantConfig): +class AWQWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, @@ -85,7 +101,7 @@ def __init__( is_symmetric: bool = False, accuracy_level: int = 0, nodes_to_exclude: List[str] = None, - algo_config: matmul_4bits_quantizer.WeightOnlyQuantConfig = None, + algo_config: WeightOnlyQuantConfig = None, n_bits: int = 4, providers: List[str] = ["CPUExecutionProvider"], optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, diff --git a/onnx_neural_compressor/quantization/quant_utils.py b/onnx_neural_compressor/quantization/quant_utils.py new file mode 100644 index 000000000..ca6612f80 --- /dev/null +++ b/onnx_neural_compressor/quantization/quant_utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023 MIT HAN Lab +# This source code is licensed under the MIT license +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import enum +import onnx + +class QuantType(enum.Enum): # pragma: no cover + """Represent QuantType value.""" + + QInt8 = 0 + QUInt8 = 1 + + @property + def tensor_type(self): + if self == QuantType.QInt8: + return onnx.TensorProto.INT8 + if self == QuantType.QUInt8: + return onnx.TensorProto.UINT8 + raise ValueError(f"Unexpected value qtype={self!r}.") + +class QuantFormat(enum.Enum): + QOperator = 0 + QDQ = 1 + +class CalibrationMethod(enum.Enum): + MinMax = 0 + Entropy = 1 + Percentile = 2 + Distribution = 3 diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index c90e16d38..d245145c2 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -20,7 +20,7 @@ import onnxruntime as ort from onnxruntime.quantization.quantize import QuantConfig -from onnx_neural_compressor import config +from onnx_neural_compressor.quantization import config from onnx_neural_compressor.quantization import algorithm_entry as algos @@ -28,7 +28,7 @@ def quantize( model_input: Union[str, pathlib.Path, onnx.ModelProto], model_output: Union[str, pathlib.Path], - quant_config: QuantConfig, + quant_config: config.BaseConfig, optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, ): with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index a5caa4c35..100d8c3b3 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -24,7 +24,8 @@ import onnxruntime as ort from onnx import external_data_helper -from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor import data_reader, logger, utility +from onnx_neural_compressor.quantization import config from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index f1cf126d2..f92b8707a 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -25,7 +25,6 @@ import onnxruntime as ort import prettytable as pt import psutil -from onnxruntime import quantization from onnx_neural_compressor import constants, logger @@ -314,236 +313,15 @@ def auto_detect_ep(): return "CPUExecutionProvider" -def static_basic_check(config, optype, execution_provider, quant_format): - if quant_format == quantization.QuantFormat.QOperator: - if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) - ) - ) - supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) - ) - elif quant_format == quantization.QuantFormat.QDQ: - if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) - ) - ) - supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support 
{}.".format(optype, execution_provider, supported_optype) - ) +def trt_env_setup(model): + """Set environment variable for Tensorrt Execution Provider.""" + is_int8 = False + for node in model.graph.node: + if node.op_type in ["QuantizeLinear", "DequantizeLinear"]: + is_int8 = True + break + if is_int8: + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1" else: - raise ValueError( - "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) - ) - return config - - -def static_cpu_check(config, optype, execution_provider, quant_format): - if execution_provider != "CPUExecutionProvider": - return config - - # only support per-tensor - if optype in [ - "EmbedLayerNormalization", - "Relu", - "Clip", - "LeakyRelu", - "Sigmoid", - "MaxPool", - "GlobalAveragePool", - "Pad", - "Split", - "Squeeze", - "Reshape", - "Concat", - "AveragePool", - "Tile", - "Unsqueeze", - "Transpose", - "Resize", - "Abs", - "Shrink", - "Sign", - "Attention", - "Flatten", - "Expand", - "Slice", - "Mod", - "ReduceMax", - "ReduceMin", - "CenterCropPad", - "Add", - "Mul", - "ArgMax", - ]: - setattr(config, "per_channel", False) - - if optype in ["Attention"]: - setattr(config, "activation_type", onnx.TensorProto.UINT8) - return config - - -def static_cuda_check(config, optype, execution_provider, quant_format): - if execution_provider != "CUDAExecutionProvider": - return config - - # only support per-tensor - if optype in [ - "EmbedLayerNormalization", - "Relu", - "Clip", - "LeakyRelu", - "Sigmoid", - "MaxPool", - "GlobalAveragePool", - "Pad", - "Split", - "Squeeze", - "Reshape", - "Concat", - "AveragePool", - "Tile", - "Unsqueeze", - "Transpose", - "Resize", - "Abs", - "Shrink", - "Sign", - "Attention", - "Flatten", - "Expand", - "Slice", - "Mod", - "ReduceMax", - "ReduceMin", - "CenterCropPad", - "Add", - "Mul", - "ArgMax", - ]: - setattr(config, "per_channel", False) - - if optype in ["Attention"]: - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "weight_type", onnx.TensorProto.INT8) - return config - - -def static_dml_check(config, optype, execution_provider, quant_format): - if execution_provider != "DmlExecutionProvider": - return config - - # only support per-tensor - if optype in ["Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add"]: - setattr(config, "per_channel", False) - return config - - -def static_dnnl_check(config, optype, execution_provider, quant_format): - if execution_provider != "DnnlExecutionProvider": - return config - - # current configurations are same as CPU EP - return static_cpu_check(config, optype, execution_provider, quant_format) - - -def static_trt_check(config, optype, execution_provider, quant_format): - if execution_provider != "TensorrtExecutionProvider": - return config - - # only support S8S8 - if optype in ["Conv", "MatMul", "Gather", "Gemm"]: - setattr(config, "weight_type", onnx.TensorProto.INT8) - setattr(config, "weight_sym", True) - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "activation_sym", True) - setattr(config, "per_channel", [False, True]) - else: - setattr(config, "weight_type", onnx.TensorProto.INT8) - setattr(config, "weight_sym", True) - setattr(config, "activation_type", onnx.TensorProto.INT8) - setattr(config, "activation_sym", True) - return config - - -STATIC_CHECK_FUNC_LIST = [ - static_basic_check, - static_cpu_check, - static_cuda_check, - static_dml_check, - static_dnnl_check, - static_trt_check, -] - - -def dynamic_basic_check(config, optype, 
execution_provider, quant_format=None): - if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: - raise ValueError( - "Unsupported execution_provider {}, only support {}.".format( - execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) - ) - ) - - supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] - if optype not in supported_optype: - raise ValueError( - "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) - ) - return config - - -def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "CPUExecutionProvider": - return config - # TODO: add constraints for other EP - if optype in ["FusedConv", "Conv", "EmbedLayerNormalization", "Gather", "Attention", "LSTM"]: - setattr(config, "per_channel", False) - return config - - -def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "CUDAExecutionProvider": - return config - # current configurations are same as CPU EP - return dynamic_cpu_check(config, optype, execution_provider, quant_format) - - -def dynamic_dml_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "DmlExecutionProvider": - return config - - # don't support dynamic quantization - return None - - -def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "DnnlExecutionProvider": - return config - # current configurations are same as CPU EP - return dynamic_cpu_check(config, optype, execution_provider, quant_format) - - -def dynamic_trt_check(config, optype, execution_provider, quant_format=None): - if execution_provider != "TensorrtExecutionProvider": - return config - - # don't support dynamic quantization - return None - + os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" -DYNAMIC_CHECK_FUNC_LIST = [ - dynamic_basic_check, - dynamic_cpu_check, - dynamic_cuda_check, - dynamic_dml_check, - dynamic_dnnl_check, - dynamic_trt_check, -] diff --git a/requirements.txt b/requirements.txt index 216af6eff..7e4911f78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ py-cpuinfo pydantic transformers prettytable -scipy \ No newline at end of file +scipy diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index af0bca3e4..7e14d83d7 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -10,9 +10,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/post_training_quant/test_post_training_quant.py b/test/quantization/post_training_quant/test_post_training_quant.py new file mode 100644 index 000000000..b6de12bf2 --- /dev/null +++ b/test/quantization/post_training_quant/test_post_training_quant.py @@ -0,0 +1,204 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import glob +import os +import shutil +import unittest +from unittest import mock + +import numpy as np +import onnx +import onnxruntime as ort +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config + +from typing import Callable, Dict, List, Optional, Union # isort: skip + + +def fake_eval(model, eval_result_lst): + acc = eval_result_lst.pop(0) + return acc + + +class DataReader(data_reader.CalibrationDataReader): + + def __init__(self, model): + model = onnx.load(model) + batch_size = 1 + sequence_length = 1 + self.data = { + "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), + "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), + } + for inp in model.graph.input: + if inp.name in self.data: + continue + if inp.name == "position_ids": + # model is exported with optimum >= 1.14.0 with new input 'position_ids' + self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") + + self.enum_data = None + + def get_next(self): + if self.enum_data is None: + self.enum_data = iter([self.data]) + return next(self.enum_data, None) + + def rewind(self): + self.enum_data = None + + +def _count_op_num(model, optype): + num = 0 + for node in model.graph.node: + if node.op_type == optype: + num += 1 + return num + + +class TestStaticQuant(unittest.TestCase): + + @classmethod + def setUpClass(self): + main_export( + "hf-internal-testing/tiny-random-gptj", + output="model", + ) + self.model = glob.glob(os.path.join("./model", "*.onnx"))[0] + self.data_reader = DataReader(self.model) + + @classmethod + def tearDownClass(self): + shutil.rmtree("./model", ignore_errors=True) + os.remove("quant.onnx") + + def test_static_quant(self): + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=True, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + qmatmul_num_enable_last = _count_op_num(q_model, "QLinearMatMul") + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + node_num_basic = len(q_model.graph.node) + qmatmul_num_disable_last = _count_op_num(q_model, "QLinearMatMul") + + # check quant_last_matmul work + self.assertEqual(qmatmul_num_enable_last, qmatmul_num_disable_last + 1) + + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": 
False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + q_model = onnx.load("quant.onnx") + node_num_extended = len(q_model.graph.node) + + + # check graph optimization work + self.assertGreater(node_num_basic, node_num_extended) + + + # check op_types_to_quantize work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearAdd"), 0) + self.assertGreater(_count_op_num(q_model, "QLinearMatMul"), 0) + + # check nodes_to_quantize work + quantizable_matmuls = [i.name.split("_quant")[0] for i in q_model.graph.node if i.op_type == "QLinearMatMul"] + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_quantize=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + op_types_to_quantize=["MatMul", "Gather"], + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), 1) + + # check nodes_to_exclude work + cfg = config.StaticQuantConfig( + calibration_data_reader=self.data_reader, + weight_type=quantization.QuantType.QUInt8, + nodes_to_exclude=[quantizable_matmuls[0]], + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + q_model = onnx.load("quant.onnx") + self.assertEqual(_count_op_num(q_model, "QLinearMatMul"), qmatmul_num_disable_last - 1) + + + def test_dynamic_quant(self): + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QInt8, + per_channel=True, + quant_last_matmul=False, + extra_options={"WeightSymmetric": True, "ActivationSymmetric": False}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg) + + cfg = config.DynamicQuantConfig( + weight_type=quantization.QuantType.QUInt8, + per_channel=False, + quant_last_matmul=False, + extra_options={"WeightSymmetric": False, "ActivationSymmetric": True}, + execution_provider="CPUExecutionProvider", + ) + quantization.quantize(self.model, "quant.onnx", cfg, ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED) + + + +if __name__ == "__main__": + unittest.main() diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 051b6cd73..dd6ddf0db 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -24,8 +24,8 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, quantization -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import data_reader, quantization +from onnx_neural_compressor.quantization import config, tuning from typing import Callable, Dict, List, Optional, Union # isort: 
skip @@ -418,6 +418,7 @@ def test_static_default_auto_tune(self): def test_static_custom_auto_tune(self): partial_fake_eval = functools.partial(fake_eval, eval_result_lst=[1.0, 0.8, 0.99]) + custom_tune_config = tuning.TuningConfig( config_set=config.StaticQuantConfig( per_channel=[True, False], diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index ec9411b45..cf38c55de 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -7,9 +7,9 @@ import onnx from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger, quantization, utility +from onnx_neural_compressor import logger, quantization, utility from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor.quantization import config, tuning def find_onnx_file(folder_path): @@ -179,12 +179,12 @@ def test_static_quant_config(self): elif idx in [1, 5]: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) if idx < 4: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) else: self.assertFalse("add" in configs_mapping) if idx in [0, 1]: self.assertEqual( - configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + configs_mapping["Matmul"]["calibrate_method"], 0 ) self.assertLess(idx, 16) @@ -218,10 +218,10 @@ def test_static_quant_config(self): if "Matmul" in configs_mapping: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) self.assertEqual( - configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + configs_mapping["Matmul"]["calibrate_method"], 0 ) if "add" in configs_mapping: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertLess(idx, 16) for execution_provider in ["TensorrtExecutionProvider"]: @@ -240,8 +240,8 @@ def test_static_quant_config(self): elif idx in [1, 5]: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) if "add" in configs_mapping: - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertTrue(configs_mapping["add"]["weight_sym"]) self.assertTrue(configs_mapping["add"]["activation_sym"]) if "Matmul" in configs_mapping: @@ -265,7 +265,7 @@ def test_static_custom_quant_config(self): self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertLess(idx, 2) @@ -299,7 +299,7 @@ def test_static_custom_quant_config(self): model_info = quant_config.get_model_info(model=self.simple_onnx_model) configs_mapping = quant_config.to_config_mapping(model_info=model_info) self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) 
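# --- Editorial note, not part of the patch: the literal 0 that replaces
# quantization.CalibrationMethod.MinMax in the assertions above and below appears to
# correspond to the MinMax member of the CalibrationMethod enum added in
# onnx_neural_compressor/quantization/quant_utils.py earlier in this patch.
# A small self-contained illustration of that mapping:
import enum

class CalibrationMethod(enum.Enum):  # mirrors the new quant_utils definition
    MinMax = 0
    Entropy = 1
    Percentile = 2
    Distribution = 3

assert CalibrationMethod.MinMax.value == 0  # the value these tests now assert against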
self.assertLess(idx, 4) for execution_provider in ["TensorrtExecutionProvider"]: @@ -318,7 +318,7 @@ def test_static_custom_quant_config(self): self.assertTrue(configs_mapping["Matmul"]["per_channel"]) elif idx == 1: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual(configs_mapping["add"]["calibrate_method"], 0) self.assertTrue(configs_mapping["add"]["weight_sym"]) self.assertTrue(configs_mapping["add"]["activation_sym"]) self.assertTrue(configs_mapping["Matmul"]["weight_sym"]) diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 52f4bd8b3..73b5fa84b 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -22,10 +22,9 @@ import onnxruntime as ort from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader -from onnx_neural_compressor.quantization import QuantType +from onnx_neural_compressor import data_reader +from onnx_neural_compressor.quantization import config, quantize, QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import quantize class DataReader(data_reader.CalibrationDataReader): diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 2d918cc61..e1c23d495 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -8,9 +8,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index 133e11fd1..1e674b7dd 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -8,9 +8,9 @@ import transformers from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor import data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py index 86b3c49a3..aa3672d0c 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -6,9 +6,9 @@ from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger +from onnx_neural_compressor import logger from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer +from onnx_neural_compressor.quantization import config, matmul_4bits_quantizer, matmul_nbits_quantizer def find_onnx_file(folder_path): diff --git 
a/test/utils/test_general.py b/test/utils/test_general.py index 32cb80087..47e863561 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -2,8 +2,8 @@ import unittest -from onnx_neural_compressor import config, constants, logger -from onnx_neural_compressor.quantization import tuning +from onnx_neural_compressor import constants, logger +from onnx_neural_compressor.quantization import config, tuning from typing import Any, Callable, List, Optional, Tuple, Union # isort: skip @@ -217,6 +217,94 @@ def test_mixed_two_algos(self): self.assertIn(OP1_NAME, config_mapping) self.assertIn(OP2_NAME, config_mapping) + def test_config_expand(self) -> None: + cfg = config.RTNConfig(weight_bits=[4,8], weight_sym=[True, False], layer_wise_quant=[True, False], providers=[["CPU"], ["CUDA"]]) + expand_cfgs = cfg.expand() + self.assertEqual(expand_cfgs[0].weight_bits, 4) + self.assertEqual(expand_cfgs[0].weight_sym, True) + self.assertEqual(expand_cfgs[0].layer_wise_quant, True) + self.assertEqual(expand_cfgs[0].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[1].weight_bits, 8) + self.assertEqual(expand_cfgs[1].weight_sym, True) + self.assertEqual(expand_cfgs[1].layer_wise_quant, True) + self.assertEqual(expand_cfgs[1].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[2].weight_bits, 4) + self.assertEqual(expand_cfgs[2].weight_sym, False) + self.assertEqual(expand_cfgs[2].layer_wise_quant, True) + self.assertEqual(expand_cfgs[2].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[3].weight_bits, 8) + self.assertEqual(expand_cfgs[3].weight_sym, False) + self.assertEqual(expand_cfgs[3].layer_wise_quant, True) + self.assertEqual(expand_cfgs[3].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[4].weight_bits, 4) + self.assertEqual(expand_cfgs[4].weight_sym, True) + self.assertEqual(expand_cfgs[4].layer_wise_quant, True) + self.assertEqual(expand_cfgs[4].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[5].weight_bits, 8) + self.assertEqual(expand_cfgs[5].weight_sym, True) + self.assertEqual(expand_cfgs[5].layer_wise_quant, True) + self.assertEqual(expand_cfgs[5].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[6].weight_bits, 4) + self.assertEqual(expand_cfgs[6].weight_sym, False) + self.assertEqual(expand_cfgs[6].layer_wise_quant, True) + self.assertEqual(expand_cfgs[6].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[7].weight_bits, 8) + self.assertEqual(expand_cfgs[7].weight_sym, False) + self.assertEqual(expand_cfgs[7].layer_wise_quant, True) + self.assertEqual(expand_cfgs[7].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[8].weight_bits, 4) + self.assertEqual(expand_cfgs[8].weight_sym, True) + self.assertEqual(expand_cfgs[8].layer_wise_quant, False) + self.assertEqual(expand_cfgs[8].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[9].weight_bits, 8) + self.assertEqual(expand_cfgs[9].weight_sym, True) + self.assertEqual(expand_cfgs[9].layer_wise_quant, False) + self.assertEqual(expand_cfgs[9].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[10].weight_bits, 4) + self.assertEqual(expand_cfgs[10].weight_sym, False) + self.assertEqual(expand_cfgs[10].layer_wise_quant, False) + self.assertEqual(expand_cfgs[10].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[11].weight_bits, 8) + self.assertEqual(expand_cfgs[11].weight_sym, False) + self.assertEqual(expand_cfgs[11].layer_wise_quant, False) + self.assertEqual(expand_cfgs[11].providers, ["CPU"]) + + self.assertEqual(expand_cfgs[12].weight_bits, 4) + self.assertEqual(expand_cfgs[12].weight_sym, 
True) + self.assertEqual(expand_cfgs[12].layer_wise_quant, False) + self.assertEqual(expand_cfgs[12].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[13].weight_bits, 8) + self.assertEqual(expand_cfgs[13].weight_sym, True) + self.assertEqual(expand_cfgs[13].layer_wise_quant, False) + self.assertEqual(expand_cfgs[13].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[14].weight_bits, 4) + self.assertEqual(expand_cfgs[14].weight_sym, False) + self.assertEqual(expand_cfgs[14].layer_wise_quant, False) + self.assertEqual(expand_cfgs[14].providers, ["CUDA"]) + + self.assertEqual(expand_cfgs[15].weight_bits, 8) + self.assertEqual(expand_cfgs[15].weight_sym, False) + self.assertEqual(expand_cfgs[15].layer_wise_quant, False) + self.assertEqual(expand_cfgs[15].providers, ["CUDA"]) + + + def test_config_expand_with_empty_options(self): + configs = FakeAlgoConfig(weight_dtype=["int", "float32"], weight_bits=[]) + configs_list = configs.expand() + self.assertEqual(len(configs_list), 2) class TestConfigSet(unittest.TestCase): @@ -259,5 +347,6 @@ def test_config_loader_skip_verified_config(self) -> None: self.assertEqual(config_count, 2) + if __name__ == "__main__": unittest.main() diff --git a/test/utils/test_param.py b/test/utils/test_param.py index fd8b7d3d3..5c04ccffc 100644 --- a/test/utils/test_param.py +++ b/test/utils/test_param.py @@ -3,7 +3,7 @@ import unittest from typing import List -from onnx_neural_compressor import config +from onnx_neural_compressor.quantization import config class TestTuningParam(unittest.TestCase): @@ -20,6 +20,7 @@ def test_is_tunable_recursive(self): self.assertTrue(param.is_tunable([[5, 6], [7, 8]])) # TODO: double check if this is the expected behavior self.assertTrue(param.is_tunable([[5, 6], [7, "8"]])) + self.assertEqual(str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None).") if __name__ == "__main__":