From 3b42dcd493c2054529bb0d3845a88bc3ff28a105 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 20 Jun 2024 16:19:26 +0800 Subject: [PATCH] fix CI issue Signed-off-by: Mengni Wang --- .../quantization/ptq_static/README.md | 1 - .../resnet50/quantization/ptq_static/main.py | 116 ++--- .../quantization/ptq_static/run_benchmark.sh | 16 +- .../quantization/ptq_static/run_quant.sh | 20 +- .../bert/quantization/ptq_dynamic/README.md | 1 - .../nlp/bert/quantization/ptq_dynamic/main.py | 258 +++++------ .../quantization/ptq_dynamic/prepare_data.sh | 6 +- .../quantization/ptq_dynamic/prepare_model.py | 53 +-- .../quantization/ptq_dynamic/run_benchmark.sh | 23 +- .../quantization/ptq_dynamic/run_quant.sh | 19 +- .../bert/quantization/ptq_static/README.md | 1 - .../nlp/bert/quantization/ptq_static/main.py | 266 ++++++------ .../quantization/ptq_static/prepare_data.sh | 6 +- .../quantization/ptq_static/prepare_model.py | 54 ++- .../quantization/ptq_static/run_benchmark.sh | 23 +- .../bert/quantization/ptq_static/run_quant.sh | 24 +- .../post_training_quant/__init__.py | 2 +- .../post_training_quant/calibrate.py | 59 ++- .../post_training_quant/calibrator.py | 1 + .../post_training_quant/operators/__init__.py | 3 +- .../operators/activation.py | 10 +- .../post_training_quant/operators/argmax.py | 5 +- .../operators/attention.py | 5 +- .../post_training_quant/operators/base_op.py | 3 +- .../operators/binary_op.py | 10 +- .../post_training_quant/operators/concat.py | 5 +- .../post_training_quant/operators/conv.py | 17 +- .../operators/direct_q8.py | 12 +- .../operators/embed_layernorm.py | 5 +- .../post_training_quant/operators/gather.py | 9 +- .../post_training_quant/operators/gavgpool.py | 5 +- .../post_training_quant/operators/gemm.py | 5 +- .../post_training_quant/operators/lstm.py | 5 +- .../post_training_quant/operators/matmul.py | 21 +- .../post_training_quant/operators/maxpool.py | 5 +- .../post_training_quant/operators/pad.py | 5 +- .../post_training_quant/operators/pooling.py | 5 +- .../post_training_quant/operators/reduce.py | 8 +- .../post_training_quant/operators/resize.py | 5 +- .../post_training_quant/operators/split.py | 5 +- .../post_training_quant/operators/unary_op.py | 5 +- .../post_training_quant/quantizer.py | 68 +-- .../algorithms/smoother/core.py | 1 + onnx_neural_compressor/algorithms/utility.py | 27 +- .../algorithms/weight_only/awq.py | 6 +- .../algorithms/weight_only/gptq.py | 2 +- .../algorithms/weight_only/rtn.py | 2 +- onnx_neural_compressor/config.py | 224 +++++++--- onnx_neural_compressor/constants.py | 269 +++++++++--- .../quantization/__init__.py | 2 +- .../quantization/algorithm_entry.py | 16 +- .../quantization/matmul_nbits_quantizer.py | 11 +- .../quantization/quantize.py | 14 +- onnx_neural_compressor/quantization/tuning.py | 20 +- onnx_neural_compressor/utility.py | 125 +++++- .../layer_wise/test_layer_wise.py | 16 +- .../post_training_quant/test_calibrate.py | 23 +- .../post_training_quant/test_operators.py | 403 +++++++++++++----- .../post_training_quant/test_quant_utils.py | 4 +- test/quantization/test_autotune.py | 8 +- test/quantization/test_config.py | 16 +- test/quantization/test_smooth_quant.py | 23 +- test/utils/test_general.py | 8 +- test/utils/test_utility.py | 20 - 64 files changed, 1468 insertions(+), 947 deletions(-) diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/README.md b/examples/image_recognition/resnet50/quantization/ptq_static/README.md index d88e7bbe1..b8145eff8 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/README.md +++ b/examples/image_recognition/resnet50/quantization/ptq_static/README.md @@ -11,7 +11,6 @@ pip install onnx-neural-compressor pip install -r requirements.txt ``` -> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). ## 2. Prepare Model diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/main.py b/examples/image_recognition/resnet50/quantization/ptq_static/main.py index 232a12912..cc82d49b4 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/main.py +++ b/examples/image_recognition/resnet50/quantization/ptq_static/main.py @@ -16,26 +16,28 @@ # under the License. # pylint:disable=redefined-outer-name,logging-format-interpolation -import logging import argparse +import collections +import logging +import os +import re +import time + import cv2 import numpy as np import onnx -import re -import os -import collections -from PIL import Image import onnxruntime as ort +from PIL import Image from sklearn import metrics -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import config -from onnx_neural_compressor import quantization + +from onnx_neural_compressor import config, data_reader, quantization from onnx_neural_compressor.quantization import tuning logger = logging.getLogger(__name__) -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + def _topk_shape_validate(preds, labels): # preds shape can be Nxclass_num or class_num(N=1 by default) @@ -79,13 +81,14 @@ def _topk_shape_validate(preds, labels): class_num = preds.shape[1] label_N = labels.shape[0] - assert label_N == N, 'labels batch size should same with preds' + assert label_N == N, "labels batch size should same with preds" labels = labels.reshape([N, -1]) # one-hot labels will have 2 dimension not equal 1 if labels.shape[1] != 1: labels = labels.argsort()[..., -1:] return preds, labels + class TopK: def __init__(self, k=1): self.k = k @@ -94,7 +97,7 @@ def __init__(self, k=1): def update(self, preds, labels, sample_weight=None): preds, labels = _topk_shape_validate(preds, labels) - preds = preds.argsort()[..., -self.k:] + preds = preds.argsort()[..., -self.k :] if self.k == 1: correct = metrics.accuracy_score(preds, labels, normalize=False) self.num_correct += correct @@ -103,7 +106,7 @@ def update(self, preds, labels, sample_weight=None): for p, l in zip(preds, labels): # get top-k labels with np.argpartition # p = np.argpartition(p, -self.k)[-self.k:] - l = l.astype('int32') + l = l.astype("int32") if l in p: self.num_correct += 1 @@ -128,7 +131,7 @@ def __init__(self, model_path, dataset_location, image_list, batch_size=1, calib src_lst = [] label_lst = [] num = 0 - with open(image_list, 'r') as f: + with open(image_list, "r") as f: for s in f: image_name, label = re.split(r"\s+", s.strip()) src = os.path.join(dataset_location, image_name) @@ -153,18 +156,18 @@ def __init__(self, model_path, dataset_location, image_list, batch_size=1, calib def _preprpcess(self, src): with Image.open(src) as image: - image = np.array(image.convert('RGB')).astype(np.float32) - image = image / 255. + image = np.array(image.convert("RGB")).astype(np.float32) + image = image / 255.0 image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_LINEAR) h, w = image.shape[0], image.shape[1] y0 = (h - 224) // 2 x0 = (w - 224) // 2 - image = image[y0:y0 + 224, x0:x0 + 224, :] + image = image[y0 : y0 + 224, x0 : x0 + 224, :] image = (image - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225] image = image.transpose((2, 0, 1)) - return image.astype('float32') + return image.astype("float32") def get_next(self): lst = next(self.iter_next, None) @@ -186,54 +189,22 @@ def eval_func(model, dataloader, metric): metric.update(output, labels[idx]) return metric.result() + if __name__ == "__main__": logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") parser = argparse.ArgumentParser( description="Resnet50 fine-tune examples for image classification tasks.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - '--model_path', - type=str, - help="Pre-trained model on onnx file" + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") + parser.add_argument("--dataset_location", type=str, help="Imagenet data path") + parser.add_argument("--label_path", type=str, help="Imagenet label path") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") parser.add_argument( - '--dataset_location', - type=str, - help="Imagenet data path" - ) - parser.add_argument( - '--label_path', - type=str, - help="Imagenet label path" - ) - parser.add_argument( - '--benchmark', - action='store_true', \ - default=False - ) - parser.add_argument( - '--tune', - action='store_true', \ - default=False, - help="whether quantize the model" - ) - parser.add_argument( - '--output_model', - type=str, - help="output model path" - ) - parser.add_argument( - '--mode', - type=str, - help="benchmark mode of performance or accuracy" - ) - parser.add_argument( - '--quant_format', - type=str, - default='QOperator', - choices=['QDQ', 'QOperator'], - help="quantization format" + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" ) parser.add_argument( "--batch_size", @@ -245,25 +216,26 @@ def eval_func(model, dataloader, metric): model = onnx.load(args.model_path) top1 = TopK() dataloader = DataReader(args.model_path, args.dataset_location, args.label_path, args.batch_size) + def eval(onnx_model): dataloader.rewind() return eval_func(onnx_model, dataloader, top1) if args.benchmark: - if args.mode == 'performance': + if args.mode == "performance": total_time = 0.0 num_iter = 100 num_warmup = 10 sess_options = ort.SessionOptions() sess_options.intra_op_num_threads = args.intra_op_num_threads - session = onnxruntime.InferenceSession(model.SerializeToString(), - sess_options, - providers=onnxruntime.get_available_providers()) + session = ort.InferenceSession( + model.SerializeToString(), sess_options, providers=ort.get_available_providers() + ) ort_inputs = {} len_inputs = len(session.get_inputs()) inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] - + for idx, batch in enumerate(dataloader): if idx + 1 > num_iter: break @@ -277,17 +249,23 @@ def eval(onnx_model): print(args) throughput = (num_iter - num_warmup) / total_time print("Throughput: {} samples/s".format(throughput)) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(model, dataloader, top1) print("Batch size = %d" % dataloader.batch_size) print("Accuracy: %.5f" % acc_result) if args.tune: - calibration_data_reader = DataReader(args.model_path, args.dataset_location, args.label_path, args.batch_size, calibration_sampling_size=100) + calibration_data_reader = DataReader( + args.model_path, args.dataset_location, args.label_path, args.batch_size, calibration_sampling_size=100 + ) custom_tune_config = tuning.TuningConfig( config_set=config.StaticQuantConfig.get_config_set_for_tuning( - quant_format=quantization.QuantFormat.QOperator if args.quant_format == "QOperator" else quantization.QuantFormat.QDQ, + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), ) ) best_model = tuning.autotune( diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh index 9923a2857..2d87088e3 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh @@ -13,16 +13,16 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; --label_path=*) - label_path=$(echo $var |cut -f2 -d=) + label_path=$(echo "$var" |cut -f2 -d=) ;; --mode=*) - mode=$(echo $var |cut -f2 -d=) + mode=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -33,10 +33,10 @@ function init_params { function run_benchmark { python main.py \ - --model_path ${input_model} \ - --dataset_location ${dataset_location} \ - --label_path ${label_path-${dataset_location}/../val.txt} \ - --mode=${mode} \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --mode="${mode}" \ --batch_size 1 \ --benchmark diff --git a/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh index c7f7a0401..0e44d8d02 100644 --- a/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh +++ b/examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh @@ -14,19 +14,19 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --output_model=*) - output_model=$(echo $var |cut -f2 -d=) + output_model=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; --label_path=*) - label_path=$(echo $var |cut -f2 -d=) + label_path=$(echo "$var" |cut -f2 -d=) ;; --quant_format=*) - quant_format=$(echo $var |cut -f2 -d=) + quant_format=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -36,11 +36,11 @@ function init_params { # run_tuning function run_tuning { python main.py \ - --model_path ${input_model} \ - --dataset_location ${dataset_location} \ - --label_path ${label_path-${dataset_location}/../val.txt} \ - --output_model ${output_model} \ - --quant_format ${quant_format-QOperator} \ + --model_path "${input_model}" \ + --dataset_location "${dataset_location}" \ + --label_path "${label_path-${dataset_location}/../val.txt}" \ + --output_model "${output_model}" \ + --quant_format "${quant_format-QOperator}" \ --tune } diff --git a/examples/nlp/bert/quantization/ptq_dynamic/README.md b/examples/nlp/bert/quantization/ptq_dynamic/README.md index dab252bcb..212c8b899 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/README.md +++ b/examples/nlp/bert/quantization/ptq_dynamic/README.md @@ -11,7 +11,6 @@ pip install onnx-neural-compressor pip install -r requirements.txt ``` -> Note: Validated ONNX Runtime [Version](/docs/installation_guide.md#validated-software-environment). ## 2. Prepare Dataset diff --git a/examples/nlp/bert/quantization/ptq_dynamic/main.py b/examples/nlp/bert/quantization/ptq_dynamic/main.py index 781b6e8c5..0298054d7 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/main.py +++ b/examples/nlp/bert/quantization/ptq_dynamic/main.py @@ -16,28 +16,32 @@ # under the License. # pylint:disable=redefined-outer-name,logging-format-interpolation +import argparse +import dataclasses import logging +import os import pathlib import tempfile -import argparse -import os +from typing import List, Optional, Union + +import numpy as np import onnx import onnxruntime -import transformers +import time import torch -import numpy as np -import dataclasses -from typing import List, Optional, Union +import transformers +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions from torch.utils import data + from onnx_neural_compressor import config from onnx_neural_compressor.quantization import tuning -from onnxruntime.transformers import optimizer -from onnxruntime.transformers.fusion_options import FusionOptions logger = logging.getLogger(__name__) -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) + class ONNXRTBertDataset: """Dataset used for model Bert. @@ -59,57 +63,77 @@ class ONNXRTBertDataset: filter (Filter objects, default=None): filter out examples according to specific conditions. """ - def __init__(self, model, data_dir, model_name_or_path, max_seq_length=128,\ - do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\ - evaluate=True, transform=None, filter=None): + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): self.inputs = [inp.name for inp in onnx.load(model).graph.input] task = task.lower() model_type = model_type.lower() - assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ - 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' - assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \ - model type' + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ + model type" self.dynamic_length = dynamic_length self.model_type = model_type self.max_seq_length = max_seq_length - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, - do_lower_case=do_lower_case) - self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \ - max_seq_length, task, model_type, tokenizer, evaluate) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) def __len__(self): return len(self.dataset) def __getitem__(self, index): batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) - return batch[:len(self.inputs)], batch[-1] + return batch[: len(self.inputs)], batch[-1] + -def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \ - model_type, tokenizer, evaluate): +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): processor = transformers.glue_processors[task]() output_mode = transformers.glue_output_modes[task] # Load data features from cache or dataset file if not os.path.exists("./dataset_cached"): os.makedirs("./dataset_cached") - cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, model_name_or_path.split('/'))).pop(), - str(max_seq_length), - str(task))) + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file): logger.info("Load features from cached file {}.".format(cached_features_file)) features = torch.load(cached_features_file) else: logger.info("Create features from dataset file at {}.".format(data_dir)) label_list = processor.get_labels() - examples = processor.get_dev_examples(data_dir) if evaluate else \ - processor.get_train_examples(data_dir) - features = convert_examples_to_features(examples, - tokenizer, - task=task, - label_list=label_list, - max_length=max_seq_length, - output_mode=output_mode, + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, ) logger.info("Save features into cached file {}.".format(cached_features_file)) torch.save(features, cached_features_file) @@ -122,10 +146,10 @@ def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \ - all_seq_lengths, all_labels) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) return dataset + def convert_examples_to_features( examples, tokenizer, @@ -143,7 +167,7 @@ def convert_examples_to_features( logger.info("Use label list {} for task {}.".format(label_list, task)) label_map = {label: i for i, label in enumerate(label_list)} features = [] - for (ex_index, example) in enumerate(examples): + for ex_index, example in enumerate(examples): inputs = tokenizer.encode_plus( example.text_a, example.text_b, @@ -162,19 +186,14 @@ def convert_examples_to_features( padding_length = max_length - len(input_ids) input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + \ - ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - assert len(input_ids) == max_length, \ - "Error with input_ids length {} vs {}".format( - len(input_ids), max_length) - assert len(attention_mask) == max_length, \ - "Error with attention_mask length {} vs {}".format( + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( len(attention_mask), max_length ) - assert len(token_type_ids) == max_length, \ - "Error with token_type_ids length {} vs {}".format( + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( len(token_type_ids), max_length ) if output_mode == "classification": @@ -194,6 +213,7 @@ def convert_examples_to_features( features.append(feats) return features + @dataclasses.dataclass(frozen=True) class InputFeatures: """ @@ -217,6 +237,7 @@ class InputFeatures: label: Optional[Union[int, float]] = None seq_length: Optional[List[int]] = None + class ONNXRTGLUE: """Computes GLUE score. @@ -226,9 +247,9 @@ class ONNXRTGLUE: sts-b, cola, mnli, wnli. """ - def __init__(self, task='mrpc'): - assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ - 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + + def __init__(self, task="mrpc"): + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" self.pred_list = None self.label_list = None self.task = task @@ -241,7 +262,7 @@ def __init__(self, task='mrpc'): "qnli": "acc", "rte": "acc", "wnli": "acc", - "sst-2": "acc" + "sst-2": "acc", } def update(self, preds, labels): @@ -270,102 +291,62 @@ def result(self): processed_preds = np.argmax(self.pred_list, axis=1) elif output_mode == "regression": processed_preds = np.squeeze(self.pred_list) - result = transformers.glue_compute_metrics(\ - self.task, processed_preds, self.label_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) return result[self.return_key[self.task]] + if __name__ == "__main__": - logger.info('Evaluating ONNXRuntime full precision accuracy and performance:') + logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") parser = argparse.ArgumentParser( - description='BERT fine-tune examples for classification/regression tasks.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument( - '--model_path', - type=str, - help="Pre-trained resnet50 model on onnx file" + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument("--model_path", type=str, help="Pre-trained resnet50 model on onnx file") + parser.add_argument("--benchmark", action="store_true", default=False) + parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") + parser.add_argument("--output_model", type=str, help="output model path") + parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") + parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") + parser.add_argument("--data_path", type=str, help="input data path") parser.add_argument( - '--benchmark', - action='store_true', \ - default=False - ) - parser.add_argument( - '--tune', - action='store_true', \ - default=False, - help="whether quantize the model" - ) - parser.add_argument( - '--output_model', - type=str, - help="output model path" - ) - parser.add_argument( - '--mode', - type=str, - help="benchmark mode of performance or accuracy" - ) - parser.add_argument( - '--model_name_or_path', - type=str, - help="pretrained model name or path" - ) - parser.add_argument( - '--data_path', - type=str, - help="input data path" - ) - parser.add_argument( - '--batch_size', + "--batch_size", default=8, type=int, ) parser.add_argument( - '--task', + "--task", type=str, - default='mrpc', - choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ - 'mnli', 'wnli', 'sst-2'], - help="GLUE task name" - ) - parser.add_argument( - "--dynamic_length", - type=bool, - default=False, - help="dynamic length" - ) - parser.add_argument( - "--max_seq_length", - type=int, - default=128, - help="max sequence length" + default="mrpc", + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", ) + parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") + parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") parser.add_argument( "--model_type", type=str, - default="bert", + default="bert", choices=["distilbert", "bert", "mobilebert", "roberta"], - help="model type" - ) - parser.add_argument( - "--intra_op_num_threads", - type=int, - default=4 + help="model type", ) + parser.add_argument("--intra_op_num_threads", type=int, default=4) args = parser.parse_args() - dataset = ONNXRTBertDataset(args.model_path, + dataset = ONNXRTBertDataset( + args.model_path, data_dir=args.data_path, model_name_or_path=args.model_name_or_path, max_seq_length=args.max_seq_length, task=args.task, model_type=args.model_type, - dynamic_length=args.dynamic_length) + dynamic_length=args.dynamic_length, + ) dataloader = data.DataLoader( dataset, sampler=data.SequentialSampler(dataset), batch_size=args.batch_size, shuffle=False, ) + def eval_func(model): metric = ONNXRTGLUE(args.task) session = onnxruntime.InferenceSession(model, providers=onnxruntime.get_available_providers()) @@ -373,11 +354,10 @@ def eval_func(model): len_inputs = len(session.get_inputs()) inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] - batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() - for idx, batch in enumerate(dataloader): label = batch[-1] batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() data = [ batch[0][:, :batch_seq_length], batch[1][:, :batch_seq_length], @@ -389,28 +369,28 @@ def eval_func(model): metric.update(predictions[0], label) return metric.result() - if args.benchmark: model = onnx.load(args.model_path) - if args.mode == "performance": + if args.mode == "performance": total_time = 0.0 num_iter = 100 num_warmup = 10 - sess_options = ort.SessionOptions() + sess_options = onnxruntime.SessionOptions() sess_options.intra_op_num_threads = args.intra_op_num_threads - session = onnxruntime.InferenceSession(model.SerializeToString(), - sess_options, - providers=onnxruntime.get_available_providers()) + session = onnxruntime.InferenceSession( + model.SerializeToString(), sess_options, providers=onnxruntime.get_available_providers() + ) ort_inputs = {} len_inputs = len(session.get_inputs()) inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] - + for idx, batch in enumerate(dataloader): if idx + 1 > num_iter: break batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch) + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() data = [ batch[0][:, :batch_seq_length], batch[1][:, :batch_seq_length], @@ -428,7 +408,7 @@ def eval_func(model): print(args) throughput = (num_iter - num_warmup) / total_time print("Throughput: {} samples/s".format(throughput)) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(model) print("Batch size = %d" % args.batch_size) print("Accuracy: %.5f" % acc_result) @@ -436,15 +416,12 @@ def eval_func(model): if args.tune: # optimize model with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: - opt_options = FusionOptions('bert') + opt_options = FusionOptions("bert") opt_options.enable_embed_layer_norm = False model_optimizer = optimizer.optimize_model( - args.model_path, - 'bert', - num_heads=12, - hidden_size=768, - optimization_options=opt_options) + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) model = model_optimizer.model # check the optimized model is valid @@ -454,13 +431,10 @@ def eval_func(model): model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() except Exception as e: logger.warning("Optimized model is invalid: {}. ".format(e)) - logger.warning("Model optimizer will be skipped. " \ - "Try to upgrade onnxruntime to avoid this error") + logger.warning("Model optimizer will be skipped. " "Try to upgrade onnxruntime to avoid this error") model = args.model_path - custom_tune_config = tuning.TuningConfig( - config_set=config.DynamicQuantConfig.get_config_set_for_tuning() - ) + custom_tune_config = tuning.TuningConfig(config_set=config.DynamicQuantConfig.get_config_set_for_tuning()) best_model = tuning.autotune( model_input=model, tune_config=custom_tune_config, diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh index 8e434a5c5..c1fddb546 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh @@ -14,10 +14,10 @@ function init_params { do case $var in --data_dir=*) - data_dir=$(echo $var |cut -f2 -d=) + data_dir=$(echo "$var" |cut -f2 -d=) ;; --task_name=*) - task_name=$(echo $var |cut -f2 -d=) + task_name=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -27,7 +27,7 @@ function init_params { # run_tuning function download_data { wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py - python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name} + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" } main "$@" diff --git a/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py index 0efed802f..5b9216640 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py +++ b/examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py @@ -1,14 +1,14 @@ import argparse import os import sys -import zipfile import urllib +import zipfile import torch import transformers # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] -# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) # for detailed model export. MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" @@ -19,16 +19,13 @@ def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") parser.add_argument("--output_model", type=str, required=True) - parser.add_argument('--max_len', - type=int, - default=128, - help='Maximum length of the sentence pairs') + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") return parser.parse_args() def progressbar(cur, total=100): - percent = '{:.2%}'.format(cur / total) - sys.stdout.write("\r[%-100s] %s" % ('#' * int(cur), percent)) + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) sys.stdout.flush() @@ -42,15 +39,15 @@ def schedule(blocknum, blocksize, totalsize): def is_zip_file(filename): try: - with open(filename, 'rb') as f: + with open(filename, "rb") as f: magic_number = f.read(4) - return magic_number == b'PK\x03\x04' # ZIP file magic number + return magic_number == b"PK\x03\x04" # ZIP file magic number except OSError: return False def extrafile(filename, target_folder="."): - with zipfile.ZipFile(filename, 'r') as zin: + with zipfile.ZipFile(filename, "r") as zin: zin.extractall(target_folder) @@ -80,33 +77,30 @@ def download_model(url, model_name, retry_times=5): def export_model(model, output_model, max_len=128): with torch.no_grad(): inputs = { - 'input_ids': torch.ones(1, max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, max_len, dtype=torch.int64), - 'token_type_ids': torch.ones(1, max_len, dtype=torch.int64) + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), } - symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + symbolic_names = {0: "batch_size", 1: "max_seq_len"} torch.onnx.export( model, # model being run ( - inputs['input_ids'], - inputs['attention_mask'], - inputs['token_type_ids'], + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], ), # model input (or a tuple for multiple inputs) output_model, # where to save the model (can be a file or file-like object) opset_version=14, # the ONNX version to export the model do_constant_folding=True, # whether to execute constant folding - input_names=[ - 'input_ids', # the model's input names - 'input_mask', - 'segment_ids' - ], - output_names=['output'], # the model's output names + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names dynamic_axes={ - 'input_ids': symbolic_names, # variable length axes - 'input_mask': symbolic_names, - 'segment_ids': symbolic_names - }) + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" print("ONNX Model exported to {0}".format(output_model)) @@ -114,8 +108,7 @@ def export_model(model, output_model, max_len=128): def prepare_model(input_model, output_model, max_len): is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) if is_download_successful: - folder_name = is_download_successful if isinstance(is_download_successful, - str) else "./MRPC" + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" model = transformers.BertForSequenceClassification.from_pretrained(folder_name) export_model(model, output_model, max_len) diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh index d71c0a908..766d50476 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh @@ -14,16 +14,16 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --mode=*) - mode=$(echo $var |cut -f2 -d=) + mode=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) + batch_size=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -43,16 +43,15 @@ function run_benchmark { model_name_or_path="bert-base-uncased" task_name="mrpc" - model_type="bert" python main.py \ - --model_path ${input_model} \ - --model_name_or_path ${model_name_or_path} \ - --data_path ${dataset_location} \ - --task ${task_name} \ - --batch_size ${batch_size} \ - --mode ${mode} \ - --dynamic_length ${dynamic_length} \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --dynamic_length "${dynamic_length}" \ --benchmark } diff --git a/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh index 6876ddc50..53e864930 100644 --- a/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh +++ b/examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh @@ -12,13 +12,13 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --output_model=*) - output_model=$(echo $var |cut -f2 -d=) + output_model=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -30,15 +30,14 @@ function run_tuning { model_name_or_path="bert-base-uncased" batch_size=8 task_name="mrpc" - model_type="bert" python main.py \ - --model_path ${input_model} \ - --output_model ${output_model} \ - --model_name_or_path ${model_name_or_path} \ - --data_path ${dataset_location} \ - --task ${task_name} \ - --batch_size ${batch_size} \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ --tune } diff --git a/examples/nlp/bert/quantization/ptq_static/README.md b/examples/nlp/bert/quantization/ptq_static/README.md index fb2f13851..c34e76a79 100644 --- a/examples/nlp/bert/quantization/ptq_static/README.md +++ b/examples/nlp/bert/quantization/ptq_static/README.md @@ -11,7 +11,6 @@ This example load a BERT model and confirm its accuracy and speed based on [GLUE pip install onnx-neural-compressor pip install -r requirements.txt ``` -> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). ## 2. Prepare Dataset diff --git a/examples/nlp/bert/quantization/ptq_static/main.py b/examples/nlp/bert/quantization/ptq_static/main.py index b12f36d47..9a3996132 100644 --- a/examples/nlp/bert/quantization/ptq_static/main.py +++ b/examples/nlp/bert/quantization/ptq_static/main.py @@ -16,70 +16,43 @@ # under the License. # pylint:disable=redefined-outer-name,logging-format-interpolation -import logging -import pathlib -import tempfile import argparse import dataclasses +import logging import os +import pathlib +import tempfile +from typing import List, Optional, Union + +import numpy as np import onnx import onnxruntime -import transformers +import time import torch -import numpy as np -from onnx_neural_compressor import data_reader -from typing import List, Optional, Union -from torch.utils import data -from onnx_neural_compressor import config -from onnx_neural_compressor import quantization -from onnx_neural_compressor.quantization import tuning +import transformers from onnxruntime.transformers import optimizer from onnxruntime.transformers.fusion_options import FusionOptions +from torch.utils import data + +from onnx_neural_compressor import config, data_reader, quantization +from onnx_neural_compressor.quantization import tuning logger = logging.getLogger(__name__) -logging.basicConfig(format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt = "%m/%d/%Y %H:%M:%S", - level = logging.WARN) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN +) logger.info("Evaluating ONNXRuntime full precision accuracy and performance:") parser = argparse.ArgumentParser( -description="BERT fine-tune examples for classification/regression tasks.", -formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--model_path", - type=str, - help="Pre-trained model on onnx file" -) -parser.add_argument( - "--benchmark", - action="store_true", \ - default=False -) -parser.add_argument( - "--tune", - action="store_true", \ - default=False, - help="whether quantize the model" -) -parser.add_argument( - "--output_model", - type=str, - help="output model path" -) -parser.add_argument( - "--mode", - type=str, - help="benchmark mode of performance or accuracy" -) -parser.add_argument( - "--model_name_or_path", - type=str, - help="pretrained model name or path" -) -parser.add_argument( - "--data_path", - type=str, - help="input data path" + description="BERT fine-tune examples for classification/regression tasks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) +parser.add_argument("--model_path", type=str, help="Pre-trained model on onnx file") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, help="output model path") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") +parser.add_argument("--model_name_or_path", type=str, help="pretrained model name or path") +parser.add_argument("--data_path", type=str, help="input data path") parser.add_argument( "--batch_size", default=8, @@ -89,41 +62,22 @@ "--task", type=str, default="mrpc", - choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", \ - "mnli", "wnli", "sst-2"], - help="GLUE task name" + choices=["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], + help="GLUE task name", ) parser.add_argument( - "--quant_format", - type=str, - default="QOperator", - choices=["QDQ", "QOperator"], - help="quantization format" + "--quant_format", type=str, default="QOperator", choices=["QDQ", "QOperator"], help="quantization format" ) +parser.add_argument("--dynamic_length", type=bool, default=False, help="dynamic length") +parser.add_argument("--max_seq_length", type=int, default=128, help="max sequence length") parser.add_argument( - "--dynamic_length", - type=bool, - default=False, - help="dynamic length" + "--model_type", type=str, default="bert", choices=["distilbert", "bert", "mobilebert", "roberta"], help="model type" ) parser.add_argument( - "--max_seq_length", - type=int, - default=128, - help="max sequence length" -) -parser.add_argument( - "--model_type", + "--device", type=str, - default="bert", - choices=["distilbert", "bert", "mobilebert", "roberta"], - help="model type" -) -parser.add_argument( - '--device', - type=str, - default='cpu', - choices=['cpu', 'npu'], + default="cpu", + choices=["cpu", "npu"], ) args = parser.parse_args() @@ -148,57 +102,77 @@ class ONNXRTBertDataset: filter (Filter objects, default=None): filter out examples according to specific conditions. """ - def __init__(self, model, data_dir, model_name_or_path, max_seq_length=128,\ - do_lower_case=True, task="mrpc", model_type="bert", dynamic_length=False,\ - evaluate=True, transform=None, filter=None): + + def __init__( + self, + model, + data_dir, + model_name_or_path, + max_seq_length=128, + do_lower_case=True, + task="mrpc", + model_type="bert", + dynamic_length=False, + evaluate=True, + transform=None, + filter=None, + ): self.inputs = [inp.name for inp in onnx.load(model).graph.input] task = task.lower() model_type = model_type.lower() - assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", \ - "mnli", "wnli", "sst-2"], "Unsupported task type" - assert model_type in ["distilbert", "bert", "mobilebert", "roberta"], "Unsupported \ + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" + assert model_type in [ + "distilbert", + "bert", + "mobilebert", + "roberta", + ], "Unsupported \ model type" self.dynamic_length = dynamic_length self.model_type = model_type self.max_seq_length = max_seq_length - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, - do_lower_case=do_lower_case) - self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \ - max_seq_length, task, model_type, tokenizer, evaluate) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples( + data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate + ) def __len__(self): return len(self.dataset) def __getitem__(self, index): batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) - return batch[:len(self.inputs)], batch[-1] + return batch[: len(self.inputs)], batch[-1] + -def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \ - model_type, tokenizer, evaluate): +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, model_type, tokenizer, evaluate): processor = transformers.glue_processors[task]() output_mode = transformers.glue_output_modes[task] # Load data features from cache or dataset file if not os.path.exists("./dataset_cached"): os.makedirs("./dataset_cached") - cached_features_file = os.path.join("./dataset_cached", "cached_{}_{}_{}_{}".format( - "dev" if evaluate else "train", - list(filter(None, model_name_or_path.split("/"))).pop(), - str(max_seq_length), - str(task))) + cached_features_file = os.path.join( + "./dataset_cached", + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, model_name_or_path.split("/"))).pop(), + str(max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file): logger.info("Load features from cached file {}.".format(cached_features_file)) features = torch.load(cached_features_file) else: logger.info("Create features from dataset file at {}.".format(data_dir)) label_list = processor.get_labels() - examples = processor.get_dev_examples(data_dir) if evaluate else \ - processor.get_train_examples(data_dir) - features = convert_examples_to_features(examples, - tokenizer, - task=task, - label_list=label_list, - max_length=max_seq_length, - output_mode=output_mode, + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + features = convert_examples_to_features( + examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, ) logger.info("Save features into cached file {}.".format(cached_features_file)) torch.save(features, cached_features_file) @@ -211,10 +185,10 @@ def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \ - all_seq_lengths, all_labels) + dataset = data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_seq_lengths, all_labels) return dataset + def convert_examples_to_features( examples, tokenizer, @@ -232,7 +206,7 @@ def convert_examples_to_features( logger.info("Use label list {} for task {}.".format(label_list, task)) label_map = {label: i for i, label in enumerate(label_list)} features = [] - for (ex_index, example) in enumerate(examples): + for ex_index, example in enumerate(examples): inputs = tokenizer.encode_plus( example.text_a, example.text_b, @@ -251,19 +225,14 @@ def convert_examples_to_features( padding_length = max_length - len(input_ids) input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + \ - ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - assert len(input_ids) == max_length, \ - "Error with input_ids length {} vs {}".format( - len(input_ids), max_length) - assert len(attention_mask) == max_length, \ - "Error with attention_mask length {} vs {}".format( + assert len(input_ids) == max_length, "Error with input_ids length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with attention_mask length {} vs {}".format( len(attention_mask), max_length ) - assert len(token_type_ids) == max_length, \ - "Error with token_type_ids length {} vs {}".format( + assert len(token_type_ids) == max_length, "Error with token_type_ids length {} vs {}".format( len(token_type_ids), max_length ) if output_mode == "classification": @@ -283,6 +252,7 @@ def convert_examples_to_features( features.append(feats) return features + @dataclasses.dataclass(frozen=True) class InputFeatures: """ @@ -306,6 +276,7 @@ class InputFeatures: label: Optional[Union[int, float]] = None seq_length: Optional[List[int]] = None + class ONNXRTGLUE: """Computes GLUE score. @@ -315,9 +286,9 @@ class ONNXRTGLUE: sts-b, cola, mnli, wnli. """ + def __init__(self, task="mrpc"): - assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", \ - "mnli", "wnli", "sst-2"], "Unsupported task type" + assert task in ["mrpc", "qqp", "qnli", "rte", "sts-b", "cola", "mnli", "wnli", "sst-2"], "Unsupported task type" self.pred_list = None self.label_list = None self.task = task @@ -330,7 +301,7 @@ def __init__(self, task="mrpc"): "qnli": "acc", "rte": "acc", "wnli": "acc", - "sst-2": "acc" + "sst-2": "acc", } def update(self, preds, labels): @@ -358,21 +329,23 @@ def result(self): processed_preds = np.argmax(self.pred_list, axis=1) elif output_mode == "regression": processed_preds = np.squeeze(self.pred_list) - result = transformers.glue_compute_metrics(\ - self.task, processed_preds, self.label_list) + result = transformers.glue_compute_metrics(self.task, processed_preds, self.label_list) return result[self.return_key[self.task]] + class DataReader(data_reader.CalibrationDataReader): def __init__(self, model_path, dynamic_length=False, batch_size=1, calibration_sampling_size=8): self.encoded_list = [] - self.batch_size=batch_size - dataset = ONNXRTBertDataset(args.model_path, + self.batch_size = batch_size + dataset = ONNXRTBertDataset( + args.model_path, data_dir=args.data_path, model_name_or_path=args.model_name_or_path, max_seq_length=args.max_seq_length, task=args.task, model_type=args.model_type, - dynamic_length=args.dynamic_length) + dynamic_length=args.dynamic_length, + ) dataloader = data.DataLoader( dataset, sampler=data.SequentialSampler(dataset), @@ -381,7 +354,7 @@ def __init__(self, model_path, dynamic_length=False, batch_size=1, calibration_s ) model = onnx.load(model_path, load_external_data=False) inputs_names = [input.name for input in model.graph.input] - self.batch_size=batch_size + self.batch_size = batch_size for idx, batch in enumerate(dataloader): if idx + 1 > calibration_sampling_size: @@ -403,17 +376,20 @@ def get_next(self): def rewind(self): self.iter_next = iter(self.encoded_list) + if __name__ == "__main__": # set config for npu test provider = "DmlExecutionProvider" if args.device == "npu" else "CPUExecutionProvider" - dataset = ONNXRTBertDataset(args.model_path, + dataset = ONNXRTBertDataset( + args.model_path, data_dir=args.data_path, model_name_or_path=args.model_name_or_path, max_seq_length=args.max_seq_length, task=args.task, model_type=args.model_type, - dynamic_length=args.dynamic_length) + dynamic_length=args.dynamic_length, + ) dataloader = data.DataLoader( dataset, sampler=data.SequentialSampler(dataset), @@ -428,11 +404,10 @@ def eval_func(model): len_inputs = len(session.get_inputs()) inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] - batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() - for idx, batch in enumerate(dataloader): label = batch[-1] batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch[0]) + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() inputs = [ batch[0][:, :batch_seq_length], batch[1][:, :batch_seq_length], @@ -446,24 +421,25 @@ def eval_func(model): if args.benchmark: model = onnx.load(args.model_path) - if args.mode == 'performance': + if args.mode == "performance": total_time = 0.0 num_iter = 100 num_warmup = 10 - sess_options = ort.SessionOptions() + sess_options = onnxruntime.SessionOptions() sess_options.intra_op_num_threads = args.intra_op_num_threads - session = onnxruntime.InferenceSession(model.SerializeToString(), - sess_options, - providers=onnxruntime.get_available_providers()) + session = onnxruntime.InferenceSession( + model.SerializeToString(), sess_options, providers=onnxruntime.get_available_providers() + ) ort_inputs = {} len_inputs = len(session.get_inputs()) inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] - + for idx, batch in enumerate(dataloader): if idx + 1 > num_iter: break batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in batch) + batch_seq_length = args.max_seq_length if not args.dynamic_length else torch.max(batch[-2], 0)[0].item() inputs = [ batch[0][:, :batch_seq_length], batch[1][:, :batch_seq_length], @@ -481,7 +457,7 @@ def eval_func(model): print(args) throughput = (num_iter - num_warmup) / total_time print("Throughput: {} samples/s".format(throughput)) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(model) print("Batch size = %d" % args.batch_size) print("Accuracy: %.5f" % acc_result) @@ -493,11 +469,8 @@ def eval_func(model): opt_options.enable_embed_layer_norm = False model_optimizer = optimizer.optimize_model( - args.model_path, - "bert", - num_heads=12, - hidden_size=768, - optimization_options=opt_options) + args.model_path, "bert", num_heads=12, hidden_size=768, optimization_options=opt_options + ) model = model_optimizer.model # check the optimized model is valid @@ -507,14 +480,17 @@ def eval_func(model): model = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() except Exception as e: logger.warning("Optimized model is invalid: {}. ".format(e)) - logger.warning("Model optimizer will be skipped. " \ - "Try to upgrade onnxruntime to avoid this error") + logger.warning("Model optimizer will be skipped. " "Try to upgrade onnxruntime to avoid this error") model = args.model_path calibration_data_reader = DataReader(args.model_path, calibration_sampling_size=8) custom_tune_config = tuning.TuningConfig( config_set=config.StaticQuantConfig.get_config_set_for_tuning( - quant_format=quantization.QuantFormat.QOperator if args.quant_format == "QOperator" else quantization.QuantFormat.QDQ, + quant_format=( + quantization.QuantFormat.QOperator + if args.quant_format == "QOperator" + else quantization.QuantFormat.QDQ + ), calibration_sampling_size=8, extra_options={"optypes_to_exclude_output_quant": ["MatMul", "Gemm", "Attention", "FusedGemm"]}, execution_provider=provider, diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_data.sh b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh index 8e434a5c5..c1fddb546 100644 --- a/examples/nlp/bert/quantization/ptq_static/prepare_data.sh +++ b/examples/nlp/bert/quantization/ptq_static/prepare_data.sh @@ -14,10 +14,10 @@ function init_params { do case $var in --data_dir=*) - data_dir=$(echo $var |cut -f2 -d=) + data_dir=$(echo "$var" |cut -f2 -d=) ;; --task_name=*) - task_name=$(echo $var |cut -f2 -d=) + task_name=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -27,7 +27,7 @@ function init_params { # run_tuning function download_data { wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py - python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name} + python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" } main "$@" diff --git a/examples/nlp/bert/quantization/ptq_static/prepare_model.py b/examples/nlp/bert/quantization/ptq_static/prepare_model.py index 0a29b5830..5b9216640 100644 --- a/examples/nlp/bert/quantization/ptq_static/prepare_model.py +++ b/examples/nlp/bert/quantization/ptq_static/prepare_model.py @@ -1,14 +1,14 @@ import argparse import os import sys -import zipfile import urllib +import zipfile import torch import transformers # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] -# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) +# (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) # for detailed model export. MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" @@ -19,16 +19,13 @@ def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") parser.add_argument("--output_model", type=str, required=True) - parser.add_argument('--max_len', - type=int, - default=128, - help='Maximum length of the sentence pairs') + parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") return parser.parse_args() def progressbar(cur, total=100): - percent = '{:.2%}'.format(cur / total) - sys.stdout.write("\r[%-100s] %s" % ('#' * int(cur), percent)) + percent = "{:.2%}".format(cur / total) + sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) sys.stdout.flush() @@ -42,15 +39,15 @@ def schedule(blocknum, blocksize, totalsize): def is_zip_file(filename): try: - with open(filename, 'rb') as f: + with open(filename, "rb") as f: magic_number = f.read(4) - return magic_number == b'PK\x03\x04' # ZIP file magic number + return magic_number == b"PK\x03\x04" # ZIP file magic number except OSError: return False def extrafile(filename, target_folder="."): - with zipfile.ZipFile(filename, 'r') as zin: + with zipfile.ZipFile(filename, "r") as zin: zin.extractall(target_folder) @@ -80,30 +77,30 @@ def download_model(url, model_name, retry_times=5): def export_model(model, output_model, max_len=128): with torch.no_grad(): inputs = { - 'input_ids': torch.ones(1, max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, max_len, dtype=torch.int64), - 'token_type_ids': torch.ones(1, max_len, dtype=torch.int64) + "input_ids": torch.ones(1, max_len, dtype=torch.int64), + "attention_mask": torch.ones(1, max_len, dtype=torch.int64), + "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), } - symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + symbolic_names = {0: "batch_size", 1: "max_seq_len"} torch.onnx.export( model, # model being run - (inputs['input_ids'], inputs['attention_mask'], - inputs['token_type_ids']), # model input (or a tuple for multiple inputs) + ( + inputs["input_ids"], + inputs["attention_mask"], + inputs["token_type_ids"], + ), # model input (or a tuple for multiple inputs) output_model, # where to save the model (can be a file or file-like object) opset_version=14, # the ONNX version to export the model do_constant_folding=True, # whether to execute constant folding - input_names=[ - 'input_ids', # the model's input names - 'input_mask', - 'segment_ids' - ], - output_names=['output'], # the model's output names + input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names + output_names=["output"], # the model's output names dynamic_axes={ - 'input_ids': symbolic_names, # variable length axes - 'input_mask': symbolic_names, - 'segment_ids': symbolic_names - }) + "input_ids": symbolic_names, # variable length axes + "input_mask": symbolic_names, + "segment_ids": symbolic_names, + }, + ) assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" print("ONNX Model exported to {0}".format(output_model)) @@ -111,8 +108,7 @@ def export_model(model, output_model, max_len=128): def prepare_model(input_model, output_model, max_len): is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) if is_download_successful: - folder_name = is_download_successful if isinstance(is_download_successful, - str) else "./MRPC" + folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" model = transformers.BertForSequenceClassification.from_pretrained(folder_name) export_model(model, output_model, max_len) diff --git a/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh index d71c0a908..766d50476 100644 --- a/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh +++ b/examples/nlp/bert/quantization/ptq_static/run_benchmark.sh @@ -14,16 +14,16 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --mode=*) - mode=$(echo $var |cut -f2 -d=) + mode=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) + batch_size=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -43,16 +43,15 @@ function run_benchmark { model_name_or_path="bert-base-uncased" task_name="mrpc" - model_type="bert" python main.py \ - --model_path ${input_model} \ - --model_name_or_path ${model_name_or_path} \ - --data_path ${dataset_location} \ - --task ${task_name} \ - --batch_size ${batch_size} \ - --mode ${mode} \ - --dynamic_length ${dynamic_length} \ + --model_path "${input_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --mode "${mode}" \ + --dynamic_length "${dynamic_length}" \ --benchmark } diff --git a/examples/nlp/bert/quantization/ptq_static/run_quant.sh b/examples/nlp/bert/quantization/ptq_static/run_quant.sh index 08821d983..976e8e0c2 100644 --- a/examples/nlp/bert/quantization/ptq_static/run_quant.sh +++ b/examples/nlp/bert/quantization/ptq_static/run_quant.sh @@ -12,16 +12,16 @@ function init_params { do case $var in --input_model=*) - input_model=$(echo $var |cut -f2 -d=) + input_model=$(echo "$var" |cut -f2 -d=) ;; --output_model=*) - output_model=$(echo $var |cut -f2 -d=) + output_model=$(echo "$var" |cut -f2 -d=) ;; --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) + dataset_location=$(echo "$var" |cut -f2 -d=) ;; --quant_format=*) - quant_format=$(echo $var |cut -f2 -d=) + quant_format=$(echo "$var" |cut -f2 -d=) ;; esac done @@ -36,14 +36,14 @@ function run_tuning { model_type="bert" python main.py \ - --model_path ${input_model} \ - --output_model ${output_model} \ - --model_name_or_path ${model_name_or_path} \ - --data_path ${dataset_location} \ - --task ${task_name} \ - --batch_size ${batch_size} \ - --model_type ${model_type} \ - --quant_format ${quant_format} \ + --model_path "${input_model}" \ + --output_model "${output_model}" \ + --model_name_or_path "${model_name_or_path}" \ + --data_path "${dataset_location}" \ + --task "${task_name}" \ + --batch_size "${batch_size}" \ + --model_type "${model_type}" \ + --quant_format "${quant_format}" \ --tune } diff --git a/onnx_neural_compressor/algorithms/post_training_quant/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py index e3fdc07b1..28f108cb6 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/__init__.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py index b7cc35c7d..40e3b9645 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py @@ -30,10 +30,11 @@ import onnxruntime from onnxruntime import quantization as ort_quant from packaging import version -from onnx_neural_compressor.algorithms.post_training_quant import calibrator + +from onnx_neural_compressor import logger, onnx_model from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import logger +from onnx_neural_compressor.algorithms.post_training_quant import calibrator + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): import onnxruntime_extensions @@ -68,7 +69,11 @@ def __init__( execution_provider (list, optional): execution provider for onnxruntime. Defaults to 'CPUExecutionProvider'. reduce_range (bool, optional): use 7 bit or not. Defaults to False. """ - self.model_wrapper = model_wrapper if isinstance(model_wrapper, onnx_model.ONNXModel) else onnx_model.ONNXModel(model_wrapper, load_external_data=True) + self.model_wrapper = ( + model_wrapper + if isinstance(model_wrapper, onnx_model.ONNXModel) + else onnx_model.ONNXModel(model_wrapper, load_external_data=True) + ) self.model = self.model_wrapper.model ai_onnx_domain = [opset for opset in self.model.opset_import if not opset.domain or opset.domain == "ai.onnx"] self.opset_version = ai_onnx_domain[0].version @@ -224,11 +229,17 @@ def get_activation_tensors_calib_range(self, q_config=None): if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): so.register_custom_ops_library(onnxruntime_extensions.get_library_path()) - execution_provider = self.execution_provider if self.execution_provider != "TensorrtExecutionProvider" else "CUDAExecutionProvider" + execution_provider = ( + self.execution_provider + if self.execution_provider != "TensorrtExecutionProvider" + else "CUDAExecutionProvider" + ) session = ( onnxruntime.InferenceSession(self.augmented_model.SerializeToString(), so, providers=[execution_provider]) if not self.model_wrapper.is_large_model - else onnxruntime.InferenceSession(self.model_wrapper.model_path + "_augment.onnx", so, providers=[execution_provider]) + else onnxruntime.InferenceSession( + self.model_wrapper.model_path + "_augment.onnx", so, providers=[execution_provider] + ) ) len_inputs = len(session.get_inputs()) @@ -268,7 +279,9 @@ def _collect_data(inputs): node_name = name_to_node[node_output_names[output_idx]] if node_output_names[output_idx] not in name_to_calibrator: calib_method = ( - q_config[node_name]["calibrate_method"].name if q_config and node_name in q_config else ort_quant.CalibrationMethod.MinMax.name + q_config[node_name]["calibrate_method"].name + if q_config and node_name in q_config + else ort_quant.CalibrationMethod.MinMax.name ) assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format( calib_method @@ -283,18 +296,13 @@ def _collect_data(inputs): # per iteration in the future. if _calibrator.method_name == ort_quant.CalibrationMethod.MinMax.name: _calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = [ - list(_calibrator.calib_range) - ] + activation_tensors_calib_range[node_output_names[output_idx]] = [list(_calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = _calibrator else: - intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( - output - ) + intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append(output) elif q_config is None: activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output) - idx = 0 while True: inputs = self.dataloader.get_next() @@ -314,10 +322,12 @@ def _collect_data(inputs): for (output_name, node_name), datas in merged_dict.items(): if any([data is None for data in datas]): continue - if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip + if any([data.dtype in [bool] for data in datas]): # output type of some ops is bool, skip continue calib_method = ( - q_config[node_name]["calibrate_method"].name if q_config and node_name in q_config else ort_quant.CalibrationMethod.MinMax.name + q_config[node_name]["calibrate_method"].name + if q_config and node_name in q_config + else ort_quant.CalibrationMethod.MinMax.name ) _calibrator = calibrator.CALIBRATOR[calib_method]() _calibrator.collect(datas) @@ -386,7 +396,9 @@ def get_weight_tensors_calib_range(self): os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else "" ), ) - _calibrator = calibrator.CALIBRATOR[ort_quant.CalibrationMethod.MinMax.name]() # use minmax method to calibrate initializer tensors + _calibrator = calibrator.CALIBRATOR[ + ort_quant.CalibrationMethod.MinMax.name + ]() # use minmax method to calibrate initializer tensors if initializer_tensor.flatten().size > 0: _calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)] @@ -560,16 +572,19 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): qType = 2 # uint8 # input and output tensor follow activation_type and activation_sym - if tensor_name in input_name_to_nodes and \ - any([i.name in q_config for i in input_name_to_nodes[tensor_name]]): + if tensor_name in input_name_to_nodes and any( + [i.name in q_config for i in input_name_to_nodes[tensor_name]] + ): for child in input_name_to_nodes[tensor_name]: if child.name in q_config and q_config[child.name] not in ["fp32", "fp16", "bf16"]: sym = q_config[child.name]["activation_sym"] qType = q_config[child.name]["activation_type"] break - elif tensor_name in output_name_to_node and \ - output_name_to_node[tensor_name].name in q_config and \ - q_config[output_name_to_node[tensor_name].name] not in ["fp32", "fp16", "bf16"]: + elif ( + tensor_name in output_name_to_node + and output_name_to_node[tensor_name].name in q_config + and q_config[output_name_to_node[tensor_name].name] not in ["fp32", "fp16", "bf16"] + ): sym = q_config[output_name_to_node[tensor_name].name]["activation_sym"] qType = q_config[output_name_to_node[tensor_name].name]["activation_type"] if self.execution_provider in ["TensorrtExecutionProvider"]: diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py index 8ffbb0c46..042518092 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py @@ -20,6 +20,7 @@ """Calibrator for onnx models.""" import copy + import numpy as np from scipy import stats diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py index 25f7fe13b..454c3ea69 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py @@ -13,8 +13,9 @@ # limitations under the License. """Operators for onnx model.""" -from os import path import glob +from os import path + from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op modules = glob.glob(path.join(path.dirname(__file__), "*.py")) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py index baaa82d9a..c06d92dac 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="LeakyRelu, Sigmoid", mode=[constants.STATIC_QUANT]) @@ -101,7 +100,10 @@ def quantize(self): self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) self.quantizer.remove_nodes.append(node) -@base_op.op_registry(op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT]) + +@base_op.op_registry( + op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT] +) class Float16ActivationOperator(base_op.Operator): """Float16 Activation operator.""" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py index a6932a8f9..594e24c05 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py @@ -13,10 +13,9 @@ # limitations under the License. """ArgMax operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="ArgMax", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py index d8da2f1ed..46f102352 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Attention", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py index 7e0f0e805..c3c97617a 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py @@ -13,9 +13,10 @@ # limitations under the License. """Base Operator.""" -from onnx_neural_compressor import constants from onnxruntime import quantization +from onnx_neural_compressor import constants + OPERATORS = { "dynamic_quant": {}, "static_quant": {}, diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py index a40f2e43a..4aa1637b7 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/binary_op.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Add, Mul", mode=[constants.STATIC_QUANT]) @@ -139,7 +138,10 @@ def convert(self): self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") node.output[0] = node.output[0] + "_quantized" -@base_op.op_registry(op_types="Sum, Sub, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual", mode=[constants.STATIC_QUANT]) + +@base_op.op_registry( + op_types="Sum, Sub, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual", mode=[constants.STATIC_QUANT] +) class Float16BinaryOperator(base_op.Operator): """Float16 Binary operator.""" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py index 1a0df76ba..9e0f0ff6b 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/concat.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Concat", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py index 8e305535c..ede7e1bfa 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/conv.py @@ -17,9 +17,9 @@ import onnx from onnx import onnx_pb as onnx_proto -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op -from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.DYNAMIC_QUANT]) @@ -115,9 +115,7 @@ def convert(self): scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) if scales_mul_node is None: - scales_mul_node = onnx.helper.make_node( - "Mul", [scale_0, scale_1], [scales_mul_op + ":0"], scales_mul_op - ) + scales_mul_node = onnx.helper.make_node("Mul", [scale_0, scale_1], [scales_mul_op + ":0"], scales_mul_op) self.quantizer.new_nodes.append(scales_mul_node) scales_mul_op_output = scales_mul_node.output[0] @@ -126,13 +124,12 @@ def convert(self): # and make the output of this node the same as output of original conv node. output_scale_mul_op = node.name + "_output_scale_mul" self.quantizer.new_nodes.append( - onnx.helper.make_node( - "Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op - ) + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) ) self.quantizer.remove_nodes.extend(parents[1:]) self.quantizer.remove_nodes.append(node) + @base_op.op_registry(op_types="Conv, FusedConv", mode=[constants.STATIC_QUANT]) class StaticConvOperator(ConvOperator): """Conv Operator.""" @@ -174,9 +171,7 @@ def convert(self): """Convert to QOperator format.""" node = self.node - if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith( - "_quant" - ): # pragma: no cover + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover return parents = self.quantizer.model.get_parents(node) child = self.quantizer.model.get_children(node)[0] diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py index 79639186a..77d09793b 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py @@ -13,16 +13,15 @@ # limitations under the License. """Direct8Bit Operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry( op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad", - mode=[constants.STATIC_QUANT] + mode=[constants.STATIC_QUANT], ) class Direct8BitOperator(base_op.Operator): """Direct8Bit Operator.""" @@ -66,8 +65,9 @@ def convert(self): for parent in parents: if parent.op_type == "DequantizeLinear": # make sure parent DequantizeLinear of input 0 is not used by other ops - if len(self.quantizer.model.get_children(parent)) == 1 and \ - not self.quantizer.model.is_graph_output(parents[0].output[0]): + if len(self.quantizer.model.get_children(parent)) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): self.quantizer.remove_nodes.append(parent) self.node.input[0] = parent.input[0] break diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py index a4e35796c..0b9967f3d 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="EmbedLayerNormalization", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py index 4a573d08b..fd851885f 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py @@ -15,13 +15,14 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op -@base_op.op_registry(op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) +@base_op.op_registry( + op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT] +) class GatherOperator(base_op.Operator): """Gather Operator.""" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py index 6d3dfb460..a91c1e531 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="GlobalAveragePool", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py index 8e05ea63b..8d0b61c73 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, logger from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Gemm", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py index bfc48ff79..8499f2441 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/lstm.py @@ -16,10 +16,9 @@ import numpy import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="LSTM", mode=[constants.DYNAMIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py index 18b9841a9..eff98f533 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/matmul.py @@ -16,9 +16,9 @@ import onnx from onnx import onnx_pb as onnx_proto -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op -from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor import constants +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="MatMul", mode=[constants.DYNAMIC_QUANT]) @@ -93,9 +93,7 @@ def convert(self): scales_mul_node = quant_utils.find_by_name(scales_mul_op, self.quantizer.new_nodes) if scales_mul_node is None: - scales_mul_node = onnx.helper.make_node( - "Mul", [scale[0], scale[1]], [scales_mul_op + ":0"], scales_mul_op - ) + scales_mul_node = onnx.helper.make_node("Mul", [scale[0], scale[1]], [scales_mul_op + ":0"], scales_mul_op) self.quantizer.new_nodes.append(scales_mul_node) scales_mul_op_output = scales_mul_node.output[0] @@ -104,9 +102,7 @@ def convert(self): # and make the output of this node the same as output of original matmul node. output_scale_mul_op = node.name + "_output_scale_mul" self.quantizer.new_nodes.append( - onnx.helper.make_node( - "Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op - ) + onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op) ) if parents[1].op_type == "DequantizeLinear": self.quantizer.remove_nodes.append(parents[1]) @@ -138,9 +134,7 @@ def convert(self): """Convert to QOperator format.""" node = self.node parents = self.quantizer.model.get_parents(node) - if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith( - "_quant" - ): # pragma: no cover + if len(self.quantizer.model.get_children(node)) == 0 or not node.name.endswith("_quant"): # pragma: no cover return qlinear_matmul_inputs = [] @@ -166,8 +160,9 @@ def convert(self): self.quantizer.remove_nodes.append(node) # make sure parent DequantizeLinear of input 0 is not used by other ops - if len(self.quantizer.model.get_children(parents[0])) == 1 and \ - not self.quantizer.model.is_graph_output(parents[0].output[0]): + if len(self.quantizer.model.get_children(parents[0])) == 1 and not self.quantizer.model.is_graph_output( + parents[0].output[0] + ): self.quantizer.remove_nodes.extend(parents) else: self.quantizer.remove_nodes.append(parents[1]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py index 1e86984cb..cd5119c13 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py @@ -13,10 +13,9 @@ # limitations under the License. """MaxPool Operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="MaxPool", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py index f5abde24b..61f7efd9e 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Pad", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py index 1a5b43fd8..fb97ce630 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="AveragePool", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py index 9a089b08e..f89000e2e 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py @@ -13,14 +13,14 @@ # limitations under the License. """Reduce Operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry( - op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare" , mode=[constants.STATIC_QUANT] + op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare", + mode=[constants.STATIC_QUANT], ) class ReduceOperator(base_op.Operator): """Reduce Operator.""" diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py index 177dd6ec9..0cba83441 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py @@ -13,10 +13,9 @@ # limitations under the License. """Resize Operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Resize", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py index 551c97acc..97bded14f 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/split.py @@ -15,10 +15,9 @@ import onnx -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Split", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py index 9d081d9d4..87c402b99 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py @@ -13,10 +13,9 @@ # limitations under the License. """Unary operator.""" -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import constants, utility from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op @base_op.op_registry(op_types="Exp, Log, Round, Sqrt", mode=[constants.STATIC_QUANT]) diff --git a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py index c3e46730d..4e8b815e5 100644 --- a/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py +++ b/onnx_neural_compressor/algorithms/post_training_quant/quantizer.py @@ -16,14 +16,14 @@ import copy import logging import os -import onnxruntime as ort + import numpy as np import onnx +import onnxruntime as ort -from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op +from onnx_neural_compressor import logger, onnx_model from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model +from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op class Quantizer: @@ -143,10 +143,7 @@ def should_quantize(self, node): def should_convert(self, node): """Check if node should be converted.""" name = quant_utils.get_node_original_name(node) - if ( - name in self.config - and self.config[name] not in self.fallback_list - ): + if name in self.config and self.config[name] not in self.fallback_list: return True else: return False @@ -230,8 +227,12 @@ def merge_dedicated_qdq_pair(self): for n in dq_nodes: datas.append( [ - onnx.numpy_helper.to_array(quant_utils.find_by_name(n.input[1], self.model.initializer())), - onnx.numpy_helper.to_array(quant_utils.find_by_name(n.input[2], self.model.initializer())), + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[1], self.model.initializer()) + ), + onnx.numpy_helper.to_array( + quant_utils.find_by_name(n.input[2], self.model.initializer()) + ), ] ) for idx, data in enumerate(datas): @@ -280,15 +281,16 @@ def remove_duplicate_qdq_paris(self): for node in self.model.nodes(): if node.op_type == "DequantizeLinear": matched_parents = self.model.match_parent_path( - node, - ["QuantizeLinear", "DequantizeLinear", "QuantizeLinear"], - [None, None, None], - ) + node, + ["QuantizeLinear", "DequantizeLinear", "QuantizeLinear"], + [None, None, None], + ) if matched_parents is not None: # (node) DQ - (matched_parents) Q-DQ-Q - if all([i.op_type == "QuantizeLinear" for i in self.model.get_children(matched_parents[1])]) and \ - not self.model.is_graph_output(matched_parents[1].output[0]): + if all( + [i.op_type == "QuantizeLinear" for i in self.model.get_children(matched_parents[1])] + ) and not self.model.is_graph_output(matched_parents[1].output[0]): self.remove_nodes.append(matched_parents[1]) if all([i.op_type == "DequantizeLinear" for i in self.model.get_children(matched_parents[0])]): self.remove_nodes.append(matched_parents[0]) @@ -337,7 +339,8 @@ def quantize_bias_tensor(self, node): or input_name not in self.quantized_value_map or ( input_name in self.quantized_value_map - and quant_utils.find_by_name(self.quantized_value_map[input_name].scale_name, self.model.initializer()) is None + and quant_utils.find_by_name(self.quantized_value_map[input_name].scale_name, self.model.initializer()) + is None ) ): self._dynamic_quantize_bias(input_name, weight_name + "_scale", bias_name, bias_name + "_quantized") @@ -505,7 +508,6 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, sym, channel_ax return (weight.name + "_quantized", weight.name + "_zero_point", weight.name + "_scale") - def dequantize_tensor(self, node, value_name): """Dequantize tensor.""" if value_name in self.quantized_value_map: @@ -749,7 +751,9 @@ def quantize_outputs(self, node, initializer_use_weight_qType=True, direct_int8= for child in self.model.get_children(node): self.replace_input.append([child, tensor_name, dequant_node.output[0]]) if tensor_name not in self.quantized_value_map: - quantized_value = quant_utils.QuantizedValue(tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input) + quantized_value = quant_utils.QuantizedValue( + tensor_name, dq_output, scale_name, zp_name, quant_utils.QuantizedValueType.Input + ) self.quantized_value_map[tensor_name] = quantized_value def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, direct_int8=False): @@ -799,7 +803,13 @@ def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True, self.replace_input.append([node, weight.name, dequant_node.output[0]]) if weight.name not in self.quantized_value_map: quantized_value = quant_utils.QuantizedValue( - weight.name, q_weight_name, scale_name, zp_name, quant_utils.QuantizedValueType.Initializer, None, dtype + weight.name, + q_weight_name, + scale_name, + zp_name, + quant_utils.QuantizedValueType.Initializer, + None, + dtype, ) self.quantized_value_map[weight.name] = quantized_value else: @@ -822,9 +832,7 @@ def quantize_weights_per_channel(self, node, indices, weight_qType, sym, axis): continue q_name, zp_name, scale_name = self.quantize_weight_per_channel(inp, weight_qType, sym, axis) - weight_name = ( - ("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp - ) + weight_name = ("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp dequant_node = onnx.helper.make_node( "DequantizeLinear", [q_name, scale_name, zp_name], @@ -848,6 +856,7 @@ def quantize_weights_per_channel(self, node, indices, weight_qType, sym, axis): ) self.new_nodes.append(qlinear_node) + class StaticQuantizer(Quantizer): """Static quantizer class.""" @@ -887,7 +896,7 @@ def __init__( static=True, quantization_params=quantization_params, op_types_to_quantize=op_types_to_quantize, - ) + ) self.fallback_list = fallback_list self.reduce_range = reduce_range self.add_qdq_pair_to_weight = add_qdq_pair_to_weight @@ -919,9 +928,9 @@ def _revert_conv_add_fusion(self): for node in self.model.nodes(): if node.op_type == "Conv" and len(node.input) == 3: bias_tensor = self.model.get_initializer(node.input[2]) - bias_array = numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1)) + bias_array = onnx.numpy_helper.to_array(bias_tensor).reshape((-1, 1, 1)) self.model.remove_initializer(bias_tensor) - self.model.add_initializer(numpy_helper.from_array(bias_array, bias_tensor.name)) + self.model.add_initializer(onnx.numpy_helper.from_array(bias_array, bias_tensor.name)) kwargs = {} activation_params = None for attr in node.attribute: @@ -994,6 +1003,7 @@ def _quantize_activation(self, node, tensor_name, direct_int8=False): ) self.quantized_value_map[tensor_name] = quantized_value + class DynamicQuantizer(Quantizer): """Dynamic quantizer class.""" @@ -1027,13 +1037,11 @@ def __init__( static=False, quantization_params=quantization_params, op_types_to_quantize=op_types_to_quantize, - ) + ) def _quantize_activation(self, node, tensor_name, direct_int8=False): """Quantize node activation.""" - qlinear_node = self.model.find_node_by_name( - tensor_name + "_QuantizeLinear", self.new_nodes, self.model.graph() - ) + qlinear_node = self.model.find_node_by_name(tensor_name + "_QuantizeLinear", self.new_nodes, self.model.graph()) if qlinear_node is None: if ( self.fuse_dynamic_quant diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index f4bf049c2..ab902de07 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -20,6 +20,7 @@ import numpy as np import onnx import onnxruntime as ort + from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.algorithms import utility as quant_utils from onnx_neural_compressor.algorithms.smoother import calibrator diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py index 06e270c33..a45ee2683 100644 --- a/onnx_neural_compressor/algorithms/utility.py +++ b/onnx_neural_compressor/algorithms/utility.py @@ -17,17 +17,17 @@ import enum import os +import pathlib import re import struct import sys from importlib import util import numpy as np +from onnxruntime.quantization import onnx_model from packaging import version -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility -from onnxruntime.quantization import onnx_model +from onnx_neural_compressor import constants, utility, logger if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions @@ -80,6 +80,7 @@ onnx.TensorProto.INT8: (-64, 64), } + def check_model_with_infer_shapes(model): """Check if the model has been shape inferred.""" if isinstance(model, (pathlib.Path, str)): @@ -90,6 +91,7 @@ def check_model_with_infer_shapes(model): return True return False + def find_by_name(name, item_list): """Helper function to find item by name in a list.""" items = [] @@ -102,9 +104,11 @@ def find_by_name(name, item_list): else: return None + def is_quantizable_type(data_type): return data_type in [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16, onnx.TensorProto.BFLOAT16] + def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802 """Get qmin, qmax for qType.""" if qType == onnx.TensorProto.FLOAT8E4M3FN: @@ -124,6 +128,7 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802 return qrange + def dtype_to_name(dtype_mapping, dtype): """Map data type and its string representation.""" return list(dtype_mapping.keys())[list(dtype_mapping.values()).index(dtype)] @@ -439,22 +444,25 @@ def calculate_scale_zp(rmin, rmax, quantize_range, qType, sym): if isinstance(rmax, np.ndarray): if sym: max_range = np.maximum(abs(rmin), abs(rmax)) - rmin = - max_range + rmin = -max_range rmax = max_range scale = (rmax - rmin) / (qmax - qmin) scale[scale < np.finfo(rmax.dtype).tiny] = 1 - zero_point = np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) if sym else \ - np.round(qmin - rmin / scale).astype(dtype) + zero_point = ( + np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype) + if sym + else np.round(qmin - rmin / scale).astype(dtype) + ) else: if sym: max_range = max(abs(rmin), abs(rmax)) scale = (float(max_range) * 2) / (qmax - qmin) if max_range > 0 else 1 else: scale = (float(rmax) - float(rmin)) / (qmax - qmin) if rmin != rmax else 1 - zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else \ - np.round(qmin - rmin / scale).astype(dtype) + zero_point = np.round((qmax + qmin) / 2.0).astype(dtype) if sym else np.round(qmin - rmin / scale).astype(dtype) return np.float32(scale), zero_point + def quantize_data(data, quantize_range, qType, sym): """Quantize data. @@ -493,6 +501,7 @@ def get_node_original_name(node) -> str: # For unquantized nodes return node_name + class QuantType(enum.Enum): # pragma: no cover """Represent QuantType value.""" @@ -758,6 +767,7 @@ def _get_value(self, node, idx): raise Exception("Incomplete symbolic shape inference") return symbolic_shape_inference.out_mp_ + def dump_model_op_stats(model, quantize_config, fp32_op_list): qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"] res = {} @@ -800,6 +810,7 @@ def dump_model_op_stats(model, quantize_config, fp32_op_list): utility.Statistics(output_data, header="Quantization Statistics", field_names=field_names).print_stat() + def dump_woq_stats(model, quantize_config, fp32_op_list): res = {} for optype in fp32_op_list: diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index bb8783528..b2db33dcb 100644 --- a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -25,8 +25,8 @@ from packaging import version from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model -from onnx_neural_compressor.algorithms.weight_only import rtn from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.weight_only import rtn from typing import List, Union # isort: skip @@ -107,7 +107,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, else: q_weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int") - q_weight = q_weight[:org_w_shape[0], :] / np.expand_dims(scales, axis=-1) + q_weight = q_weight[: org_w_shape[0], :] / np.expand_dims(scales, axis=-1) out = np.matmul(inp, q_weight) loss += np.mean(np.power((org_out - out), 2)) @@ -258,7 +258,7 @@ def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, else: weight = quant_utils.qdq_tensor(weight, num_bits, group_size, sym, "int", ratio) - cur_out = np.matmul(inp, weight[:, :org_w_shape[0]].T) + cur_out = np.matmul(inp, weight[:, : org_w_shape[0]].T) loss = np.mean(np.power((org_out - cur_out), 2)) is_best = loss < best_error if is_best: diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index f0a1b9038..c95c346f8 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -25,8 +25,8 @@ from packaging.version import Version from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility -from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.layer_wise import core from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index f570b5271..6856f378d 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -24,8 +24,8 @@ from packaging import version from onnx_neural_compressor import config, constants, onnx_model, utility -from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.layer_wise import core from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py index 7b32b5b79..59d0ceb65 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/config.py @@ -23,19 +23,16 @@ import os import pathlib import re -from abc import ABC -from abc import abstractmethod +from abc import ABC, abstractmethod import numpy as np import onnx import pydantic -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility from onnxruntime import quantization from typing_extensions import Self +from onnx_neural_compressor import constants, data_reader, logger, utility + from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -302,7 +299,7 @@ def __getitem__(self, key): return getattr(self, key) else: raise KeyError(f"No such attribute: {key}") - + def __setitem__(self, key, value): setattr(self, key, value) @@ -421,7 +418,6 @@ def build_tuning_param(config: BaseConfig, param: str): raise ValueError(f"Unsupported param type: {param}") return tuning_param - def expand(self) -> List[BaseConfig]: """Expand the config. @@ -483,7 +479,9 @@ def expand(self) -> List[BaseConfig]: local_op_level_config_lst = model_level_config_lst else: tuning_param_name_lst = [tuning_param.name for tuning_param in op_tuning_param_list] - tuning_param_val_lst = list(itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list])) + tuning_param_val_lst = list( + itertools.product(*[tuning_param.options for tuning_param in op_tuning_param_list]) + ) tuning_param_pair_lst = [dict(zip(tuning_param_name_lst[::-1], val[::-1])) for val in tuning_param_val_lst] for model_level_config in model_level_config_lst: @@ -544,6 +542,7 @@ def __eq__(self, other: BaseConfig) -> bool: return False return self.get_init_args() == other.get_init_args() + class ComposableConfig(BaseConfig): name = constants.COMPOSABLE_CONFIG @@ -666,10 +665,18 @@ def to_dict(self): result = {} for key, val in self.__dict__.items(): if not isinstance(val, list): - result[key] = getattr(val, "tensor_type", val) if isinstance(val, quantization.QuantType) else getattr(val, "value", val) + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) else: result[key] = [ - getattr(item, "tensor_type", item) if isinstance(item, quantization.QuantType) else getattr(item, "value", item) + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) for item in val ] return result @@ -680,6 +687,7 @@ def __eq__(self, other): else: return self.to_dict() == other + class _OperatorConfig(NamedTuple): config: OperatorConfig operators: List[Union[str, Callable]] @@ -1218,7 +1226,6 @@ class StaticQuantConfig(BaseConfig, quantization.StaticQuantConfig): ] name: str = constants.STATIC_QUANT - def __init__( self, calibration_data_reader: data_reader.CalibrationDataReader = None, @@ -1268,7 +1275,11 @@ def __init__( if execution_provider is None: execution_provider = utility.auto_detect_ep() if op_types_to_quantize is None: - op_types_to_quantize = constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) if quant_format == quantization.QuantFormat.QOperator else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) if not reduce_range and not utility.CpuInfo().vnni: logger.warning( "VNNI is not supported and reduce_range=False, reduce_range=True is recommended to avoid potential accuracy issue." @@ -1292,14 +1303,16 @@ def __init__( if "TensorrtExecutionProvider" in execution_provider: logger.info("Update some parameters for TensorrtExecutionProvider") os.environ["ORT_TENSORRT_INT8_ENABLE"] = "0" - self.extra_options.update({ - "add_qdq_pair_to_weight": True, - "dedicated_qdq_pair": True, - "optypes_to_exclude_output_quant": ["Conv", "Gemm", "Add", "MatMul"], - }) + self.extra_options.update( + { + "add_qdq_pair_to_weight": True, + "dedicated_qdq_pair": True, + "optypes_to_exclude_output_quant": ["Conv", "Gemm", "Add", "MatMul"], + } + ) else: os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1" - + BaseConfig.__init__(self, white_list=self.op_types_to_quantize) self.execution_provider = execution_provider self.quant_last_matmul = quant_last_matmul @@ -1314,7 +1327,7 @@ def __init__( def get_model_info(model, white_list=constants.STATIC_QOPERATOR_CPU_OP_LIST) -> list: if not isinstance(model, onnx.ModelProto): model = onnx.load(model, load_external_data=False) - + filter_result = [] for node in model.graph.node: if node.op_type in white_list: @@ -1355,11 +1368,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: - if isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 and op_type not in self.op_types_to_quantize: + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): continue - if isinstance(self.nodes_to_quantize, list) and len(self.nodes_to_quantize) > 0 and op_name not in self.nodes_to_quantize: + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): continue - if isinstance(self.nodes_to_exclude, list) and len(self.nodes_to_exclude) > 0 and op_name in self.nodes_to_exclude: + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] @@ -1390,13 +1415,21 @@ def get_config_set_for_tuning( execution_provider = utility.auto_detect_ep() StaticQuantConfig.register_supported_configs() if op_types_to_quantize is None: - op_types_to_quantize = constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) if quant_format == quantization.QuantFormat.QOperator else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + op_types_to_quantize = ( + constants.STATIC_QOPERATOR_OP_LIST_MAP.get(execution_provider, []) + if quant_format == quantization.QuantFormat.QOperator + else constants.STATIC_QDQ_OP_LIST_MAP.get(execution_provider, []) + ) op_type_candidate = [ op_types_to_quantize, list(set(op_types_to_quantize).difference({"Add", "Mul"})), list(set(op_types_to_quantize).difference({"Add", "Mul", "Gather", "GatherElements", "GatherND"})), - list(set(op_types_to_quantize).difference({"Add", "Mul", "Gather", "GatherElements", "GatherND", "Attention"})), + list( + set(op_types_to_quantize).difference( + {"Add", "Mul", "Gather", "GatherElements", "GatherND", "Attention"} + ) + ), ] cfg_lst = [] @@ -1426,58 +1459,103 @@ def register_supported_configs(cls) -> None: weight_type=onnx.TensorProto.UINT8, weight_sym=False, per_channel=[True, False], - calibrate_method=[quantization.CalibrationMethod.MinMax, quantization.CalibrationMethod.Entropy, quantization.CalibrationMethod.Percentile], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], activation_type=onnx.TensorProto.UINT8, activation_sym=False, ), operators=["GatherND", "GatherElements", "Gather"], valid_func_list=utility.STATIC_CHECK_FUNC_LIST, - )) + ) + ) supported_configs.append( _OperatorConfig( config=OperatorConfig( weight_type=onnx.TensorProto.UINT8, weight_sym=False, per_channel=False, - calibrate_method=[quantization.CalibrationMethod.MinMax, quantization.CalibrationMethod.Entropy, quantization.CalibrationMethod.Percentile], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], activation_type=onnx.TensorProto.UINT8, activation_sym=False, ), operators=["EmbedLayerNormalization"], valid_func_list=utility.STATIC_CHECK_FUNC_LIST, - )) + ) + ) supported_configs.append( _OperatorConfig( config=OperatorConfig( weight_type=onnx.TensorProto.INT8, weight_sym=True, per_channel=[True, False], - calibrate_method=[quantization.CalibrationMethod.MinMax, quantization.CalibrationMethod.Entropy, quantization.CalibrationMethod.Percentile], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], activation_type=onnx.TensorProto.UINT8, activation_sym=False, ), operators=["Conv", "MatMul", "Gemm", "FusedConv"], valid_func_list=utility.STATIC_CHECK_FUNC_LIST, - )) + ) + ) supported_configs.append( _OperatorConfig( config=OperatorConfig( weight_type=onnx.TensorProto.INT8, weight_sym=True, per_channel=False, - calibrate_method=[quantization.CalibrationMethod.MinMax, quantization.CalibrationMethod.Entropy, quantization.CalibrationMethod.Percentile], + calibrate_method=[ + quantization.CalibrationMethod.MinMax, + quantization.CalibrationMethod.Entropy, + quantization.CalibrationMethod.Percentile, + ], activation_type=onnx.TensorProto.UINT8, activation_sym=False, ), operators=[ - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", "Tile", - "Unsqueeze", "Transpose", "Resize", "Abs", "Shrink", "Sign", "Attention", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", - "CenterCropPad", "Add", "Mul", "ArgMax", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", ], valid_func_list=utility.STATIC_CHECK_FUNC_LIST, - )) + ) + ) cls.supported_configs = supported_configs def to_dict(self): @@ -1492,10 +1570,18 @@ def to_dict(self): result[key] = local_result continue if not isinstance(val, list): - result[key] = getattr(val, "tensor_type", val) if isinstance(val, quantization.QuantType) else getattr(val, "value", val) + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) else: result[key] = [ - getattr(item, "tensor_type", item) if isinstance(item, quantization.QuantType) else getattr(item, "value", item) + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) for item in val ] return result @@ -1590,6 +1676,7 @@ def get_config_set_for_tuning( ) -> Union[None, "SmoothQuantConfig", List["SmoothQuantConfig"]]: # pragma: no cover return SmoothQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) + def get_default_sq_config() -> SmoothQuantConfig: """Generate the default smooth quant config. @@ -1670,7 +1757,7 @@ def __init__( def get_model_info(model, white_list=constants.DYNAMIC_CPU_OP_LIST) -> list: if not isinstance(model, onnx.ModelProto): model = onnx.load(model, load_external_data=False) - + filter_result = [] for node in model.graph.node: if node.op_type in white_list: @@ -1710,11 +1797,23 @@ def to_config_mapping(self, config_list: list = None, model_info: list = None) - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() last_matmul = None for op_name, op_type in model_info: - if isinstance(self.op_types_to_quantize, list) and len(self.op_types_to_quantize) > 0 and op_type not in self.op_types_to_quantize: + if ( + isinstance(self.op_types_to_quantize, list) + and len(self.op_types_to_quantize) > 0 + and op_type not in self.op_types_to_quantize + ): continue - if isinstance(self.nodes_to_quantize, list) and len(self.nodes_to_quantize) > 0 and op_name not in self.nodes_to_quantize: + if ( + isinstance(self.nodes_to_quantize, list) + and len(self.nodes_to_quantize) > 0 + and op_name not in self.nodes_to_quantize + ): continue - if isinstance(self.nodes_to_exclude, list) and len(self.nodes_to_exclude) > 0 and op_name in self.nodes_to_exclude: + if ( + isinstance(self.nodes_to_exclude, list) + and len(self.nodes_to_exclude) > 0 + and op_name in self.nodes_to_exclude + ): continue if op_type in op_type_config_dict: self._config_mapping[op_name] = op_type_config_dict[op_type] @@ -1746,9 +1845,19 @@ def get_config_set_for_tuning( op_type_candidate = [ op_types_to_quantize, list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM"})), - list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv"})), - list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "Attention"})), - list(set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "MatMul"})), + list( + set(op_types_to_quantize).difference({"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv"}) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "Attention"} + ) + ), + list( + set(op_types_to_quantize).difference( + {"EmbedLayerNormalization", "Gather", "LSTM", "Conv", "FusedConv", "MatMul"} + ) + ), ] cfg_lst = [] @@ -1780,7 +1889,8 @@ def register_supported_configs(cls) -> None: ), operators=["FusedConv", "Conv", "EmbedLayerNormalization"], valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, - )) + ) + ) supported_configs.append( _OperatorConfig( config=OperatorConfig( @@ -1792,7 +1902,8 @@ def register_supported_configs(cls) -> None: ), operators=["MatMul"], valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, - )) + ) + ) supported_configs.append( _OperatorConfig( config=OperatorConfig( @@ -1804,7 +1915,8 @@ def register_supported_configs(cls) -> None: ), operators=["Gather", "Attention", "LSTM"], valid_func_list=utility.DYNAMIC_CHECK_FUNC_LIST, - )) + ) + ) cls.supported_configs = supported_configs def to_dict(self): @@ -1819,10 +1931,18 @@ def to_dict(self): result[key] = local_result continue if not isinstance(val, list): - result[key] = getattr(val, "tensor_type", val) if isinstance(val, quantization.QuantType) else getattr(val, "value", val) + result[key] = ( + getattr(val, "tensor_type", val) + if isinstance(val, quantization.QuantType) + else getattr(val, "value", val) + ) else: result[key] = [ - getattr(item, "tensor_type", item) if isinstance(item, quantization.QuantType) else getattr(item, "value", item) + ( + getattr(item, "tensor_type", item) + if isinstance(item, quantization.QuantType) + else getattr(item, "value", item) + ) for item in val ] return result diff --git a/onnx_neural_compressor/constants.py b/onnx_neural_compressor/constants.py index 39e6429ed..71caf2a49 100644 --- a/onnx_neural_compressor/constants.py +++ b/onnx_neural_compressor/constants.py @@ -56,76 +56,247 @@ GPTQ_OP_LIST = ["MatMul"] -DYNAMIC_CPU_OP_LIST = [ - "FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM" -] -DYNAMIC_CUDA_OP_LIST = [ - "FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM" -] +DYNAMIC_CPU_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] +DYNAMIC_CUDA_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] DYNAMIC_DML_OP_LIST = [] -DYNAMIC_DNNL_OP_LIST = [ - "FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM" -] +DYNAMIC_DNNL_OP_LIST = ["FusedConv", "Conv", "EmbedLayerNormalization", "MatMul", "Gather", "Attention", "LSTM"] DYNAMIC_TRT_OP_LIST = [] STATIC_QDQ_CPU_OP_LIST = [ - "FusedConv", "Conv", "Gather", "GatherElements", "GatherND", "Tile", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "Resize", "Abs", "Shrink", "Sign", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", "CenterCropPad" + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", ] STATIC_QDQ_CUDA_OP_LIST = [ - "FusedConv", "Conv", "Gather", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "Resize", "Abs", "Shrink", "Sign", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", ] STATIC_QDQ_DML_OP_LIST = [ - "Conv", "MatMul", "Relu", "Clip", "MaxPool", + "Conv", + "MatMul", + "Relu", + "Clip", + "MaxPool", ] STATIC_QDQ_DNNL_OP_LIST = [ - "FusedConv", "Conv", "Gather", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "Resize", + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", ] STATIC_QDQ_TRT_OP_LIST = [ - "Conv", "MatMul", "Attention", "LeakyRelu", "Gather", "Sigmoid", - "MaxPool", "EmbedLayerNormalization", "GlobalAveragePool", "Pad", - "Split", "Squeeze", "Reshape", "Concat", "AveragePool", "Unsqueeze", - "Transpose", "Resize", "Gemm", "Add", + "Conv", + "MatMul", + "Attention", + "LeakyRelu", + "Gather", + "Sigmoid", + "MaxPool", + "EmbedLayerNormalization", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "Resize", + "Gemm", + "Add", ] STATIC_QOPERATOR_CPU_OP_LIST = [ - "FusedConv", "Conv", "Gather", "GatherElements", "GatherND", "Tile", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", "Mul", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Add", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "ArgMax", "Resize", "Abs", "Shrink", "Sign", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", "CenterCropPad", + "FusedConv", + "Conv", + "Gather", + "GatherElements", + "GatherND", + "Tile", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", ] STATIC_QOPERATOR_CUDA_OP_LIST = [ - "FusedConv", "Conv", "Gather", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", "Mul", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Add", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "ArgMax", "Resize", "Abs", "Shrink", "Sign", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", + "Abs", + "Shrink", + "Sign", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", ] STATIC_QOPERATOR_DML_OP_LIST = [ - "Conv", "MatMul", "Mul", "Relu", "Clip", "MaxPool", "Add", + "Conv", + "MatMul", + "Mul", + "Relu", + "Clip", + "MaxPool", + "Add", ] STATIC_QOPERATOR_DNNL_OP_LIST = [ - "FusedConv", "Conv", "Gather", - "MatMul", "Gemm", "EmbedLayerNormalization", "Attention", "Mul", - "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Add", "Squeeze", "Reshape", "Concat", "AveragePool", - "Unsqueeze", "Transpose", "ArgMax", "Resize", + "FusedConv", + "Conv", + "Gather", + "MatMul", + "Gemm", + "EmbedLayerNormalization", + "Attention", + "Mul", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Add", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Unsqueeze", + "Transpose", + "ArgMax", + "Resize", ] STATIC_QOPERATOR_TRT_OP_LIST = [] diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index ee6c44379..1dcd5e428 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. +from onnxruntime.quantization import CalibrationMethod from onnxruntime.quantization.quant_utils import QuantFormat, QuantType -from onnxruntime.quantization import CalibrationMethod from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 4428ff7ad..1e42810e4 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -18,12 +18,11 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor.algorithms import utility as quant_utils -from onnx_neural_compressor.algorithms.post_training_quant import calibrate -from onnx_neural_compressor.algorithms.post_training_quant import quantizer from onnxruntime import quantization from onnx_neural_compressor import config, constants, data_reader, logger, utility +from onnx_neural_compressor.algorithms import utility as quant_utils +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer from onnx_neural_compressor.algorithms.smoother import core from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn @@ -53,7 +52,7 @@ def gptq_quantize_entry( quant_config: config.GPTQConfig, calibration_data_reader: data_reader.CalibrationDataReader, *args, - **kwargs + **kwargs, ) -> onnx.ModelProto: """The main entry to apply gptq quantization.""" assert calibration_data_reader is not None, "Please provide calibration_data_reader" @@ -83,7 +82,7 @@ def awq_quantize_entry( quant_config: config.AWQConfig, calibration_data_reader: data_reader.CalibrationDataReader, *args, - **kwargs + **kwargs, ) -> onnx.ModelProto: """The main entry to apply awq quantization.""" assert calibration_data_reader is not None, "Please provide calibration_data_reader" @@ -105,6 +104,7 @@ def awq_quantize_entry( quant_utils.dump_woq_stats(model, config_mapping, quant_config.white_list) return model + ###################### Static quant Entry ################################## @utility.register_algo(name=constants.STATIC_QUANT) def static_quantize_entry( @@ -166,7 +166,7 @@ def smooth_quant_entry( calibration_data_reader: data_reader.CalibrationDataReader, model_output: Union[pathlib.Path, str] = None, *args, - **kwargs + **kwargs, ) -> Union[pathlib.Path, str, onnx.ModelProto]: """Apply smooth quant.""" assert calibration_data_reader is not None, "Please provide calibration_data_reader" @@ -179,7 +179,7 @@ def smooth_quant_entry( smoother = core.Smoother( model, calibration_data_reader, - execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider") + execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider"), ) smoothed_model = smoother.transform(**quant_config.to_dict()) with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: @@ -235,7 +235,7 @@ def dynamic_quantize_entry( model, config_mapping, op_types_to_quantize=quant_config.op_types_to_quantize, - ) + ) _quantizer.quantize_model() if model_output is not None: _quantizer.model.save(model_output) diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index c338454f1..b41c56270 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -14,10 +14,11 @@ from typing import List, Union # isort: skip -import onnx -import onnxruntime as ort import pathlib import tempfile + +import onnx +import onnxruntime as ort from onnxruntime.quantization import matmul_4bits_quantizer from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility @@ -156,7 +157,10 @@ def int4_quant_algo(self): opt_tmp_file = tempfile.TemporaryDirectory() # do graph optimization if not layer_wise_quant - if not getattr(self.algo_config, "layer_wise_quant", False) and self.optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: + if ( + not getattr(self.algo_config, "layer_wise_quant", False) + and self.optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL + ): if not isinstance(model, str): onnx.save(model, pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix()) model = pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix() @@ -179,4 +183,3 @@ def int4_quant_algo(self): def process(self): self.int4_quant_algo() - diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index dab9c7b5d..c90e16d38 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -13,13 +13,13 @@ # limitations under the License. import pathlib +import tempfile from typing import Union import onnx +import onnxruntime as ort from onnxruntime.quantization.quantize import QuantConfig -import onnxruntime as ort -import tempfile from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos @@ -35,7 +35,7 @@ def quantize( if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: sess_options = ort.SessionOptions() sess_options.graph_optimization_level = optimization_level - sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() session = ort.InferenceSession(model_input, sess_options) del session model_input = sess_options.optimized_model_filepath @@ -50,8 +50,8 @@ def quantize( model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output ) elif isinstance(quant_config, config.DynamicQuantConfig): - algos.dynamic_quantize_entry( - model_input, quant_config, model_output=model_output - ) + algos.dynamic_quantize_entry(model_input, quant_config, model_output=model_output) else: - raise TypeError("Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig.") + raise TypeError( + "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig." + ) diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index b0b96ed59..a5caa4c35 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -21,11 +21,11 @@ import uuid import onnx - -from onnx_neural_compressor import config, data_reader, logger, utility import onnxruntime as ort from onnx import external_data_helper +from onnx_neural_compressor import config, data_reader, logger, utility + from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip @@ -103,7 +103,9 @@ def _set_eval_fn_registry(self, user_eval_fns: List[Dict]) -> None: { self.EVAL_FN: user_eval_fn_pair[self.EVAL_FN], self.WEIGHT: user_eval_fn_pair.get(self.WEIGHT, 1.0), - self.FN_NAME: user_eval_fn_pair.get(self.FN_NAME, getattr(user_eval_fn_pair[self.EVAL_FN], "__name__", "custom_func")), + self.FN_NAME: user_eval_fn_pair.get( + self.FN_NAME, getattr(user_eval_fn_pair[self.EVAL_FN], "__name__", "custom_func") + ), } for user_eval_fn_pair in user_eval_fns ] @@ -252,6 +254,7 @@ def __iter__(self) -> Generator[config.BaseConfig, Any, None]: self.verify_config_list.append(new_config) yield new_config + class TuningConfig: """Config for auto tuning pipeline. @@ -368,6 +371,7 @@ def print_config_diff(self, config): else: logger.info("quant config difference: {}".format(config.get_diff_dict(self.tuning_history[0].quant_config))) + class TuningLogger: """A unified logger for the tuning/quantization process. @@ -540,9 +544,7 @@ def autotune( pathlib.Path(model_input).parent.joinpath("config.json").as_posix(), pathlib.Path(tmp_folder.name).joinpath("config.json").as_posix(), ) - eval_result: float = eval_func_wrapper.evaluate( - pathlib.Path(tmp_folder.name).joinpath("eval.onnx").as_posix() - ) + eval_result: float = eval_func_wrapper.evaluate(pathlib.Path(tmp_folder.name).joinpath("eval.onnx").as_posix()) tuning_logger.evaluation_end() logger.info("Evaluation result: %.4f", eval_result) tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) @@ -554,8 +556,10 @@ def autotune( tuning_logger.tuning_end() if best_quant_model is None: - logger.info("Don't find the quantized model which meets accuracy requirement. " - "Please try other configs or adjust tolerable_loss.") + logger.info( + "Don't find the quantized model which meets accuracy requirement. " + "Please try other configs or adjust tolerable_loss." + ) exit(0) tmp_folder.cleanup() diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index f0e4ac093..f1cf126d2 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -103,8 +103,10 @@ def random_seed(self, random_seed): if check_value("random_seed", random_seed, int): self._random_seed = random_seed + options = Options() + def singleton(cls): """Singleton decorator.""" @@ -311,49 +313,122 @@ def auto_detect_ep(): else: return "CPUExecutionProvider" + def static_basic_check(config, optype, execution_provider, quant_format): if quant_format == quantization.QuantFormat.QOperator: if execution_provider not in constants.STATIC_QOPERATOR_OP_LIST_MAP: - raise ValueError("Unsupported execution_provider {}, only support {}.".format(execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()))) + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QOPERATOR_OP_LIST_MAP.keys()) + ) + ) supported_optype = constants.STATIC_QOPERATOR_OP_LIST_MAP[execution_provider] if optype not in supported_optype: - raise ValueError("Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype)) + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) elif quant_format == quantization.QuantFormat.QDQ: if execution_provider not in constants.STATIC_QDQ_OP_LIST_MAP: - raise ValueError("Unsupported execution_provider {}, only support {}.".format(execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()))) + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.STATIC_QDQ_OP_LIST_MAP.keys()) + ) + ) supported_optype = constants.STATIC_QDQ_OP_LIST_MAP[execution_provider] if optype not in supported_optype: - raise ValueError("Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype)) + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) else: - raise ValueError("Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format)) + raise ValueError( + "Unsupported quant_format {}, only support QuantFormat.QOperator and QuantFormat.QDQ.".format(quant_format) + ) return config + def static_cpu_check(config, optype, execution_provider, quant_format): if execution_provider != "CPUExecutionProvider": return config # only support per-tensor - if optype in ["EmbedLayerNormalization", "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", "Tile", - "Unsqueeze", "Transpose", "Resize", "Abs", "Shrink", "Sign", "Attention", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", - "CenterCropPad", "Add", "Mul", "ArgMax"]: + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: setattr(config, "per_channel", False) if optype in ["Attention"]: setattr(config, "activation_type", onnx.TensorProto.UINT8) return config + def static_cuda_check(config, optype, execution_provider, quant_format): if execution_provider != "CUDAExecutionProvider": return config # only support per-tensor - if optype in ["EmbedLayerNormalization", "Relu", "Clip", "LeakyRelu", "Sigmoid", "MaxPool", "GlobalAveragePool", - "Pad", "Split", "Squeeze", "Reshape", "Concat", "AveragePool", "Tile", - "Unsqueeze", "Transpose", "Resize", "Abs", "Shrink", "Sign", "Attention", - "Flatten", "Expand", "Slice", "Mod", "ReduceMax", "ReduceMin", - "CenterCropPad", "Add", "Mul", "ArgMax"]: + if optype in [ + "EmbedLayerNormalization", + "Relu", + "Clip", + "LeakyRelu", + "Sigmoid", + "MaxPool", + "GlobalAveragePool", + "Pad", + "Split", + "Squeeze", + "Reshape", + "Concat", + "AveragePool", + "Tile", + "Unsqueeze", + "Transpose", + "Resize", + "Abs", + "Shrink", + "Sign", + "Attention", + "Flatten", + "Expand", + "Slice", + "Mod", + "ReduceMax", + "ReduceMin", + "CenterCropPad", + "Add", + "Mul", + "ArgMax", + ]: setattr(config, "per_channel", False) if optype in ["Attention"]: @@ -361,6 +436,7 @@ def static_cuda_check(config, optype, execution_provider, quant_format): setattr(config, "weight_type", onnx.TensorProto.INT8) return config + def static_dml_check(config, optype, execution_provider, quant_format): if execution_provider != "DmlExecutionProvider": return config @@ -370,6 +446,7 @@ def static_dml_check(config, optype, execution_provider, quant_format): setattr(config, "per_channel", False) return config + def static_dnnl_check(config, optype, execution_provider, quant_format): if execution_provider != "DnnlExecutionProvider": return config @@ -377,6 +454,7 @@ def static_dnnl_check(config, optype, execution_provider, quant_format): # current configurations are same as CPU EP return static_cpu_check(config, optype, execution_provider, quant_format) + def static_trt_check(config, optype, execution_provider, quant_format): if execution_provider != "TensorrtExecutionProvider": return config @@ -395,6 +473,7 @@ def static_trt_check(config, optype, execution_provider, quant_format): setattr(config, "activation_sym", True) return config + STATIC_CHECK_FUNC_LIST = [ static_basic_check, static_cpu_check, @@ -407,13 +486,20 @@ def static_trt_check(config, optype, execution_provider, quant_format): def dynamic_basic_check(config, optype, execution_provider, quant_format=None): if execution_provider not in constants.DYNAMIC_OP_LIST_MAP: - raise ValueError("Unsupported execution_provider {}, only support {}.".format(execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()))) + raise ValueError( + "Unsupported execution_provider {}, only support {}.".format( + execution_provider, list(constants.DYNAMIC_OP_LIST_MAP.keys()) + ) + ) supported_optype = constants.DYNAMIC_OP_LIST_MAP[execution_provider] if optype not in supported_optype: - raise ValueError("Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype)) + raise ValueError( + "Unsupported optype {} for {}, only support {}.".format(optype, execution_provider, supported_optype) + ) return config + def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): if execution_provider != "CPUExecutionProvider": return config @@ -422,12 +508,14 @@ def dynamic_cpu_check(config, optype, execution_provider, quant_format=None): setattr(config, "per_channel", False) return config + def dynamic_cuda_check(config, optype, execution_provider, quant_format=None): if execution_provider != "CUDAExecutionProvider": return config # current configurations are same as CPU EP return dynamic_cpu_check(config, optype, execution_provider, quant_format) + def dynamic_dml_check(config, optype, execution_provider, quant_format=None): if execution_provider != "DmlExecutionProvider": return config @@ -435,12 +523,14 @@ def dynamic_dml_check(config, optype, execution_provider, quant_format=None): # don't support dynamic quantization return None + def dynamic_dnnl_check(config, optype, execution_provider, quant_format=None): if execution_provider != "DnnlExecutionProvider": return config # current configurations are same as CPU EP return dynamic_cpu_check(config, optype, execution_provider, quant_format) + def dynamic_trt_check(config, optype, execution_provider, quant_format=None): if execution_provider != "TensorrtExecutionProvider": return config @@ -448,6 +538,7 @@ def dynamic_trt_check(config, optype, execution_provider, quant_format=None): # don't support dynamic quantization return None + DYNAMIC_CHECK_FUNC_LIST = [ dynamic_basic_check, dynamic_cpu_check, diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index 20fe6c547..af0bca3e4 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -122,9 +122,9 @@ def test_rtn_layer_wise(self): qmodel = self._apply_quantize(rtn_config, algos.rtn_quantize_entry) self.assertTrue(self._check_model_is_quantized(qmodel)) - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4G32") + lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4G32") + quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") self.assertIsNotNone(quantized_weight) self.assertTrue((lwq_quantized_weight == quantized_weight).all()) @@ -152,9 +152,9 @@ def test_rtn_layer_wise_with_ort_like_api(self): self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) # compare qmodel - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4G128") + lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4G128") + quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") self.assertIsNotNone(quantized_weight) self.assertTrue((lwq_quantized_weight == quantized_weight).all()) @@ -169,9 +169,9 @@ def test_gptq_layer_wise(self): qmodel = self._apply_quantize(gptq_config, algos.gptq_quantize_entry, self.calibration_data_reader) self.assertTrue(self._check_model_is_quantized(qmodel)) - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4G32") + lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4G32") + quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") self.assertIsNotNone(quantized_weight) self.assertTrue((lwq_quantized_weight == quantized_weight).all()) @@ -203,9 +203,9 @@ def test_gptq_layer_wise_with_ort_like_api(self): self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) # compare qmodel - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4G128") + lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4G128") + quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") self.assertIsNotNone(quantized_weight) self.assertTrue((lwq_quantized_weight == quantized_weight).all()) diff --git a/test/quantization/post_training_quant/test_calibrate.py b/test/quantization/post_training_quant/test_calibrate.py index 7f176b9f4..a02880d4a 100644 --- a/test/quantization/post_training_quant/test_calibrate.py +++ b/test/quantization/post_training_quant/test_calibrate.py @@ -5,9 +5,9 @@ import numpy as np import onnx + from onnx_neural_compressor import data_reader -from onnx_neural_compressor.algorithms.post_training_quant import calibrate -from onnx_neural_compressor.algorithms.post_training_quant import calibrator +from onnx_neural_compressor.algorithms.post_training_quant import calibrate, calibrator def generate_input_initializer(tensor_shape, tensor_dtype, input_name): @@ -16,18 +16,31 @@ def generate_input_initializer(tensor_shape, tensor_dtype, input_name): init = onnx.numpy_helper.from_array(tensor, input_name) return init + class DataReader(data_reader.CalibrationDataReader): def __init__(self): self.data_list = [] self.data_list.append( - {"input0": np.array([[[[0.45, 0.60, 0.75]], [[0.25, 0.50, 0.75]], [[0.90, 0.70, 0.50]]]]).astype(np.float32)} + { + "input0": np.array([[[[0.45, 0.60, 0.75]], [[0.25, 0.50, 0.75]], [[0.90, 0.70, 0.50]]]]).astype( + np.float32 + ) + } ) self.data_list.append( - {"input0": np.array([[[[0.62, 0.94, 0.38]], [[0.70, 0.13, 0.07]], [[0.89, 0.75, 0.84]]]]).astype(np.float32)} + { + "input0": np.array([[[[0.62, 0.94, 0.38]], [[0.70, 0.13, 0.07]], [[0.89, 0.75, 0.84]]]]).astype( + np.float32 + ) + } ) self.data_list.append( - {"input0": np.array([[[[0.64, 0.24, 0.97]], [[0.82, 0.58, 0.27]], [[0.019, 0.34, 0.02]]]]).astype(np.float32)} + { + "input0": np.array([[[[0.64, 0.24, 0.97]], [[0.82, 0.58, 0.27]], [[0.019, 0.34, 0.02]]]]).astype( + np.float32 + ) + } ) self.enum_data = None diff --git a/test/quantization/post_training_quant/test_operators.py b/test/quantization/post_training_quant/test_operators.py index 910ed6060..9345305e8 100644 --- a/test/quantization/post_training_quant/test_operators.py +++ b/test/quantization/post_training_quant/test_operators.py @@ -1,8 +1,8 @@ +import collections import copy import os import shutil import unittest -import collections import numpy as np import onnx @@ -30,10 +30,14 @@ def build_model(): conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight"], ["conv2_output"], name="conv2") # 1, 8, 13, 13 - concat_node = onnx.helper.make_node("Concat", ["conv1_output", "conv2_output"], ["concat_output"], name="Concat", axis=1) + concat_node = onnx.helper.make_node( + "Concat", ["conv1_output", "conv2_output"], ["concat_output"], name="Concat", axis=1 + ) # 1, 8, 11, 11 avg_args = {"kernel_shape": [3, 3]} - avgpool_node = onnx.helper.make_node("AveragePool", ["concat_output"], ["avg_output"], name="AveragePool", **avg_args) + avgpool_node = onnx.helper.make_node( + "AveragePool", ["concat_output"], ["avg_output"], name="AveragePool", **avg_args + ) reshape_node = onnx.helper.make_node("Reshape", ["avg_output", "shape"], ["reshape_output"], name="Reshape") add_node_2 = onnx.helper.make_node("Add", ["reshape_output", "add_init_2"], ["add_out_2"], name="add_2") @@ -128,7 +132,9 @@ def test_resize(self): resize_node = onnx.helper.make_node("Resize", resize_inputs, ["output"], name="resize_node", **resize_attrs) resize_roi = [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0] resize_roi_name = "resize_roi" - resize_roi_initializer = onnx.helper.make_tensor(resize_roi_name, onnx.TensorProto.FLOAT, [len(resize_roi)], resize_roi) + resize_roi_initializer = onnx.helper.make_tensor( + resize_roi_name, onnx.TensorProto.FLOAT, [len(resize_roi)], resize_roi + ) initializers.extend([resize_roi_initializer]) resize_node.input.extend([resize_roi_name]) @@ -159,11 +165,15 @@ def test_resize(self): } q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) # test opset version 10 @@ -171,11 +181,15 @@ def test_resize(self): model.ir_version = 7 # use stable onnx ir version q_model = self.qlinear_test(model, q_config, quantize_params, ["Resize", "Conv"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) q_model = self.qdq_test(model, q_config, quantize_params, ["Resize", "Conv"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) def test_argmax(self): @@ -240,7 +254,9 @@ def test_argmax(self): "output": [np.uint8(0), np.float32(10.0)], } q_model = self.qlinear_test(model, q_config, quantize_params, ["Conv", "ArgMax"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) def test_gemm(self): @@ -284,10 +300,14 @@ def test_gemm(self): "output": [np.uint8(0), np.float32(10.0)], } q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) # test gemm with non-constant bias @@ -308,10 +328,14 @@ def test_gemm(self): model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) model.ir_version = 7 q_model = self.qlinear_test(model, q_config, quantize_params, ["Gemm"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) q_model = self.qdq_test(model, q_config, quantize_params, ["Gemm"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) def test_embed(self): @@ -319,7 +343,9 @@ def test_embed(self): input_ids_tensor = onnx.helper.make_tensor_value_info("input_ids", onnx.TensorProto.INT32, input_ids_shape) segment_ids_shape = [1, 4] - segment_ids_tensor = onnx.helper.make_tensor_value_info("segment_ids", onnx.TensorProto.INT32, segment_ids_shape) + segment_ids_tensor = onnx.helper.make_tensor_value_info( + "segment_ids", onnx.TensorProto.INT32, segment_ids_shape + ) # EmbedLayerNormalization Node Constants and Weights: word_embed_shape = [32, 4] @@ -344,10 +370,14 @@ def test_embed(self): # EmbedLayerNormalization Outputs: layernorm_out_shape = [1, 4, 4] - layernorm_out_tensor = onnx.helper.make_tensor_value_info("layernorm_out", onnx.TensorProto.FLOAT, layernorm_out_shape) + layernorm_out_tensor = onnx.helper.make_tensor_value_info( + "layernorm_out", onnx.TensorProto.FLOAT, layernorm_out_shape + ) mask_index_out_shape = [1] - mask_index_out_tensor = onnx.helper.make_tensor_value_info("mask_index_out", onnx.TensorProto.INT32, mask_index_out_shape) + mask_index_out_tensor = onnx.helper.make_tensor_value_info( + "mask_index_out", onnx.TensorProto.INT32, mask_index_out_shape + ) # EmbedLayerNormalization Node: embed_layer_norm_inputs = ["input_ids", "segment_ids", "word_embed", "pos_embed", "seg_embed", "gamma", "beta"] @@ -375,7 +405,8 @@ def test_embed(self): graph = onnx.helper.make_graph(nodes, graph_name, inputs, outputs, initializer=initializers) model = onnx.helper.make_model( - graph, opset_imports=[onnx.helper.make_opsetid("com.microsoft", 14), onnx.helper.make_opsetid("ai.onnx", 14)] + graph, + opset_imports=[onnx.helper.make_opsetid("com.microsoft", 14), onnx.helper.make_opsetid("ai.onnx", 14)], ) model.ir_version = 7 # use stable onnx ir version @@ -391,11 +422,17 @@ def test_embed(self): "input_ids": [np.uint8(10.0), np.float32(0)], } q_model = self.qlinear_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QEmbedLayerNormalization"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QEmbedLayerNormalization"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, ["EmbedLayerNormalization"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["EmbedLayerNormalization"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["EmbedLayerNormalization"], 1 + ) def test_LSTM(self): input_shape = [1, 1, 200] @@ -431,7 +468,9 @@ def test_LSTM(self): q_config = {"lstm": self.q_config} q_model = self.dynamic_test(model, q_config, None, ["LSTM"]) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLSTM"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLSTM"], 1 + ) def test_concat_reshape_pooling(self): model = build_model() @@ -464,12 +503,16 @@ def test_concat_reshape_pooling(self): model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True}) - q_model.save('test.onnx') + q_model.save("test.onnx") self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) q_config = { "Reshape": self.q_config, @@ -480,11 +523,15 @@ def test_concat_reshape_pooling(self): } q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) q_config = { "Reshape": self.q_config, @@ -495,11 +542,15 @@ def test_concat_reshape_pooling(self): } q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 0) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_config = { "Reshape": self.q_config, @@ -510,12 +561,16 @@ def test_concat_reshape_pooling(self): } q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["AveragePool"], 1) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) quantize_params = { "input": [np.uint8(10.0), np.float32(0)], @@ -542,7 +597,9 @@ def test_concat_reshape_pooling(self): q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 6) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) def test_conv(self): for op in ["Conv", "FusedConv"]: @@ -567,11 +624,19 @@ def test_conv(self): } quantizable_op_types = [op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3 + ) def test_matmul(self): A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) @@ -590,15 +655,21 @@ def test_matmul(self): } quantizable_op_types = ["Matmul"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) q_config = {"Matmul": self.q_config} q_model = self.dynamic_test(model, q_config, None, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) quantize_params = {"A": [np.float32(10.0)], "B": [np.float32(10.0)], "C": [np.float32(10.0)]} @@ -609,7 +680,9 @@ def test_matmul(self): quantize_params = {} q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["MatMulInteger"], 1) def test_attention(self): @@ -632,12 +705,16 @@ def test_attention(self): q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QAttention"], 1) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) self.qdq_test(model, q_config, quantize_params, quantizable_op_types) q_config = {"Attention": self.q_config} q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) E = onnx.helper.make_tensor_value_info("E", onnx.TensorProto.INT32, [1, 1, 5, 5]) F = onnx.helper.make_tensor_value_info("F", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) @@ -655,15 +732,21 @@ def test_attention(self): q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) q_config = {"Attention": self.q_config} q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DynamicQuantizeLinear"], 2 + ) def test_gather(self): input_tensor = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [3, 2]) @@ -701,11 +784,15 @@ def test_gather(self): quantizable_op_types = ["Gather", "MatMul"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) q_config = {"Gather": self.q_config, "MatMul": self.q_config} q_model = self.dynamic_test(model, q_config, quantize_params, quantizable_op_types) @@ -743,11 +830,15 @@ def test_split(self): } quantizable_op_types = ["Split", "MatMul"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 5 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) def test_pad(self): @@ -777,7 +868,9 @@ def test_pad(self): ) else: node = onnx.helper.make_node("Pad", ["A", "B"], ["C"], name="Pad", mode=mode) - graph = onnx.helper.make_graph([conv_node, node], "test_graph_1", [E, F, B], [C], [E_init, F_init, B_init]) + graph = onnx.helper.make_graph( + [conv_node, node], "test_graph_1", [E, F, B], [C], [E_init, F_init, B_init] + ) model = onnx.helper.make_model(graph) conv_config = { "weight_type": 3, @@ -797,11 +890,15 @@ def test_pad(self): } quantizable_op_types = ["Conv", "Pad"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test( model, q_config, quantize_params, quantizable_op_types, **{"dedicated_qdq_pair": True} ) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) node = onnx.helper.make_node("Pad", ["E", "B", "D"], ["C"], name="Pad", mode="constant") graph = onnx.helper.make_graph([node], "test_graph_1", [E, B, D], [C], [E_init, B_init, D_init]) @@ -810,10 +907,14 @@ def test_pad(self): quantizable_op_types = ["Pad"] q_config = {"Pad": self.q_config} q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) def test_binary(self): for op in ["Mul", "Add"]: @@ -831,16 +932,24 @@ def test_binary(self): } quantizable_op_types = [op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) def test_relu(self): A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) @@ -879,7 +988,7 @@ def test_relu(self): session = ort.InferenceSession(model.SerializeToString(), sess_options, providers=ort.get_available_providers()) tmp_model = onnx.load(sess_options.optimized_model_filepath) q_model = self.qlinear_test(tmp_model, q_config, quantize_params, quantizable_op_types) - q_model.save('test.onnx') + q_model.save("test.onnx") self.assertEqual(len(q_model.model.graph.node), 5) q_model = self.qdq_test(tmp_model, q_config, quantize_params, quantizable_op_types) self.assertEqual(len(q_model.model.graph.node), 8) @@ -920,10 +1029,14 @@ def test_clip(self): } quantizable_op_types = ["Conv", "Clip"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 3 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 3) def test_activation(self): @@ -937,26 +1050,38 @@ def test_activation(self): quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} quantizable_op_types = [op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) a_value = np.random.randn(1, 10).astype(np.float32) A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) model = onnx.helper.make_model(graph) q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) for op in ["Relu"]: B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 10]) @@ -968,26 +1093,38 @@ def test_activation(self): quantize_params = {"A": [np.uint8(10.0), np.float32(0)], "B": [np.uint8(10.0), np.float32(0)]} quantizable_op_types = [op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) a_value = np.random.randn(1, 10).astype(np.float32) A_init = onnx.helper.make_tensor("A", onnx.TensorProto.FLOAT, [1, 10], a_value.reshape(10).tolist()) graph = onnx.helper.make_graph([node], "test_graph_1", [A], [B], [A_init]) model = onnx.helper.make_model(graph) q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qlinear_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) q_model = self.qdq_test(model, q_config, {}, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 0 + ) def test_pooling(self): op = "MaxPool" @@ -1024,10 +1161,14 @@ def test_pooling(self): } quantizable_op_types = ["Conv", op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) op = "GlobalAveragePool" @@ -1043,11 +1184,19 @@ def test_pooling(self): opset.version = opset_version model = onnx.helper.make_model(graph, opset_imports=[opset]) q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1 + ) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 2 + ) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2 + ) A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 1, 5, 5]) B = onnx.helper.make_tensor_value_info("B", onnx.TensorProto.FLOAT, [1, 1, 3, 3]) @@ -1068,11 +1217,15 @@ def test_pooling(self): } quantizable_op_types = ["Conv", op] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 2) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 4 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) def test_exclude_node(self): @@ -1167,14 +1320,18 @@ def test_more_direct8bit_nodes(self): } quantizable_op_types = ["MatMul", "Flatten", "Abs", "Sign", "Shrink"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) q_model.save("qdq.onnx") - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 9 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 7) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1229,13 +1386,17 @@ def test_expand(self): } quantizable_op_types = ["MatMul", "Expand"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1292,13 +1453,17 @@ def test_slice(self): } quantizable_op_types = ["MatMul", "Slice"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1360,14 +1525,18 @@ def test_mod(self): } quantizable_op_types = ["MatMul", "Mod"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - q_model.save('test.onnx') - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + q_model.save("test.onnx") + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 8 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 5) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1426,13 +1595,17 @@ def test_reducemin_reducemax(self): } quantizable_op_types = ["MatMul", "ReduceMin"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1490,13 +1663,17 @@ def test_reducemin_reducemax(self): } quantizable_op_types = ["MatMul", "ReduceMax"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1550,13 +1727,17 @@ def test_tile(self): } quantizable_op_types = ["MatMul", "Tile"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1571,7 +1752,9 @@ def test_centercroppad(self): matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [20, 10, 3]) matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") - centercroppad_output = onnx.helper.make_tensor_value_info("centercroppad_output", onnx.TensorProto.FLOAT, [10, 7, 3]) + centercroppad_output = onnx.helper.make_tensor_value_info( + "centercroppad_output", onnx.TensorProto.FLOAT, [10, 7, 3] + ) shape = onnx.helper.make_tensor("shape", onnx.TensorProto.INT64, [3], [10, 7, 3]) centercroppad_node = onnx.helper.make_node( "CenterCropPad", @@ -1614,13 +1797,17 @@ def test_centercroppad(self): } quantizable_op_types = ["MatMul", "CenterCropPad"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1679,13 +1866,17 @@ def test_gathernd(self): } quantizable_op_types = ["MatMul", "GatherND"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) @@ -1700,7 +1891,9 @@ def test_gatherelements(self): matmul1_output = onnx.helper.make_tensor_value_info("matmul1_output", onnx.TensorProto.FLOAT, [3, 3]) matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0") - gatherelements_output = onnx.helper.make_tensor_value_info("gatherelements_output", onnx.TensorProto.FLOAT, [2, 3]) + gatherelements_output = onnx.helper.make_tensor_value_info( + "gatherelements_output", onnx.TensorProto.FLOAT, [2, 3] + ) indices = onnx.helper.make_tensor("indices", onnx.TensorProto.INT64, [2, 3], [-1, -2, 0, -2, 0, 0]) gathernd_node = onnx.helper.make_node( "GatherElements", @@ -1744,13 +1937,17 @@ def test_gatherelements(self): } quantizable_op_types = ["MatMul", "GatherElements"] q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types) - self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6) + self.assertEqual( + collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6 + ) self.assertEqual(collections.Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4) session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"]) self.assertIsNotNone(session) diff --git a/test/quantization/post_training_quant/test_quant_utils.py b/test/quantization/post_training_quant/test_quant_utils.py index 19df570bc..6fce47d7c 100644 --- a/test/quantization/post_training_quant/test_quant_utils.py +++ b/test/quantization/post_training_quant/test_quant_utils.py @@ -2,6 +2,7 @@ import numpy as np import onnx + from onnx_neural_compressor.algorithms import utility as quant_utils @@ -18,7 +19,6 @@ def test_4bit_quant_tensor(self): data = np.random.random((100, 32)) q_data, scale, zp = quant_utils.quant_tensor(data) - def test_quant_dequant_data(self): data = np.random.random((100, 32)) qrange = quant_utils.get_qmin_qmax_for_qType( @@ -65,4 +65,4 @@ def test_quant_dequant_data(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 72cb735eb..051b6cd73 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -22,10 +22,9 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import quantization from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, data_reader +from onnx_neural_compressor import config, data_reader, quantization from onnx_neural_compressor.quantization import tuning from typing import Callable, Dict, List, Optional, Union # isort: skip @@ -458,9 +457,8 @@ def test_skip_verified_config_mapping(self, mock_warning): ) call_args_list = mock_warning.call_args_list # There may be multiple calls to warning, so we need to check all of them - self.assertIn( - "Skip the verified config mapping.", [info[0][0] for info in call_args_list] - ) + self.assertIn("Skip the verified config mapping.", [info[0][0] for info in call_args_list]) + if __name__ == "__main__": unittest.main() diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index b5d92258c..ec9411b45 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -5,12 +5,11 @@ import numpy as np import onnx -from onnx_neural_compressor import quantization -from onnx_neural_compressor.quantization import tuning from optimum.exporters.onnx import main_export -from onnx_neural_compressor import config, logger, utility +from onnx_neural_compressor import config, logger, quantization, utility from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import tuning def find_onnx_file(folder_path): @@ -68,7 +67,7 @@ def setUp(self): # print the test name logger.info(f"Running TestQuantizationConfig test: {self.id()}") - def _check_node_is_quantized(self, model, node_name, bits): + def _check_node_is_quantized(self, model, node_name): for node in model.graph.node: if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ "MatMulNBits", @@ -164,7 +163,6 @@ def test_dynamic_custom_quant_config(self): self.assertEqual(len(config_loader.config_set), 2) - def test_static_quant_config(self): for execution_provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "DnnlExecutionProvider"]: tuning_config = tuning.TuningConfig( @@ -185,7 +183,9 @@ def test_static_quant_config(self): else: self.assertFalse("add" in configs_mapping) if idx in [0, 1]: - self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual( + configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + ) self.assertLess(idx, 16) for execution_provider in ["TensorrtExecutionProvider"]: @@ -217,7 +217,9 @@ def test_static_quant_config(self): configs_mapping = quant_config.to_config_mapping(model_info=model_info) if "Matmul" in configs_mapping: self.assertFalse(configs_mapping["Matmul"]["per_channel"]) - self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax) + self.assertEqual( + configs_mapping["Matmul"]["calibrate_method"], quantization.CalibrationMethod.MinMax + ) if "add" in configs_mapping: self.assertEqual(configs_mapping["add"]["calibrate_method"], quantization.CalibrationMethod.MinMax) self.assertLess(idx, 16) diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 217013844..52f4bd8b3 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -19,10 +19,10 @@ import numpy as np import onnx +import onnxruntime as ort from optimum.exporters.onnx import main_export from onnx_neural_compressor import config, data_reader -import onnxruntime as ort from onnx_neural_compressor.quantization import QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import quantize @@ -113,13 +113,17 @@ def test_sq_with_ort_like_api(self): def test_smooth_quant_args(self): self.data_reader.rewind() - sq_config = config.SmoothQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, alpha="auto") + sq_config = config.SmoothQuantConfig( + weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, alpha="auto" + ) model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader) num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) self.assertEqual(num_muls, 30) self.data_reader.rewind() - sq_config = config.SmoothQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, scales_per_op=False) + sq_config = config.SmoothQuantConfig( + weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, scales_per_op=False + ) model = algos.smooth_quant_entry(self.gptj, sq_config, self.data_reader) num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) self.assertEqual(num_muls, 15) @@ -127,20 +131,23 @@ def test_smooth_quant_args(self): sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED sess_options.optimized_model_filepath = "Optimized_model.onnx" - sess = ort.InferenceSession(self.gptj, - sess_options, - providers=["CPUExecutionProvider"]) + sess = ort.InferenceSession(self.gptj, sess_options, providers=["CPUExecutionProvider"]) self.data_reader.rewind() - sq_config = config.SmoothQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=True, scales_per_op=False) + sq_config = config.SmoothQuantConfig( + weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=True, scales_per_op=False + ) model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader) num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) self.assertEqual(num_muls, 10) self.data_reader.rewind() - sq_config = config.SmoothQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=False, scales_per_op=False) + sq_config = config.SmoothQuantConfig( + weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8, folding=False, scales_per_op=False + ) model = algos.smooth_quant_entry("Optimized_model.onnx", sq_config, self.data_reader) num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) self.assertEqual(num_muls, 15) + if __name__ == "__main__": unittest.main() diff --git a/test/utils/test_general.py b/test/utils/test_general.py index b4140e7f2..32cb80087 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -192,7 +192,10 @@ def test_api(self): self.assertEqual(fake_default_config.weight_dtype, "int") config_set = get_all_config_set() self.assertEqual(len(config_set), len(config.config_registry.get_all_config_cls())) - self.assertEqual([i for i in config_set if getattr(i, "name", "None") == FAKE_CONFIG_NAME][0].weight_bits, DEFAULT_WEIGHT_BITS) + self.assertEqual( + [i for i in config_set if getattr(i, "name", "None") == FAKE_CONFIG_NAME][0].weight_bits, + DEFAULT_WEIGHT_BITS, + ) def test_config_expand_complex_tunable_type(self): target_op_type_list_options = [["Conv", "Gemm"], ["Conv", "Matmul"]] @@ -251,9 +254,10 @@ def test_config_loader_skip_verified_config(self) -> None: config_set = [FakeAlgoConfig(weight_bits=[4, 8]), FakeAlgoConfig(weight_bits=8)] config_loader = tuning.ConfigLoader(config_set) config_count = 0 - for i, config in enumerate(config_loader): + for i, _ in enumerate(config_loader): config_count += 1 self.assertEqual(config_count, 2) + if __name__ == "__main__": unittest.main() diff --git a/test/utils/test_utility.py b/test/utils/test_utility.py index fa7a4812f..50ce620b9 100644 --- a/test/utils/test_utility.py +++ b/test/utils/test_utility.py @@ -17,26 +17,6 @@ def test_set_random_seed(self): with self.assertRaises(AssertionError): utility.set_random_seed(seed) - def test_set_workspace(self): - workspace = "/path/to/workspace" - utility.set_workspace(workspace) - self.assertEqual(utility.options.workspace, workspace) - - # non String type - workspace = 12345 - with self.assertRaises(AssertionError): - utility.set_workspace(workspace) - - def test_set_resume_from(self): - resume_from = "/path/to/resume" - utility.set_resume_from(resume_from) - self.assertEqual(utility.options.resume_from, resume_from) - - # non String type - resume_from = 12345 - with self.assertRaises(AssertionError): - utility.set_resume_from(resume_from) - class TestCPUInfo(unittest.TestCase):