Skip to content

Commit

Permalink
Fix example and unit test (UT) issues
Browse files: browse the repository at this point in the history
Signed-off-by: Mengni Wang <[email protected]>
  • Loading branch information
mengniwang95 committed Jun 27, 2024
1 parent f569dbb commit 509a4ae
Show file tree
Hide file tree
Showing 15 changed files with 59 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from torch.nn import functional
from torch.utils import data

from onnx_neural_compressor import data_reader, logger, utility
from onnx_neural_compressor import data_reader
from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning

logging.basicConfig(
Expand Down Expand Up @@ -315,10 +315,6 @@ def rewind(self):


if __name__ == "__main__":
utility.set_workspace(args.workspace)
if not os.path.exists(args.workspace):
os.mkdir(args.workspace)

if args.benchmark:
if args.mode == "performance":
benchmark(args.model_path)
Expand All @@ -331,23 +327,11 @@ def rewind(self):
model_name = "model.onnx" # require optimum >= 1.14.0
model_path = os.path.join(args.model_path, model_name)

# do graph optimization
logger.info("Start graph optimization...")
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx")
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data"
)
sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024")
sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
logger.info("Graph optimization done.")

best_model = None
if args.algorithm.upper() == "RTN":
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig()
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
sess_options.optimized_model_filepath,
model_path,
n_bits=4,
block_size=32,
is_symmetric=True,
Expand All @@ -362,7 +346,7 @@ def rewind(self):
calibration_data_reader=calibration_data_reader, enable_mse_search=False
)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
sess_options.optimized_model_filepath,
model_path,
n_bits=4,
block_size=32,
is_symmetric=True,
Expand All @@ -377,7 +361,7 @@ def rewind(self):
calibration_data_reader=calibration_data_reader,
)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
sess_options.optimized_model_filepath,
model_path,
n_bits=4,
block_size=32,
is_symmetric=False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def _collect_data(inputs):
node_name = name_to_node[node_output_names[output_idx]]
if node_output_names[output_idx] not in name_to_calibrator:
calib_method = (
q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else 0
q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else "MinMax"
)
assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format(
calib_method
Expand Down Expand Up @@ -389,7 +389,7 @@ def get_weight_tensors_calib_range(self):
os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else ""
),
)
_calibrator = calibrator.CALIBRATOR[0]() # use minmax method to calibrate initializer tensors
_calibrator = calibrator.CALIBRATOR["MinMax"]() # use minmax method to calibrate initializer tensors
if initializer_tensor.flatten().size > 0:
_calibrator.collect(initializer_tensor)
weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def calib_range(self):
return self._calib_min, self._calib_max


@calib_registry(calib_method=0)
@calib_registry(calib_method="MinMax")
class MinMaxCalibrator(CalibratorBase):
"""MinMax calibrator class."""

Expand Down Expand Up @@ -109,7 +109,7 @@ def method_name(self):
return "MinMax"


@calib_registry(calib_method=2)
@calib_registry(calib_method="Percentile")
class PercentileCalibrator(CalibratorBase):
"""Percentile calibrator class.
Expand Down Expand Up @@ -163,7 +163,7 @@ def method_name(self):
return "Percentile"


@calib_registry(calib_method=1)
@calib_registry(calib_method="Entropy")
class EntropyCalibrator(CalibratorBase):
"""Entropy calibrator class.
Expand Down
24 changes: 11 additions & 13 deletions onnx_neural_compressor/algorithms/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False): # noqa: N802

def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None):
"""Quantize numpy array."""
q_weight = np.empty_like(np.asarray(arr), dtype=scale.dtype)
q_weight = np.empty_like(np.asarray(arr), dtype=np.asarray(scale).dtype)
np.divide(arr, scale, out=q_weight)
np.add(q_weight, zero_point, out=q_weight)
np.round(q_weight, out=q_weight)
Expand Down Expand Up @@ -340,9 +340,8 @@ def make_matmul_weight_only_node(
op_type = "MatMulNBits"

# pack quantized weight
for i in range(q_weight.shape[0]):
for k in range(0, group_size, 2):
packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4
q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
packed[:, :] = q_weight_pairs[:, :blob_size]
packed = np.reshape(packed, (-1, k_blocks, blob_size))

# build scale tensor
Expand All @@ -363,15 +362,14 @@ def make_matmul_weight_only_node(
packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
else:
packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
for i in range(zero_point.shape[0] // k_blocks):
for j in range(k_blocks):
idx = i * k_blocks + j
zp = zero_point[idx]
packed_zp[idx // 2] = (
((packed_zp[idx // 2] & 0x0F) | (zp << 4))
if (idx & 1)
else ((packed_zp[idx // 2] & 0xF0) | zp)
)
# create an index array
idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
# separate odd and even indices
even_idx = idx[::2]
odd_idx = idx[1::2]
# vectorized operation for even and odd indices
packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)

zp_tensor = onnx.helper.make_tensor(
name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
Expand Down
2 changes: 0 additions & 2 deletions onnx_neural_compressor/onnx_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,12 @@ def __init__(self, model, **kwargs):
model (str or ModelProto): path to onnx model or loaded ModelProto model object.
"""
self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False)

self._model_path = None if not isinstance(model, str) else model
self.check_is_large_model()
if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False):
logger.warning("Model size > 2GB. Please use model path instead of onnx model object to quantize")

if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True):

onnx.external_data_helper.load_external_data_for_model(self.model, os.path.dirname(self._model_path))

self._config = None
Expand Down
1 change: 1 addition & 0 deletions onnx_neural_compressor/quantization/algorithm_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def smooth_quant_entry(
pathlib.Path(tmp_dir).joinpath("smooth.onnx").as_posix(),
quant_config,
calibration_data_reader,
model_output,
)
return q_model

Expand Down
2 changes: 1 addition & 1 deletion onnx_neural_compressor/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ class OperatorConfig:
def __post_init__(self):
self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type)
self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type)
self.calibrate_method = getattr(self.calibrate_method, "value", self.calibrate_method)
self.calibrate_method = getattr(self.calibrate_method, "name", self.calibrate_method)

def __getitem__(self, key):
return getattr(self, key)
Expand Down
12 changes: 9 additions & 3 deletions onnx_neural_compressor/quantization/matmul_nbits_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,7 @@ def __init__(
):
if nodes_to_exclude is None:
nodes_to_exclude = []
self.model_path = model if isinstance(model, str) else None
self.model = model
self.model = onnx_model.ONNXModel(onnx.load(model)) if isinstance(model, str) else onnx_model.ONNXModel(model)
self.block_size = block_size
self.is_symmetric = is_symmetric
self.accuracy_level = accuracy_level
Expand Down Expand Up @@ -170,7 +168,7 @@ def _generate_nc_config(self):

def int4_quant_algo(self):
qconfig = self._generate_nc_config()
model = self.model_path or self.model
model = self.model
opt_tmp_file = tempfile.TemporaryDirectory()

# do graph optimization if not layer_wise_quant
Expand All @@ -181,12 +179,20 @@ def int4_quant_algo(self):
if not isinstance(model, str):
onnx.save(model, pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix())
model = pathlib.Path(opt_tmp_file.name).joinpath("tmp.onnx").as_posix()
logger.info("Start graph optimization...")
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = self.optimization_level
sess_options.optimized_model_filepath = pathlib.Path(opt_tmp_file.name).joinpath("opt.onnx").as_posix()
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", "opt.onnx_data"
)
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_min_size_in_bytes", "1024"
)
session = ort.InferenceSession(model, sess_options)
model = sess_options.optimized_model_filepath
del session
logger.info("Graph optimization done.")

logger.info(f"start to quantize model with {self.algorithm} algorithm...")
if self.algorithm == "RTN":
Expand Down
6 changes: 6 additions & 0 deletions onnx_neural_compressor/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def quantize(
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = optimization_level
sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix()
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", "opt.onnx_data"
)
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_min_size_in_bytes", "1024"
)
session = ort.InferenceSession(model_input, sess_options)
del session
model_input = sess_options.optimized_model_filepath
Expand Down
4 changes: 4 additions & 0 deletions onnx_neural_compressor/quantization/tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,10 @@ def autotune(
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = optimization_level
sess_options.optimized_model_filepath = pathlib.Path(tmp_folder.name).joinpath("opt.onnx").as_posix()
sess_options.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", "opt.onnx_data"
)
sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024")
session = ort.InferenceSession(model_input, sess_options)
model_input = sess_options.optimized_model_filepath
del session
Expand Down
11 changes: 2 additions & 9 deletions test/quantization/layer_wise/test_layer_wise.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,8 @@ def setUpClass(self):

model = onnx.load(model_path)
model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)
infer_shape_model_path = "llama-2-tiny-3layers-random/model-infer-shape.onnx"
onnx.save(model, infer_shape_model_path)

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx"
ort.InferenceSession(infer_shape_model_path, sess_options)

self.llama = "llama-2-tiny-3layers-random/optimized_model.onnx"
self.llama = "llama-2-tiny-3layers-random/model-infer-shape.onnx"
onnx.save(model, self.llama)
self.calibration_data_reader = DummyNLPDataloader(llama_id)

@classmethod
Expand Down
6 changes: 3 additions & 3 deletions test/quantization/post_training_quant/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class TestQuantizer(unittest.TestCase):
"per_channel": False,
"weight_sym": True,
"activation_sym": False,
"calibrate_method": quantization.CalibrationMethod.MinMax,
"calibrate_method": "MinMax",
}

@classmethod
Expand Down Expand Up @@ -622,7 +622,7 @@ def test_conv(self):
"C": [np.uint8(10.0), np.float32(0)],
"D": [np.uint8(10.0), np.float32(0)],
}
quantizable_op_types = [op]
quantizable_op_types = ["Conv"]
q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
self.assertEqual(
collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
Expand Down Expand Up @@ -653,7 +653,7 @@ def test_matmul(self):
"B": [np.uint8(10.0), np.float32(0)],
"C": [np.uint8(10.0), np.float32(0)],
}
quantizable_op_types = ["Matmul"]
quantizable_op_types = ["MatMul"]
q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
self.assertEqual(
collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _count_op_num(model, optype):
return num


class TestStaticQuant(unittest.TestCase):
class TestPostTrainingQuant(unittest.TestCase):

@classmethod
def setUpClass(self):
Expand All @@ -86,13 +86,15 @@ def setUpClass(self):
def tearDownClass(self):
shutil.rmtree("./model", ignore_errors=True)
os.remove("quant.onnx")
os.remove("quant.onnx_data")

def test_static_quant(self):
cfg = config.StaticQuantConfig(
calibration_data_reader=self.data_reader,
weight_type=quantization.QuantType.QInt8,
per_channel=True,
quant_last_matmul=True,
calibrate_method=quantization.CalibrationMethod.Entropy,
extra_options={"WeightSymmetric": True, "ActivationSymmetric": False},
execution_provider="CPUExecutionProvider",
)
Expand All @@ -103,6 +105,7 @@ def test_static_quant(self):
cfg = config.StaticQuantConfig(
calibration_data_reader=self.data_reader,
weight_type=quantization.QuantType.QInt8,
calibrate_method=quantization.CalibrationMethod.Percentile,
per_channel=True,
quant_last_matmul=False,
extra_options={"WeightSymmetric": True, "ActivationSymmetric": False},
Expand Down
6 changes: 0 additions & 6 deletions test/quantization/post_training_quant/test_quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ def test_pad_tensor(self):
pad_data = quant_utils.pad_tensor(data, group_size, k_blocks)
self.assertEqual(pad_data.shape, (k_blocks * group_size, 32))

def test_4bit_quant_tensor(self):
data = np.random.random((100, 32))
q_data, scale, zp = quant_utils.quant_tensor(data)

def test_quant_dequant_data(self):
data = np.random.random((100, 32))
qrange = quant_utils.get_qmin_qmax_for_qType(
Expand All @@ -34,7 +30,6 @@ def test_quant_dequant_data(self):

_, _, zero_point, scale, quantized_data = quant_utils.quantize_data(
data=data,
quantize_range=qrange,
qType=onnx.TensorProto.UINT8,
sym=True,
)
Expand All @@ -48,7 +43,6 @@ def test_quant_dequant_data(self):

_, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel(
data=data,
quantize_range=qrange,
qType=onnx.TensorProto.UINT8,
sym=True,
axis=1,
Expand Down
Loading

0 comments on commit 509a4ae

Please sign in to comment.