From 8e398fc271932b96cd7296409155f836df093bb6 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 24 Jul 2023 12:45:25 +0000 Subject: [PATCH 1/9] initial commit --- src/deepsparse/benchmark/benchmark_model.py | 10 ++ .../transformers/engines/nl_decoder_engine.py | 90 +++------------- src/deepsparse/transformers/utils/helpers.py | 100 ++++++++++++++++-- src/deepsparse/utils/onnx.py | 52 +++++++-- 4 files changed, 158 insertions(+), 94 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 0bef7c57ed..cee885ecfb 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -104,7 +104,9 @@ from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level from deepsparse.utils import ( + assert_model_sequence_length_one, generate_random_inputs, + has_model_kv_cache, model_to_path, override_onnx_input_shapes, parse_input_shapes, ) @@ -357,6 +359,14 @@ def benchmark_model( orig_model_path = model_path model_path = model_to_path(model_path) + + if has_model_kv_cache(model_path): + _LOGGER.info( + "Found model that contains KV cache inputs and outputs.\n" + "Enforcing `len(input_ids)=1` (simulating autoregressive inference)." + ) + model_path = assert_model_sequence_length_one(model_path) + num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index d75d051e56..410ce29df9 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -16,23 +16,23 @@ from typing import Any, Dict, List, Optional, Tuple import numpy -import onnx from transformers import AutoTokenizer from deepsparse.engine import Context from deepsparse.pipeline import DEEPSPARSE_ENGINE, create_engine from deepsparse.transformers.utils.decoder_kv_cache import DecoderKVCache -from deepsparse.transformers.utils.helpers import generate_session_id, softmax -from deepsparse.utils.onnx import translate_onnx_type_to_numpy -from sparsezoo.utils.onnx import save_onnx +from deepsparse.transformers.utils.helpers import generate_session_id +from deepsparse.transformers.utils.helpers import ( + overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, +) +from deepsparse.transformers.utils.helpers import softmax +from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME _LOGGER = logging.getLogger(__name__) __all__ = ["NLDecoderEngine"] -_CACHE_INPUT_NAME = "past_key_values" - class NLDecoderEngine: """ @@ -70,7 +70,11 @@ def __init__( # flag to indicate if the model is quantized or not self.kv_cache_data_type = None - onnx_file_path, output_indices_to_be_cached = self.overwrite_onnx_model_inputs( + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs( onnx_file_path=onnx_file_path, batch_size=engine_args.get("batch_size", 1), sequence_length=sequence_length, @@ -79,6 +83,7 @@ kv_cache_enabled = False if sum(output_indices_to_be_cached): kv_cache_enabled = True + self.kv_cache_data_type = kv_cache_data_type if use_deepsparse_cache and engine_type == DEEPSPARSE_ENGINE: # inform the engine that we are using the kv cache engine_args["cache_output_bools"] = output_indices_to_be_cached @@ -123,7 +128,7 @@ def onnx_input_names_no_cache(self) -> List[str]: return [ name
for name in self.engine.input_names - if not name.startswith(_CACHE_INPUT_NAME) + if not name.startswith(CACHE_INPUT_NAME) ] def __call__( @@ -176,67 +181,6 @@ def transfer_cache_state(self, cache: DecoderKVCache): """ self.kv_cache = copy.deepcopy(cache) - def overwrite_onnx_model_inputs( - self, - onnx_file_path: str, - sequence_length: int, - input_ids_length: int, - batch_size: int = 1, - ) -> Tuple[str, List[int]]: - """ - Enforces the appropriate input shapes for the onnx model, as well as - checks whether kv cache is enabled or not. - - :param onnx_file_path: The path to the onnx model file that will be - overwritten with the new input shapes - :param batch_size: The batch size to use for the input - :param sequence_length: The sequence length to use for the input - :param input_ids_length: The length of input_ids - :return: The path to the onnx model file that has been overwritten - with the new input shapes, as well as the indices of the inputs - that should be cached - """ - model = onnx.load(onnx_file_path, load_external_data=False) - initializer_input_names = set(node.name for node in model.graph.initializer) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] - for external_input in external_inputs: - # overwrite the batch size for all the inputs - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - - if external_input.name in ["input_ids", "positions"]: - external_input.type.tensor_type.shape.dim[ - 1 - ].dim_value = input_ids_length - elif external_input.name == "attention_mask": - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length - elif external_input.name.startswith(_CACHE_INPUT_NAME): - external_input.type.tensor_type.shape.dim[2].dim_value = ( - sequence_length - input_ids_length - ) - else: - raise ValueError( - f"Unexpected external input name: {external_input.name}" - ) - - _LOGGER.info( - "Overwriting in-place the input shapes " - f"of the transformer model at {onnx_file_path}" - ) - save_onnx(model, onnx_file_path) - - output_indices_to_be_cached = [ - 1 if inp.name.startswith("present") else 0 for inp in model.graph.output - ] - - kv_cache_elem_type = next( - inp for inp in model.graph.input if inp.name.startswith(_CACHE_INPUT_NAME) - ).type.tensor_type.elem_type - self.kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) - - return onnx_file_path, output_indices_to_be_cached - def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: """ Samples a token from the logits using the sampling temperature. 
@@ -301,7 +245,7 @@ def update_kv_cache( cache_onnx_names = [ name for name in self.engine.input_names - if name.startswith(_CACHE_INPUT_NAME) + if name.startswith(CACHE_INPUT_NAME) ] kv_cache_state = { name: array for name, array in zip(cache_onnx_names, kv_cache_state) @@ -319,7 +263,7 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: cache_engine_input_index = next( i for i, name in enumerate(self.engine.input_names) - if _CACHE_INPUT_NAME in name + if CACHE_INPUT_NAME in name ) batch_size, num_attention_heads, _, hidden_dims = self.engine.input_shapes[ cache_engine_input_index @@ -331,9 +275,9 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: ) cache_keys = [ - output_name.replace("present", _CACHE_INPUT_NAME) + output_name.replace(CACHE_OUTPUT_NAME, CACHE_INPUT_NAME) for output_name in self.engine.output_names - if output_name.startswith("present") + if output_name.startswith(CACHE_OUTPUT_NAME) ] return {key: empty_kv_cache_tensor for key in cache_keys} diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index f4e72ca665..f9df5955d0 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -11,13 +11,100 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import logging import uuid +from typing import List, Optional, Tuple import numpy +import onnx + +from deepsparse.utils.onnx import ( + CACHE_INPUT_NAME, + default_cached_outputs, + translate_onnx_type_to_numpy, +) +from sparsezoo.utils import save_onnx + + +__all__ = [ + "overwrite_onnx_model_inputs_for_kv_cache_models", + "generate_session_id", + "pad_to_fixed_length", + "softmax", +] + +_LOGGER = logging.getLogger(__name__) + + +def overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path: str, + sequence_length: int = 128, + input_ids_length: int = 1, + batch_size: int = 1, +) -> Tuple[str, List[int], Optional[numpy.dtype]]: + """ + Enforces the appropriate input shapes for the onnx model, as well as + checks whether kv cache is enabled or not. + :param onnx_file_path: The path to the onnx model file that will be + overwritten with the new input shapes + :param batch_size: The batch size to use for the input + :param sequence_length: The sequence length to use for the input + :param input_ids_length: The length of input_ids + :return: A tuple that contains: + - the path to the onnx model file that has been overwritten + with the new input shapes + - boolean list, where elements are set to True if the + corresponding model output should be cached or False + if not. + - the data type of the kv cache. 
If the model does not + use kv cache, then the data type is None + """ + model = onnx.load(onnx_file_path, load_external_data=False) + initializer_input_names = set(node.name for node in model.graph.initializer) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] + for external_input in external_inputs: + # overwrite the batch size for all the inputs + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + + if external_input.name in ["input_ids", "positions"]: + external_input.type.tensor_type.shape.dim[1].dim_value = input_ids_length + elif external_input.name == "attention_mask": + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + elif external_input.name.startswith(CACHE_INPUT_NAME): + external_input.type.tensor_type.shape.dim[2].dim_value = ( + sequence_length - input_ids_length + ) + else: + raise ValueError(f"Unexpected external input name: {external_input.name}") + + _LOGGER.info( + "Overwriting in-place the input shapes " + f"of the transformer model at {onnx_file_path}" + ) + save_onnx(model, onnx_file_path) + + output_indices_to_be_cached = default_cached_outputs(model) -__all__ = ["softmax", "generate_session_id", "pad_to_fixed_length"] + kv_cache_data_type = None + if sum(output_indices_to_be_cached): + kv_cache_elem_type = next( + inp for inp in model.graph.input if inp.name.startswith(CACHE_INPUT_NAME) + ).type.tensor_type.elem_type + kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) + + return onnx_file_path, output_indices_to_be_cached, kv_cache_data_type + + +def generate_session_id() -> str: + """ + Generate uuid for session id. This is used to + identify the kv cache session for the user + """ + session_id = str(uuid.uuid4()) + return session_id def softmax(x: numpy.ndarray) -> numpy.ndarray: @@ -36,15 +123,6 @@ def softmax(x: numpy.ndarray) -> numpy.ndarray: return numerator / denominator -def generate_session_id() -> str: - """ - Generate uuid for session id. This is used to - identify the kv cache session for the user - """ - session_id = str(uuid.uuid4()) - return session_id - - def pad_to_fixed_length( array: numpy.ndarray, max_len: int, axis: int = 0, value: int = 0 ) -> numpy.ndarray: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 89d8baf4c9..ce4c552e4f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -50,10 +50,17 @@ "truncate_onnx_model", "truncate_onnx_embedding_model", "default_cached_outputs", + "has_model_kv_cache", + "CACHE_INPUT_NAME", + "CACHE_OUTPUT_NAME", + "assert_model_sequence_length_one", ] _LOGGER = logging.getLogger(__name__) +CACHE_INPUT_NAME = "past_key_values" +CACHE_OUTPUT_NAME = "present" + @contextlib.contextmanager def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> str: @@ -475,20 +482,45 @@ def truncate_onnx_embedding_model( return output_filepath, tmp_file -def default_cached_outputs(model_path: str) -> List[bool]: +def default_cached_outputs(model: Union[str, ModelProto]) -> List[bool]: """ + Get a list of bools that indicate which outputs should be cached. + The elements that are set to True correspond to cached outputs, + the rest are set to False. + :param model_path: Path to a model - :return A list of bools that indicates caching of all outputs except the first one. + :return A list of bools that indicate which outputs should be cached. 
""" - - outputs = list(onnx.load(model_path).graph.output) + model = ( + onnx.load(model, load_external_data=False) if isinstance(model, str) else model + ) + outputs = model.graph.output assert len(outputs) > 0 - # Create a boolean list of every output of the - # model [logits, key0, value0, key1, value1, ..., keyN, valueN] - cached_outputs = [True for i in range(len(outputs))] + return [output.name.startswith(CACHE_OUTPUT_NAME) for output in outputs] + + +def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: + """ + Check whether a model has a KV cache support. - # Assume first input is logits and logits ought not to be cached - cached_outputs[0] = False + :param model_path: Path to a model or a model proto. + :return True if the model has a KV cache support, False otherwise. + """ + return bool(sum(default_cached_outputs(model))) + + +def assert_model_sequence_length_one(model_path: str) -> str: + """ + Takes a path to an onnx model and enforces that it has + static input dimensions. + + :param model_path: Path to a model. + :return: Path to the model with static input dimensions. + """ + from deepsparse.transformers.utils.helpers import ( + overwrite_onnx_model_inputs_for_kv_cache_models, + ) - return cached_outputs + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(model_path) + return onnx_file_path From 0fe9f7e79ce1b2edc720fac61690a4a8b18f61a1 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 24 Jul 2023 13:00:44 +0000 Subject: [PATCH 2/9] improve logging docstring --- src/deepsparse/benchmark/benchmark_model.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index cee885ecfb..a39b2afd02 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -361,10 +361,7 @@ def benchmark_model( model_path = model_to_path(model_path) if has_model_kv_cache(model_path): - _LOGGER.info( - "Found model that contains KV cache inputs and outputs.\n" - "Enforcing `len(input_ids)=1` (simulating autoregressive inference)." - ) + _LOGGER.info("Found model that contains KV cache support.") model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) From 55916f0dc6974b4596020d420798c427df277b83 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 25 Jul 2023 06:43:25 +0000 Subject: [PATCH 3/9] more verbose logging --- src/deepsparse/benchmark/benchmark_model.py | 5 ++++- src/deepsparse/transformers/engines/nl_decoder_engine.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index a39b2afd02..44cfc40cd7 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -361,7 +361,10 @@ def benchmark_model( model_path = model_to_path(model_path) if has_model_kv_cache(model_path): - _LOGGER.info("Found model that contains KV cache support.") + _LOGGER.info( + "Found model that contains KV cache support. " + "Benchmarking the autoregressive model." 
+ ) model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 92101496f7..dd4ed8ccaa 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -25,7 +25,7 @@ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, ) -from deepsparse.transformers.utils.helpers import softmax +from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME From fa755cb3d34babc9b295005b4eae729374ac9a01 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 1 Aug 2023 10:06:37 +0000 Subject: [PATCH 4/9] add sequence_length as variable --- src/deepsparse/benchmark/benchmark_model.py | 38 +++++++++++++++++-- .../transformers/engines/nl_decoder_engine.py | 2 +- src/deepsparse/transformers/utils/helpers.py | 5 ++- src/deepsparse/utils/onnx.py | 21 +++++++--- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 44cfc40cd7..e739f571c9 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -81,6 +81,13 @@ zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none \ --input_shapes "[1,512],[1,512],[1,512]" +########## +Example on an OPT (Large Language Model) from SparseZoo with sequence length 256: +deepsparse.benchmark \ + zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/ + opt_pretrain/pruned50_quantW8A8-none \ + --sequence_length 256 + ########## Example on local ONNX model: deepsparse.benchmark /PATH/TO/model.onnx @@ -96,7 +103,7 @@ import json import logging import os from typing import Dict, Optional from deepsparse import Scheduler, __version__, compile_model from deepsparse.benchmark.ort_engine import ORTEngine @@ -104,11 +111,11 @@ from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level from deepsparse.utils import ( - assert_model_sequence_length_one, generate_random_inputs, has_model_kv_cache, model_to_path, override_onnx_input_shapes, + overwrite_sequence_length, parse_input_shapes, ) @@ -140,6 +147,15 @@ def parse_args(): default=1, help="The batch size to run the analysis for. Must be greater than 0", ) + + parser.add_argument( + "-seq_len", + "--sequence_length", + type=int, + default=None, + help="The sequence length to run the " + "Large Language Models (LLMs) benchmarks for. Must be greater than 0", + ) parser.add_argument( "-i", "-shapes", @@ -334,6 +350,7 @@ def benchmark_model( model_path: str, batch_size: int = 1, + sequence_length: Optional[int] = None, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", @@ -361,11 +378,20 @@ model_path = model_to_path(model_path) if has_model_kv_cache(model_path): + if batch_size != 1: + raise ValueError( + "Unable to run models with KV cache support " + "for a batch size other than one. " + "Please set the batch size to 1 and try again." + ) + model_path, sequence_length = overwrite_sequence_length( + model_path=model_path, sequence_length=sequence_length + ) _LOGGER.info( "Found model that contains KV cache support. 
" "Benchmarking the autoregressive model with " + f"sequence length: {sequence_length}." ) - model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model @@ -428,6 +454,7 @@ "orig_model_path": orig_model_path, "model_path": model_path, "batch_size": batch_size, + "sequence_length": sequence_length, "input_shapes": input_shapes, "num_cores": num_cores, "scenario": scenario, @@ -453,6 +480,7 @@ def main(): result = benchmark_model( model_path=args.model_path, + sequence_length=args.sequence_length, batch_size=args.batch_size, input_shapes=args.input_shapes, num_cores=args.num_cores, @@ -469,6 +497,8 @@ # Results summary print("Original Model Path: {}".format(args.model_path)) print("Batch Size: {}".format(args.batch_size)) + if args.sequence_length is not None: + print("Sequence Length: {}".format(args.sequence_length)) print("Scenario: {}".format(args.scenario)) print( "Throughput (items/sec): {:.4f}".format( diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index ef90101ded..0860b66a14 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -69,7 +69,6 @@ ): # flag to indicate if the model is quantized or not self.kv_cache_data_type = None - ( onnx_file_path, output_indices_to_be_cached, kv_cache_data_type, @@ -80,6 +79,7 @@ sequence_length=sequence_length, input_ids_length=input_ids_length, ) + kv_cache_enabled = False if sum(output_indices_to_be_cached): kv_cache_enabled = True diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 2090258577..c771d27aa3 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -77,6 +77,9 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( external_input.type.tensor_type.shape.dim[2].dim_value = ( sequence_length - input_ids_length ) + elif external_input.name.startswith("causal_mask"): + external_input.type.tensor_type.shape.dim[2].dim_value = input_ids_length + external_input.type.tensor_type.shape.dim[3].dim_value = sequence_length else: raise ValueError(f"Unexpected external input name: {external_input.name}") @@ -89,7 +92,7 @@ output_indices_to_be_cached = default_cached_outputs(model) kv_cache_data_type = None - if sum(output_indices_to_be_cached): + if any(output_indices_to_be_cached): kv_cache_elem_type = next( inp for inp in model.graph.input if inp.name.startswith(CACHE_INPUT_NAME) ).type.tensor_type.elem_type kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index ce4c552e4f..dc9fd675d2 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -53,7 +53,7 @@ "has_model_kv_cache", "CACHE_INPUT_NAME", "CACHE_OUTPUT_NAME", - "assert_model_sequence_length_one", + "overwrite_sequence_length", ] _LOGGER = logging.getLogger(__name__) @@ -507,20 +507,31 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: """ Check whether a model has KV cache support. :param model: Path to a model or a model proto. :return True if the model has KV cache support, False otherwise.
""" - return bool(sum(default_cached_outputs(model))) + return bool(any(default_cached_outputs(model))) -def assert_model_sequence_length_one(model_path: str) -> str: +def overwrite_sequence_length( + model_path: str, sequence_length: Optional[int] = None +) -> str: """ Takes a path to an onnx model and enforces that it has static input dimensions. :param model_path: Path to a model. + :param sequence_length: The sequence length to overwrite the model with. :return: Path to the model with static input dimensions. """ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models, ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(model_path) - return onnx_file_path + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, sequence_length=sequence_length + ) + attention_input_info = [ + input + for input in onnx.load(onnx_file_path, load_external_data=False).graph.input + if "attention" in input.name + ][0] + sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value + return onnx_file_path, sequence_length From 709853d6c733d002f847f521bb667bab359951f1 Mon Sep 17 00:00:00 2001 From: Luka Govedic Date: Tue, 8 Aug 2023 14:20:23 -0400 Subject: [PATCH 5/9] fixed type annotations and avoided overwriting inputs when no sequence_length is passed --- src/deepsparse/transformers/utils/helpers.py | 2 +- src/deepsparse/utils/onnx.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index c771d27aa3..09eca2ab07 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -41,7 +41,7 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( sequence_length: int = 128, input_ids_length: int = 1, batch_size: int = 1, -) -> Tuple[str, List[int], Optional[numpy.dtype]]: +) -> Tuple[str, List[bool], Optional[numpy.dtype]]: """ Enforces the appropriate input shapes for the onnx model, as well as checks whether kv cache is enabled or not. diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index dc9fd675d2..f19d06c3ea 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -512,7 +512,7 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: def overwrite_sequence_length( model_path: str, sequence_length: Optional[int] = None -) -> str: +) -> Tuple[str, int]: """ Takes a path to an onnx model and enforces that it has static input dimensions. 
@@ -525,9 +525,13 @@ def overwrite_sequence_length( overwrite_onnx_model_inputs_for_kv_cache_models, ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=model_path, sequence_length=sequence_length - ) + if sequence_length is not None: + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, sequence_length=sequence_length + ) + else: + onnx_file_path = model_path + attention_input_info = [ input for input in onnx.load(onnx_file_path, load_external_data=False).graph.input From 7f7bf8300b54e8b3fe847be46e45dcdfe84d365a Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 15:37:48 +0000 Subject: [PATCH 6/9] fix bad merge --- src/deepsparse/benchmark/benchmark_model.py | 1 - .../transformers/engines/nl_decoder_engine.py | 16 ++++++---------- src/deepsparse/transformers/utils/helpers.py | 13 +------------ src/deepsparse/utils/onnx.py | 5 ++--- 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 8a5aede1d7..4a81b7bbb1 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -102,7 +102,6 @@ import importlib import json import logging -import os from typing import Dict, Optional from deepsparse import __version__, compile_model diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 51bab4bbc6..43c4d9c5de 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -24,12 +24,8 @@ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, ) -from deepsparse.transformers.utils.helpers import ( - generate_session_id, - overwrite_onnx_model_inputs, -) from deepsparse.utils.data import numpy_softmax -from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME +from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX _LOGGER = logging.getLogger(__name__) @@ -132,7 +128,7 @@ def onnx_input_names_no_cache(self) -> List[str]: return [ name for name in self.engine.input_names - if not name.startswith(CACHE_INPUT_NAME) + if not name.startswith(CACHE_INPUT_PREFIX) ] @property @@ -287,7 +283,7 @@ def update_kv_cache( cache_onnx_names = [ name for name in self.engine.input_names - if name.startswith(CACHE_INPUT_NAME) + if name.startswith(CACHE_INPUT_PREFIX) ] kv_cache_state = { name: array for name, array in zip(cache_onnx_names, kv_cache_state) @@ -305,7 +301,7 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: cache_engine_input_index = next( i for i, name in enumerate(self.engine.input_names) - if CACHE_INPUT_NAME in name + if CACHE_INPUT_PREFIX in name ) batch_size, num_attention_heads, _, hidden_dims = self.engine.input_shapes[ cache_engine_input_index @@ -317,9 +313,9 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: ) cache_keys = [ - output_name.replace(CACHE_OUTPUT_NAME, CACHE_INPUT_NAME) + output_name.replace(CACHE_OUTPUT_PREFIX, CACHE_INPUT_PREFIX) for output_name in self.engine.output_names - if output_name.startswith(CACHE_OUTPUT_NAME) + if output_name.startswith(CACHE_OUTPUT_PREFIX) ] return {key: empty_kv_cache_tensor for key in cache_keys} diff --git a/src/deepsparse/transformers/utils/helpers.py 
b/src/deepsparse/transformers/utils/helpers.py index 4ba4956648..f5f0b560cb 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -11,24 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging - import logging import uuid -from typing import List, Optional, Tuple, Union from typing import List, Tuple, Union import numpy import onnx -from deepsparse.utils.onnx import ( - CACHE_INPUT_NAME, - default_cached_outputs, - translate_onnx_type_to_numpy, -) -from sparsezoo.utils import save_onnx -import onnx - from deepsparse.utils.onnx import translate_onnx_type_to_numpy from sparsezoo.utils import save_onnx @@ -43,7 +32,7 @@ _LOGGER = logging.getLogger(__name__) -def overwrite_onnx_model_inputs( +def overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path: str, sequence_length: int, input_ids_length: int, diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index bf3f028df8..9c0801e2e3 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -51,8 +51,6 @@ "truncate_onnx_embedding_model", "default_cached_outputs", "has_model_kv_cache", - "CACHE_INPUT_NAME", - "CACHE_OUTPUT_NAME", "overwrite_sequence_length", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", ] @@ -499,6 +497,7 @@ def default_cached_outputs(model_path: str) -> List[bool]: return [name.startswith(CACHE_OUTPUT_PREFIX) for name in output_names] + def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: """ Check whether a model has KV cache support. @@ -537,4 +536,4 @@ def overwrite_sequence_length( if "attention" in input.name ][0] sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value - return onnx_file_path, sequence_length \ No newline at end of file + return onnx_file_path, sequence_length From 652439481354249e59460ae346e4733850bb85ad Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:17:37 +0000 Subject: [PATCH 7/9] tested --- src/deepsparse/benchmark/benchmark_model.py | 44 +++++++++++++++------ src/deepsparse/utils/onnx.py | 35 ++++++++-------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 4a81b7bbb1..ae9256b4aa 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -82,11 +82,12 @@ --input_shapes "[1,512],[1,512],[1,512]" ########## -Example on an OPT (Large Language Model) from SparseZoo with sequence length 256: +Example on CodeGen (a model with KV cache support) +from SparseZoo with input_ids_length 10 and sequence length 256: deepsparse.benchmark \ - zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/ - opt_pretrain/pruned50_quantW8A8-none \ - --sequence_length 256 + zoo:nlg/text_generation/codegen_mono-350m/pytorch/ + huggingface/bigpython_bigquery_thepile/pruned50-none + --input_ids_length 10 --sequence_length 256 ########## Example on local ONNX model: deepsparse.benchmark /PATH/TO/model.onnx @@ -120,7 +121,7 @@ has_model_kv_cache, model_to_path, override_onnx_input_shapes, - overwrite_sequence_length, + overwrite_cache_model_inputs, parse_input_shapes, ) @@ -157,9 +158,20 @@ def parse_args(): "-seq_len", "--sequence_length", type=int, - default=None, + default=128, help="The sequence length to run the " - "Large Language Models (LLMs) benchmarks for. 
Must be greater than 0", + "KV cache supported model benchmarks for. " + "Must be greater than 0, default is 128", + ) + + parser.add_argument( + "-input_ids_len", + "--input_ids_length", + type=int, + default=1, + help="The input ids length to run the " + "KV cache supported model benchmarks for. " + "Must be greater than 0, default is 1", ) parser.add_argument( "-i", @@ -284,6 +296,7 @@ def benchmark_model( model_path: str, batch_size: int = 1, sequence_length: Optional[int] = None, + input_ids_length: Optional[int] = None, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", @@ -317,15 +330,20 @@ def benchmark_model( "for batch size different than one." "Please set batch size to 1 and try again" ) - model_path, sequence_length = overwrite_sequence_length( - model_path=model_path, sequence_length=sequence_length - ) + _LOGGER.info( - "Found model that contains KV cache support. " + "Found model with KV cache support. " "Benchmarking the autoregressive model with " + f"input_ids_length: {input_ids_length} and " f"sequence length: {sequence_length}." ) + model_path = overwrite_cache_model_inputs( + model_path=model_path, + input_ids_length=input_ids_length, + sequence_length=sequence_length, + ) + num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model @@ -388,6 +406,7 @@ def benchmark_model( "model_path": model_path, "batch_size": batch_size, "sequence_length": sequence_length, + "input_ids_length": input_ids_length, "input_shapes": input_shapes, "num_cores": num_cores, "scenario": scenario, @@ -414,6 +433,7 @@ def main(): result = benchmark_model( model_path=args.model_path, sequence_length=args.sequence_length, + input_ids_length=args.input_ids_length, batch_size=args.batch_size, input_shapes=args.input_shapes, num_cores=args.num_cores, @@ -432,6 +452,8 @@ def main(): print("Batch Size: {}".format(args.batch_size)) if args.sequence_length is not None: print("Sequence Length: {}".format(args.sequence_length)) + if args.input_ids_length is not None: + print("Input IDs Length: {}".format(args.input_ids_length)) print("Scenario: {}".format(args.scenario)) print( "Throughput (items/sec): {:.4f}".format( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 9c0801e2e3..b740fad5ff 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -51,7 +51,7 @@ "truncate_onnx_embedding_model", "default_cached_outputs", "has_model_kv_cache", - "overwrite_sequence_length", + "overwrite_cache_model_inputs", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", ] @@ -508,14 +508,17 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: return bool(any(default_cached_outputs(model))) -def overwrite_sequence_length( - model_path: str, sequence_length: Optional[int] = None +def overwrite_cache_model_inputs( + model_path: str, + input_ids_length: int, + sequence_length: int, ) -> Tuple[str, int]: """ Takes a path to an onnx model and enforces that it has static input dimensions. :param model_path: Path to a model. + :param input_ids_length: The input_ids length to overwrite the model with. :param sequence_length: The sequence length to overwrite the model with. :return: Path to the model with static input dimensions. 
""" @@ -523,17 +526,15 @@ def overwrite_sequence_length( overwrite_onnx_model_inputs_for_kv_cache_models, ) - if sequence_length is not None: - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=model_path, sequence_length=sequence_length - ) - else: - onnx_file_path = model_path - - attention_input_info = [ - input - for input in onnx.load(onnx_file_path, load_external_data=False).graph.input - if "attention" in input.name - ][0] - sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value - return onnx_file_path, sequence_length + assert input_ids_length < sequence_length, ( + f"input_ids_length {input_ids_length} " + f"must be less than sequence_length {sequence_length}" + ) + + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + return onnx_file_path From 27dbf42287e3ef655beee9c75c14251a92143258 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:20:11 +0000 Subject: [PATCH 8/9] update defaults --- src/deepsparse/benchmark/benchmark_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index ae9256b4aa..4554f04317 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -103,7 +103,7 @@ import importlib import json import logging -from typing import Dict, Optional +from typing import Dict from deepsparse import __version__, compile_model from deepsparse.benchmark.helpers import ( @@ -158,10 +158,10 @@ def parse_args(): "-seq_len", "--sequence_length", type=int, - default=128, + default=2048, help="The sequence length to run the " "KV cache supported model benchmarks for. " - "Must be greater than 0, default is 128", + "Must be greater than 0, default is 2048", ) parser.add_argument( @@ -295,8 +295,8 @@ def load_custom_engine(custom_engine_identifier: str): def benchmark_model( model_path: str, batch_size: int = 1, - sequence_length: Optional[int] = None, - input_ids_length: Optional[int] = None, + sequence_length: int = 2048, + input_ids_length: int = 1, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", From 8ab1c87cbb3307242b46b476c886509cdbc11128 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:51:22 +0000 Subject: [PATCH 9/9] address Luka comments --- src/deepsparse/benchmark/benchmark_model.py | 2 +- .../transformers/engines/nl_decoder_engine.py | 6 +++--- src/deepsparse/transformers/utils/helpers.py | 4 ++-- src/deepsparse/utils/onnx.py | 19 +++++++++++++++---- .../pipelines/test_text_generation.py | 4 ++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 4554f04317..aa350fb474 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -338,7 +338,7 @@ def benchmark_model( f"sequence length: {sequence_length}." 
) - model_path = overwrite_cache_model_inputs( + model_path, _, _ = overwrite_cache_model_inputs( model_path=model_path, input_ids_length=input_ids_length, sequence_length=sequence_length, diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 43c4d9c5de..30176b3b10 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -20,9 +20,9 @@ from deepsparse.engine import Context from deepsparse.pipeline import DEEPSPARSE_ENGINE, create_engine from deepsparse.transformers.utils.decoder_kv_cache import DecoderKVCache -from deepsparse.transformers.utils.helpers import generate_session_id from deepsparse.transformers.utils.helpers import ( - overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, + generate_session_id, + overwrite_onnx_model_inputs_for_kv_cache_models, ) from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX @@ -72,7 +72,7 @@ def __init__( onnx_file_path, output_indices_to_be_cached, kv_cache_data_type, - ) = overwrite_onnx_model_inputs( + ) = overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path=onnx_file_path, batch_size=engine_args.get("batch_size", 1), sequence_length=sequence_length, diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index f5f0b560cb..5fb0f3c1c5 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import uuid -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy import onnx @@ -37,7 +37,7 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( sequence_length: int, input_ids_length: int, batch_size: int = 1, -) -> Tuple[str, List[int]]: +) -> Tuple[str, List[int], Optional[int]]: """ Enforces the appropriate input shapes for the onnx model, as well as checks whether kv cache is enabled or not. diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index b740fad5ff..24d2734d73 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -512,7 +512,7 @@ def overwrite_cache_model_inputs( model_path: str, input_ids_length: int, sequence_length: int, -) -> Tuple[str, int]: +) -> Tuple[str, List[int], Optional[int]]: """ Takes a path to an onnx model and enforces that it has static input dimensions. @@ -520,7 +520,14 @@ def overwrite_cache_model_inputs( :param model_path: Path to a model. :param input_ids_length: The input_ids length to overwrite the model with. :param sequence_length: The sequence length to overwrite the model with. - :return: Path to the model with static input dimensions. + :return: A tuple that contains: + - the path to the onnx model file that has been overwritten + with the new input shapes + - boolean list, where elements are set to True if the + corresponding model output should be cached or False + if not. + - the data type of the kv cache. 
If the model does not + use kv cache, then the data type is None """ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models, @@ -531,10 +538,14 @@ def overwrite_cache_model_inputs( f"must be less than sequence_length {sequence_length}" ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path=model_path, sequence_length=sequence_length, input_ids_length=input_ids_length, ) - return onnx_file_path + return onnx_file_path, output_indices_to_be_cached, kv_cache_data_type diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index b9569d9f0d..1be380542a 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -22,7 +22,7 @@ from deepsparse import Pipeline from deepsparse.transformers.utils.helpers import ( create_causal_mask, - overwrite_onnx_model_inputs, + overwrite_onnx_model_inputs_for_kv_cache_models, ) from deepsparse.utils.onnx import CACHE_INPUT_PREFIX from sparsezoo import Model @@ -216,7 +216,7 @@ def _get_cache_state_ort_kv_cache(model_onnx_path, sequence, model_name): # setup model and session # (run full sequence inference) - overwrite_onnx_model_inputs( + overwrite_onnx_model_inputs_for_kv_cache_models( model_onnx_path, sequence_length=128, input_ids_length=128 ) sess = onnxruntime.InferenceSession(model_onnx_path)
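A minimal usage sketch of the API this patch series converges on (the PATCH 9/9 state). The model path below is a placeholder and the local variable names are illustrative, not part of the diffs:

from deepsparse.utils import (
    has_model_kv_cache,
    model_to_path,
    overwrite_cache_model_inputs,
)

# Placeholder path; any ONNX model whose outputs include `present*` KV cache
# tensors (paired with `past_key_values*` inputs) is detected as a KV cache model.
model_path = model_to_path("/PATH/TO/model.onnx")

if has_model_kv_cache(model_path):
    # Freeze the dynamic dimensions before compiling: input_ids_length tokens
    # are processed per forward pass against a sequence_length-token context.
    # input_ids_length must be strictly less than sequence_length.
    model_path, cached_output_mask, kv_cache_dtype = overwrite_cache_model_inputs(
        model_path=model_path,
        input_ids_length=1,
        sequence_length=2048,
    )

The same path is exercised from the CLI with `deepsparse.benchmark /PATH/TO/model.onnx --input_ids_length 1 --sequence_length 2048`; after PATCH 8/9 those values are also the flag defaults.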