From 8e398fc271932b96cd7296409155f836df093bb6 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 24 Jul 2023 12:45:25 +0000 Subject: [PATCH 1/9] initial commit --- src/deepsparse/benchmark/benchmark_model.py | 10 ++ .../transformers/engines/nl_decoder_engine.py | 90 +++------------- src/deepsparse/transformers/utils/helpers.py | 100 ++++++++++++++++-- src/deepsparse/utils/onnx.py | 52 +++++++-- 4 files changed, 158 insertions(+), 94 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 0bef7c57ed..cee885ecfb 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -104,7 +104,9 @@ from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level from deepsparse.utils import ( + assert_model_sequence_length_one, generate_random_inputs, + has_model_kv_cache, model_to_path, override_onnx_input_shapes, parse_input_shapes, ) @@ -357,6 +359,14 @@ def benchmark_model( orig_model_path = model_path model_path = model_to_path(model_path) + + if has_model_kv_cache(model_path): + _LOGGER.info( + "Found model that contains KV cache inputs and outputs.\n" + "Enforcing `len(input_ids)=1` (simulating autoregressive inference)." + ) + model_path = assert_model_sequence_length_one(model_path) + num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index d75d051e56..410ce29df9 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -16,23 +16,23 @@ from typing import Any, Dict, List, Optional, Tuple import numpy -import onnx from transformers import AutoTokenizer from deepsparse.engine import Context from deepsparse.pipeline import DEEPSPARSE_ENGINE, create_engine from deepsparse.transformers.utils.decoder_kv_cache import DecoderKVCache -from deepsparse.transformers.utils.helpers import generate_session_id, softmax -from deepsparse.utils.onnx import translate_onnx_type_to_numpy -from sparsezoo.utils.onnx import save_onnx +from deepsparse.transformers.utils.helpers import generate_session_id +from deepsparse.transformers.utils.helpers import ( + overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, +) +from deepsparse.transformers.utils.helpers import softmax +from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME _LOGGER = logging.getLogger(__name__) __all__ = ["NLDecoderEngine"] -_CACHE_INPUT_NAME = "past_key_values" - class NLDecoderEngine: """ @@ -70,7 +70,11 @@ def __init__( # flag to indicate if the model is quantized or not self.kv_cache_data_type = None - onnx_file_path, output_indices_to_be_cached = self.overwrite_onnx_model_inputs( + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs( onnx_file_path=onnx_file_path, batch_size=engine_args.get("batch_size", 1), sequence_length=sequence_length, @@ -79,6 +83,7 @@ kv_cache_enabled = False if sum(output_indices_to_be_cached): kv_cache_enabled = True + self.kv_cache_data_type = kv_cache_data_type if use_deepsparse_cache and engine_type == DEEPSPARSE_ENGINE: # inform the engine that we are using the kv cache engine_args["cache_output_bools"] = output_indices_to_be_cached @@ -123,7 +128,7 @@ def onnx_input_names_no_cache(self) -> List[str]: return [ name
for name in self.engine.input_names - if not name.startswith(_CACHE_INPUT_NAME) + if not name.startswith(CACHE_INPUT_NAME) ] def __call__( @@ -176,67 +181,6 @@ def transfer_cache_state(self, cache: DecoderKVCache): """ self.kv_cache = copy.deepcopy(cache) - def overwrite_onnx_model_inputs( - self, - onnx_file_path: str, - sequence_length: int, - input_ids_length: int, - batch_size: int = 1, - ) -> Tuple[str, List[int]]: - """ - Enforces the appropriate input shapes for the onnx model, as well as - checks whether kv cache is enabled or not. - - :param onnx_file_path: The path to the onnx model file that will be - overwritten with the new input shapes - :param batch_size: The batch size to use for the input - :param sequence_length: The sequence length to use for the input - :param input_ids_length: The length of input_ids - :return: The path to the onnx model file that has been overwritten - with the new input shapes, as well as the indices of the inputs - that should be cached - """ - model = onnx.load(onnx_file_path, load_external_data=False) - initializer_input_names = set(node.name for node in model.graph.initializer) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] - for external_input in external_inputs: - # overwrite the batch size for all the inputs - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - - if external_input.name in ["input_ids", "positions"]: - external_input.type.tensor_type.shape.dim[ - 1 - ].dim_value = input_ids_length - elif external_input.name == "attention_mask": - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length - elif external_input.name.startswith(_CACHE_INPUT_NAME): - external_input.type.tensor_type.shape.dim[2].dim_value = ( - sequence_length - input_ids_length - ) - else: - raise ValueError( - f"Unexpected external input name: {external_input.name}" - ) - - _LOGGER.info( - "Overwriting in-place the input shapes " - f"of the transformer model at {onnx_file_path}" - ) - save_onnx(model, onnx_file_path) - - output_indices_to_be_cached = [ - 1 if inp.name.startswith("present") else 0 for inp in model.graph.output - ] - - kv_cache_elem_type = next( - inp for inp in model.graph.input if inp.name.startswith(_CACHE_INPUT_NAME) - ).type.tensor_type.elem_type - self.kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) - - return onnx_file_path, output_indices_to_be_cached - def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: """ Samples a token from the logits using the sampling temperature. 
@@ -301,7 +245,7 @@ def update_kv_cache( cache_onnx_names = [ name for name in self.engine.input_names - if name.startswith(_CACHE_INPUT_NAME) + if name.startswith(CACHE_INPUT_NAME) ] kv_cache_state = { name: array for name, array in zip(cache_onnx_names, kv_cache_state) @@ -319,7 +263,7 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: cache_engine_input_index = next( i for i, name in enumerate(self.engine.input_names) - if _CACHE_INPUT_NAME in name + if CACHE_INPUT_NAME in name ) batch_size, num_attention_heads, _, hidden_dims = self.engine.input_shapes[ cache_engine_input_index @@ -331,9 +275,9 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: ) cache_keys = [ - output_name.replace("present", _CACHE_INPUT_NAME) + output_name.replace(CACHE_OUTPUT_NAME, CACHE_INPUT_NAME) for output_name in self.engine.output_names - if output_name.startswith("present") + if output_name.startswith(CACHE_OUTPUT_NAME) ] return {key: empty_kv_cache_tensor for key in cache_keys} diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index f4e72ca665..f9df5955d0 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -11,13 +11,100 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import logging import uuid +from typing import List, Optional, Tuple import numpy +import onnx + +from deepsparse.utils.onnx import ( + CACHE_INPUT_NAME, + default_cached_outputs, + translate_onnx_type_to_numpy, +) +from sparsezoo.utils import save_onnx + + +__all__ = [ + "overwrite_onnx_model_inputs_for_kv_cache_models", + "generate_session_id", + "pad_to_fixed_length", + "softmax", +] + +_LOGGER = logging.getLogger(__name__) + + +def overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path: str, + sequence_length: int = 128, + input_ids_length: int = 1, + batch_size: int = 1, +) -> Tuple[str, List[int], Optional[numpy.dtype]]: + """ + Enforces the appropriate input shapes for the onnx model, as well as + checks whether kv cache is enabled or not. + :param onnx_file_path: The path to the onnx model file that will be + overwritten with the new input shapes + :param batch_size: The batch size to use for the input + :param sequence_length: The sequence length to use for the input + :param input_ids_length: The length of input_ids + :return: A tuple that contains: + - the path to the onnx model file that has been overwritten + with the new input shapes + - boolean list, where elements are set to True if the + corresponding model output should be cached or False + if not. + - the data type of the kv cache. 
If the model does not + use kv cache, then the data type is None + """ + model = onnx.load(onnx_file_path, load_external_data=False) + initializer_input_names = set(node.name for node in model.graph.initializer) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] + for external_input in external_inputs: + # overwrite the batch size for all the inputs + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + + if external_input.name in ["input_ids", "positions"]: + external_input.type.tensor_type.shape.dim[1].dim_value = input_ids_length + elif external_input.name == "attention_mask": + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + elif external_input.name.startswith(CACHE_INPUT_NAME): + external_input.type.tensor_type.shape.dim[2].dim_value = ( + sequence_length - input_ids_length + ) + else: + raise ValueError(f"Unexpected external input name: {external_input.name}") + + _LOGGER.info( + "Overwriting in-place the input shapes " + f"of the transformer model at {onnx_file_path}" + ) + save_onnx(model, onnx_file_path) + + output_indices_to_be_cached = default_cached_outputs(model) -__all__ = ["softmax", "generate_session_id", "pad_to_fixed_length"] + kv_cache_data_type = None + if sum(output_indices_to_be_cached): + kv_cache_elem_type = next( + inp for inp in model.graph.input if inp.name.startswith(CACHE_INPUT_NAME) + ).type.tensor_type.elem_type + kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) + + return onnx_file_path, output_indices_to_be_cached, kv_cache_data_type + + +def generate_session_id() -> str: + """ + Generate uuid for session id. This is used to + identify the kv cache session for the user + """ + session_id = str(uuid.uuid4()) + return session_id def softmax(x: numpy.ndarray) -> numpy.ndarray: @@ -36,15 +123,6 @@ def softmax(x: numpy.ndarray) -> numpy.ndarray: return numerator / denominator -def generate_session_id() -> str: - """ - Generate uuid for session id. This is used to - identify the kv cache session for the user - """ - session_id = str(uuid.uuid4()) - return session_id - - def pad_to_fixed_length( array: numpy.ndarray, max_len: int, axis: int = 0, value: int = 0 ) -> numpy.ndarray: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 89d8baf4c9..ce4c552e4f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -50,10 +50,17 @@ "truncate_onnx_model", "truncate_onnx_embedding_model", "default_cached_outputs", + "has_model_kv_cache", + "CACHE_INPUT_NAME", + "CACHE_OUTPUT_NAME", + "assert_model_sequence_length_one", ] _LOGGER = logging.getLogger(__name__) +CACHE_INPUT_NAME = "past_key_values" +CACHE_OUTPUT_NAME = "present" + @contextlib.contextmanager def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> str: @@ -475,20 +482,45 @@ def truncate_onnx_embedding_model( return output_filepath, tmp_file -def default_cached_outputs(model_path: str) -> List[bool]: +def default_cached_outputs(model: Union[str, ModelProto]) -> List[bool]: """ + Get a list of bools that indicate which outputs should be cached. + The elements that are set to True correspond to cached outputs, + the rest are set to False. + :param model_path: Path to a model - :return A list of bools that indicates caching of all outputs except the first one. + :return A list of bools that indicate which outputs should be cached. 
""" - - outputs = list(onnx.load(model_path).graph.output) + model = ( + onnx.load(model, load_external_data=False) if isinstance(model, str) else model + ) + outputs = model.graph.output assert len(outputs) > 0 - # Create a boolean list of every output of the - # model [logits, key0, value0, key1, value1, ..., keyN, valueN] - cached_outputs = [True for i in range(len(outputs))] + return [output.name.startswith(CACHE_OUTPUT_NAME) for output in outputs] + + +def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: + """ + Check whether a model has a KV cache support. - # Assume first input is logits and logits ought not to be cached - cached_outputs[0] = False + :param model_path: Path to a model or a model proto. + :return True if the model has a KV cache support, False otherwise. + """ + return bool(sum(default_cached_outputs(model))) + + +def assert_model_sequence_length_one(model_path: str) -> str: + """ + Takes a path to an onnx model and enforces that it has + static input dimensions. + + :param model_path: Path to a model. + :return: Path to the model with static input dimensions. + """ + from deepsparse.transformers.utils.helpers import ( + overwrite_onnx_model_inputs_for_kv_cache_models, + ) - return cached_outputs + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(model_path) + return onnx_file_path From 0fe9f7e79ce1b2edc720fac61690a4a8b18f61a1 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 24 Jul 2023 13:00:44 +0000 Subject: [PATCH 2/9] improve logging docstring --- src/deepsparse/benchmark/benchmark_model.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index cee885ecfb..a39b2afd02 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -361,10 +361,7 @@ def benchmark_model( model_path = model_to_path(model_path) if has_model_kv_cache(model_path): - _LOGGER.info( - "Found model that contains KV cache inputs and outputs.\n" - "Enforcing `len(input_ids)=1` (simulating autoregressive inference)." - ) + _LOGGER.info("Found model that contains KV cache support.") model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) From 55916f0dc6974b4596020d420798c427df277b83 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 25 Jul 2023 06:43:25 +0000 Subject: [PATCH 3/9] more verbose logging --- src/deepsparse/benchmark/benchmark_model.py | 5 ++++- src/deepsparse/transformers/engines/nl_decoder_engine.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index a39b2afd02..44cfc40cd7 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -361,7 +361,10 @@ def benchmark_model( model_path = model_to_path(model_path) if has_model_kv_cache(model_path): - _LOGGER.info("Found model that contains KV cache support.") + _LOGGER.info( + "Found model that contains KV cache support. " + "Benchmarking the autoregressive model." 
+ ) model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 92101496f7..dd4ed8ccaa 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -25,7 +25,7 @@ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, ) -from deepsparse.transformers.utils.helpers import softmax +from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME From fa755cb3d34babc9b295005b4eae729374ac9a01 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 1 Aug 2023 10:06:37 +0000 Subject: [PATCH 4/9] add sequence_length as variable --- src/deepsparse/benchmark/benchmark_model.py | 38 +++++++++++++++++-- .../transformers/engines/nl_decoder_engine.py | 2 +- src/deepsparse/transformers/utils/helpers.py | 5 ++- src/deepsparse/utils/onnx.py | 21 +++++++--- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 44cfc40cd7..e739f571c9 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -81,6 +81,13 @@ zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none \ --input_shapes "[1,512],[1,512],[1,512]" +########## +Example on an OPT (Large Language Model) from SparseZoo with sequence length 256: +deepsparse.benchmark \ + zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/ + opt_pretrain/pruned50_quantW8A8-none \ + --sequence_length 256 + ########## Example on local ONNX model: deepsparse.benchmark /PATH/TO/model.onnx @@ -96,7 +103,7 @@ import json import logging import os from typing import Dict, Optional from deepsparse import Scheduler, __version__, compile_model from deepsparse.benchmark.ort_engine import ORTEngine @@ -104,11 +111,11 @@ from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level from deepsparse.utils import ( - assert_model_sequence_length_one, generate_random_inputs, has_model_kv_cache, model_to_path, override_onnx_input_shapes, + overwrite_sequence_length, parse_input_shapes, ) @@ -140,6 +147,15 @@ def parse_args(): default=1, help="The batch size to run the analysis for. Must be greater than 0", ) + + parser.add_argument( + "-seq_len", + "--sequence_length", + type=int, + default=None, + help="The sequence length to run the " + "Large Language Models (LLMs) benchmarks for. Must be greater than 0", + ) parser.add_argument( "-i", "-shapes", @@ -334,6 +350,7 @@ def benchmark_model( model_path: str, batch_size: int = 1, + sequence_length: Optional[int] = None, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", @@ -361,11 +378,20 @@ model_path = model_to_path(model_path) if has_model_kv_cache(model_path): + if batch_size != 1: + raise ValueError( + "Unable to run models with KV cache support " + "for a batch size other than one. " + "Please set the batch size to 1 and try again." + ) + model_path, sequence_length = overwrite_sequence_length( + model_path=model_path, sequence_length=sequence_length + ) _LOGGER.info( "Found model that contains KV cache support. 
" "Benchmarking the autoregressive model with " + f"sequence length: {sequence_length}." ) - model_path = assert_model_sequence_length_one(model_path) num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model @@ -428,6 +454,7 @@ "orig_model_path": orig_model_path, "model_path": model_path, "batch_size": batch_size, + "sequence_length": sequence_length, "input_shapes": input_shapes, "num_cores": num_cores, "scenario": scenario, @@ -453,6 +480,7 @@ def main(): result = benchmark_model( model_path=args.model_path, + sequence_length=args.sequence_length, batch_size=args.batch_size, input_shapes=args.input_shapes, num_cores=args.num_cores, @@ -469,6 +497,8 @@ # Results summary print("Original Model Path: {}".format(args.model_path)) print("Batch Size: {}".format(args.batch_size)) + if args.sequence_length is not None: + print("Sequence Length: {}".format(args.sequence_length)) print("Scenario: {}".format(args.scenario)) print( "Throughput (items/sec): {:.4f}".format( diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index ef90101ded..0860b66a14 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -69,7 +69,6 @@ ): # flag to indicate if the model is quantized or not self.kv_cache_data_type = None - ( onnx_file_path, output_indices_to_be_cached, kv_cache_data_type, @@ -80,6 +79,7 @@ sequence_length=sequence_length, input_ids_length=input_ids_length, ) + kv_cache_enabled = False if sum(output_indices_to_be_cached): kv_cache_enabled = True diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 2090258577..c771d27aa3 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -77,6 +77,9 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( external_input.type.tensor_type.shape.dim[2].dim_value = ( sequence_length - input_ids_length ) + elif external_input.name.startswith("causal_mask"): + external_input.type.tensor_type.shape.dim[2].dim_value = input_ids_length + external_input.type.tensor_type.shape.dim[3].dim_value = sequence_length else: raise ValueError(f"Unexpected external input name: {external_input.name}") @@ -89,7 +92,7 @@ output_indices_to_be_cached = default_cached_outputs(model) kv_cache_data_type = None - if sum(output_indices_to_be_cached): + if any(output_indices_to_be_cached): kv_cache_elem_type = next( inp for inp in model.graph.input if inp.name.startswith(CACHE_INPUT_NAME) ).type.tensor_type.elem_type kv_cache_data_type = translate_onnx_type_to_numpy(kv_cache_elem_type) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index ce4c552e4f..dc9fd675d2 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -53,7 +53,7 @@ "has_model_kv_cache", "CACHE_INPUT_NAME", "CACHE_OUTPUT_NAME", - "assert_model_sequence_length_one", + "overwrite_sequence_length", ] _LOGGER = logging.getLogger(__name__) @@ -507,20 +507,31 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: """ Check whether a model has KV cache support. :param model: Path to a model or a model proto. :return True if the model has KV cache support, False otherwise.
""" - return bool(sum(default_cached_outputs(model))) + return bool(any(default_cached_outputs(model))) -def assert_model_sequence_length_one(model_path: str) -> str: +def overwrite_sequence_length( + model_path: str, sequence_length: Optional[int] = None +) -> str: """ Takes a path to an onnx model and enforces that it has static input dimensions. :param model_path: Path to a model. + :param sequence_length: The sequence length to overwrite the model with. :return: Path to the model with static input dimensions. """ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models, ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(model_path) - return onnx_file_path + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, sequence_length=sequence_length + ) + attention_input_info = [ + input + for input in onnx.load(onnx_file_path, load_external_data=False).graph.input + if "attention" in input.name + ][0] + sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value + return onnx_file_path, sequence_length From 709853d6c733d002f847f521bb667bab359951f1 Mon Sep 17 00:00:00 2001 From: Luka Govedic Date: Tue, 8 Aug 2023 14:20:23 -0400 Subject: [PATCH 5/9] fixed type annotations and avoided overwriting inputs when no sequence_length is passed --- src/deepsparse/transformers/utils/helpers.py | 2 +- src/deepsparse/utils/onnx.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index c771d27aa3..09eca2ab07 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -41,7 +41,7 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( sequence_length: int = 128, input_ids_length: int = 1, batch_size: int = 1, -) -> Tuple[str, List[int], Optional[numpy.dtype]]: +) -> Tuple[str, List[bool], Optional[numpy.dtype]]: """ Enforces the appropriate input shapes for the onnx model, as well as checks whether kv cache is enabled or not. diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index dc9fd675d2..f19d06c3ea 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -512,7 +512,7 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: def overwrite_sequence_length( model_path: str, sequence_length: Optional[int] = None -) -> str: +) -> Tuple[str, int]: """ Takes a path to an onnx model and enforces that it has static input dimensions. 
@@ -525,9 +525,13 @@ def overwrite_sequence_length( overwrite_onnx_model_inputs_for_kv_cache_models, ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=model_path, sequence_length=sequence_length - ) + if sequence_length is not None: + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, sequence_length=sequence_length + ) + else: + onnx_file_path = model_path + attention_input_info = [ input for input in onnx.load(onnx_file_path, load_external_data=False).graph.input From 7f7bf8300b54e8b3fe847be46e45dcdfe84d365a Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 15:37:48 +0000 Subject: [PATCH 6/9] fix bad merge --- src/deepsparse/benchmark/benchmark_model.py | 1 - .../transformers/engines/nl_decoder_engine.py | 16 ++++++---------- src/deepsparse/transformers/utils/helpers.py | 13 +------------ src/deepsparse/utils/onnx.py | 5 ++--- 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 8a5aede1d7..4a81b7bbb1 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -102,7 +102,6 @@ import importlib import json import logging -import os from typing import Dict, Optional from deepsparse import __version__, compile_model diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 51bab4bbc6..43c4d9c5de 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -24,12 +24,8 @@ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, ) -from deepsparse.transformers.utils.helpers import ( - generate_session_id, - overwrite_onnx_model_inputs, -) from deepsparse.utils.data import numpy_softmax -from deepsparse.utils.onnx import CACHE_INPUT_NAME, CACHE_OUTPUT_NAME +from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX _LOGGER = logging.getLogger(__name__) @@ -132,7 +128,7 @@ def onnx_input_names_no_cache(self) -> List[str]: return [ name for name in self.engine.input_names - if not name.startswith(CACHE_INPUT_NAME) + if not name.startswith(CACHE_INPUT_PREFIX) ] @property @@ -287,7 +283,7 @@ def update_kv_cache( cache_onnx_names = [ name for name in self.engine.input_names - if name.startswith(CACHE_INPUT_NAME) + if name.startswith(CACHE_INPUT_PREFIX) ] kv_cache_state = { name: array for name, array in zip(cache_onnx_names, kv_cache_state) @@ -305,7 +301,7 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: cache_engine_input_index = next( i for i, name in enumerate(self.engine.input_names) - if CACHE_INPUT_NAME in name + if CACHE_INPUT_PREFIX in name ) batch_size, num_attention_heads, _, hidden_dims = self.engine.input_shapes[ cache_engine_input_index @@ -317,9 +313,9 @@ def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: ) cache_keys = [ - output_name.replace(CACHE_OUTPUT_NAME, CACHE_INPUT_NAME) + output_name.replace(CACHE_OUTPUT_PREFIX, CACHE_INPUT_PREFIX) for output_name in self.engine.output_names - if output_name.startswith(CACHE_OUTPUT_NAME) + if output_name.startswith(CACHE_OUTPUT_PREFIX) ] return {key: empty_kv_cache_tensor for key in cache_keys} diff --git a/src/deepsparse/transformers/utils/helpers.py 
b/src/deepsparse/transformers/utils/helpers.py index 4ba4956648..f5f0b560cb 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -11,24 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging - import logging import uuid -from typing import List, Optional, Tuple, Union from typing import List, Tuple, Union import numpy import onnx -from deepsparse.utils.onnx import ( - CACHE_INPUT_NAME, - default_cached_outputs, - translate_onnx_type_to_numpy, -) -from sparsezoo.utils import save_onnx -import onnx - from deepsparse.utils.onnx import translate_onnx_type_to_numpy from sparsezoo.utils import save_onnx @@ -43,7 +32,7 @@ _LOGGER = logging.getLogger(__name__) -def overwrite_onnx_model_inputs( +def overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path: str, sequence_length: int, input_ids_length: int, diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index bf3f028df8..9c0801e2e3 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -51,8 +51,6 @@ "truncate_onnx_embedding_model", "default_cached_outputs", "has_model_kv_cache", - "CACHE_INPUT_NAME", - "CACHE_OUTPUT_NAME", "overwrite_sequence_length", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", ] @@ -499,6 +497,7 @@ def default_cached_outputs(model_path: str) -> List[bool]: return [name.startswith(CACHE_OUTPUT_PREFIX) for name in output_names] + def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: """ Check whether a model has KV cache support. @@ -537,4 +536,4 @@ def overwrite_sequence_length( if "attention" in input.name ][0] sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value - return onnx_file_path, sequence_length \ No newline at end of file + return onnx_file_path, sequence_length From 652439481354249e59460ae346e4733850bb85ad Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:17:37 +0000 Subject: [PATCH 7/9] tested --- src/deepsparse/benchmark/benchmark_model.py | 44 +++++++++++++++------ src/deepsparse/utils/onnx.py | 35 ++++++++-------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 4a81b7bbb1..ae9256b4aa 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -82,11 +82,12 @@ --input_shapes "[1,512],[1,512],[1,512]" ########## -Example on an OPT (Large Language Model) from SparseZoo with sequence length 256: +Example on CodeGen (a model with KV cache support) +from SparseZoo with input_ids_length 10 and sequence length 256: deepsparse.benchmark \ - zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/ - opt_pretrain/pruned50_quantW8A8-none \ - --sequence_length 256 + zoo:nlg/text_generation/codegen_mono-350m/pytorch/ + huggingface/bigpython_bigquery_thepile/pruned50-none + --input_ids_length 10 --sequence_length 256 ########## Example on local ONNX model: deepsparse.benchmark /PATH/TO/model.onnx @@ -120,7 +121,7 @@ has_model_kv_cache, model_to_path, override_onnx_input_shapes, - overwrite_sequence_length, + overwrite_cache_model_inputs, parse_input_shapes, ) @@ -157,9 +158,20 @@ def parse_args(): "-seq_len", "--sequence_length", type=int, - default=None, + default=128, help="The sequence length to run the " - "Large Language Models (LLMs) benchmarks for. 
Must be greater than 0", + "KV cache supported model benchmarks for. " + "Must be greater than 0, default is 128", + ) + + parser.add_argument( + "-input_ids_len", + "--input_ids_length", + type=int, + default=1, + help="The input ids length to run the " + "KV cache supported model benchmarks for. " + "Must be greater than 0, default is 1", ) parser.add_argument( "-i", @@ -284,6 +296,7 @@ def benchmark_model( model_path: str, batch_size: int = 1, sequence_length: Optional[int] = None, + input_ids_length: Optional[int] = None, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", @@ -317,15 +330,20 @@ def benchmark_model( "for batch size different than one." "Please set batch size to 1 and try again" ) - model_path, sequence_length = overwrite_sequence_length( - model_path=model_path, sequence_length=sequence_length - ) + _LOGGER.info( - "Found model that contains KV cache support. " + "Found model with KV cache support. " "Benchmarking the autoregressive model with " + f"input_ids_length: {input_ids_length} and " f"sequence length: {sequence_length}." ) + model_path = overwrite_cache_model_inputs( + model_path=model_path, + input_ids_length=input_ids_length, + sequence_length=sequence_length, + ) + num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model @@ -388,6 +406,7 @@ def benchmark_model( "model_path": model_path, "batch_size": batch_size, "sequence_length": sequence_length, + "input_ids_length": input_ids_length, "input_shapes": input_shapes, "num_cores": num_cores, "scenario": scenario, @@ -414,6 +433,7 @@ def main(): result = benchmark_model( model_path=args.model_path, sequence_length=args.sequence_length, + input_ids_length=args.input_ids_length, batch_size=args.batch_size, input_shapes=args.input_shapes, num_cores=args.num_cores, @@ -432,6 +452,8 @@ def main(): print("Batch Size: {}".format(args.batch_size)) if args.sequence_length is not None: print("Sequence Length: {}".format(args.sequence_length)) + if args.input_ids_length is not None: + print("Input IDs Length: {}".format(args.input_ids_length)) print("Scenario: {}".format(args.scenario)) print( "Throughput (items/sec): {:.4f}".format( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 9c0801e2e3..b740fad5ff 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -51,7 +51,7 @@ "truncate_onnx_embedding_model", "default_cached_outputs", "has_model_kv_cache", - "overwrite_sequence_length", + "overwrite_cache_model_inputs", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", ] @@ -508,14 +508,17 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: return bool(any(default_cached_outputs(model))) -def overwrite_sequence_length( - model_path: str, sequence_length: Optional[int] = None +def overwrite_cache_model_inputs( + model_path: str, + input_ids_length: int, + sequence_length: int, ) -> Tuple[str, int]: """ Takes a path to an onnx model and enforces that it has static input dimensions. :param model_path: Path to a model. + :param input_ids_length: The input_ids length to overwrite the model with. :param sequence_length: The sequence length to overwrite the model with. :return: Path to the model with static input dimensions. 
""" @@ -523,17 +526,15 @@ def overwrite_sequence_length( overwrite_onnx_model_inputs_for_kv_cache_models, ) - if sequence_length is not None: - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=model_path, sequence_length=sequence_length - ) - else: - onnx_file_path = model_path - - attention_input_info = [ - input - for input in onnx.load(onnx_file_path, load_external_data=False).graph.input - if "attention" in input.name - ][0] - sequence_length = attention_input_info.type.tensor_type.shape.dim[1].dim_value - return onnx_file_path, sequence_length + assert input_ids_length < sequence_length, ( + f"input_ids_length {input_ids_length} " + f"must be less than sequence_length {sequence_length}" + ) + + onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + return onnx_file_path From 27dbf42287e3ef655beee9c75c14251a92143258 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:20:11 +0000 Subject: [PATCH 8/9] update defaults --- src/deepsparse/benchmark/benchmark_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index ae9256b4aa..4554f04317 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -103,7 +103,7 @@ import importlib import json import logging -from typing import Dict, Optional +from typing import Dict from deepsparse import __version__, compile_model from deepsparse.benchmark.helpers import ( @@ -158,10 +158,10 @@ def parse_args(): "-seq_len", "--sequence_length", type=int, - default=128, + default=2048, help="The sequence length to run the " "KV cache supported model benchmarks for. " - "Must be greater than 0, default is 128", + "Must be greater than 0, default is 2048", ) parser.add_argument( @@ -295,8 +295,8 @@ def load_custom_engine(custom_engine_identifier: str): def benchmark_model( model_path: str, batch_size: int = 1, - sequence_length: Optional[int] = None, - input_ids_length: Optional[int] = None, + sequence_length: int = 2048, + input_ids_length: int = 1, input_shapes: str = "", num_cores: int = None, scenario: str = "sync", From 8ab1c87cbb3307242b46b476c886509cdbc11128 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 23 Aug 2023 16:51:22 +0000 Subject: [PATCH 9/9] address Luka comments --- src/deepsparse/benchmark/benchmark_model.py | 2 +- .../transformers/engines/nl_decoder_engine.py | 6 +++--- src/deepsparse/transformers/utils/helpers.py | 4 ++-- src/deepsparse/utils/onnx.py | 19 +++++++++++++++---- .../pipelines/test_text_generation.py | 4 ++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 4554f04317..aa350fb474 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -338,7 +338,7 @@ def benchmark_model( f"sequence length: {sequence_length}." 
) - model_path = overwrite_cache_model_inputs( + model_path, _, _ = overwrite_cache_model_inputs( model_path=model_path, input_ids_length=input_ids_length, sequence_length=sequence_length, diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 43c4d9c5de..30176b3b10 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -20,9 +20,9 @@ from deepsparse.engine import Context from deepsparse.pipeline import DEEPSPARSE_ENGINE, create_engine from deepsparse.transformers.utils.decoder_kv_cache import DecoderKVCache -from deepsparse.transformers.utils.helpers import generate_session_id from deepsparse.transformers.utils.helpers import ( - overwrite_onnx_model_inputs_for_kv_cache_models as overwrite_onnx_model_inputs, + generate_session_id, + overwrite_onnx_model_inputs_for_kv_cache_models, ) from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX @@ -72,7 +72,7 @@ def __init__( onnx_file_path, output_indices_to_be_cached, kv_cache_data_type, - ) = overwrite_onnx_model_inputs( + ) = overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path=onnx_file_path, batch_size=engine_args.get("batch_size", 1), sequence_length=sequence_length, diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index f5f0b560cb..5fb0f3c1c5 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import uuid -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy import onnx @@ -37,7 +37,7 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( sequence_length: int, input_ids_length: int, batch_size: int = 1, -) -> Tuple[str, List[int]]: +) -> Tuple[str, List[int], Optional[int]]: """ Enforces the appropriate input shapes for the onnx model, as well as checks whether kv cache is enabled or not. diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index b740fad5ff..24d2734d73 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -512,7 +512,7 @@ def overwrite_cache_model_inputs( model_path: str, input_ids_length: int, sequence_length: int, -) -> Tuple[str, int]: +) -> Tuple[str, List[int], Optional[int]]: """ Takes a path to an onnx model and enforces that it has static input dimensions. @@ -520,7 +520,14 @@ def overwrite_cache_model_inputs( :param model_path: Path to a model. :param input_ids_length: The input_ids length to overwrite the model with. :param sequence_length: The sequence length to overwrite the model with. - :return: Path to the model with static input dimensions. + :return: A tuple that contains: + - the path to the onnx model file that has been overwritten + with the new input shapes + - boolean list, where elements are set to True if the + corresponding model output should be cached or False + if not. + - the data type of the kv cache. 
If the model does not + use kv cache, then the data type is None """ from deepsparse.transformers.utils.helpers import ( overwrite_onnx_model_inputs_for_kv_cache_models, @@ -531,10 +538,14 @@ def overwrite_cache_model_inputs( f"must be less than sequence_length {sequence_length}" ) - onnx_file_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models( + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( onnx_file_path=model_path, sequence_length=sequence_length, input_ids_length=input_ids_length, ) - return onnx_file_path + return onnx_file_path, output_indices_to_be_cached, kv_cache_data_type diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index b9569d9f0d..1be380542a 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -22,7 +22,7 @@ from deepsparse import Pipeline from deepsparse.transformers.utils.helpers import ( create_causal_mask, - overwrite_onnx_model_inputs, + overwrite_onnx_model_inputs_for_kv_cache_models, ) from deepsparse.utils.onnx import CACHE_INPUT_PREFIX from sparsezoo import Model @@ -216,7 +216,7 @@ def _get_cache_state_ort_kv_cache(model_onnx_path, sequence, model_name): # setup model and session # (run full sequence inference) - overwrite_onnx_model_inputs( + overwrite_onnx_model_inputs_for_kv_cache_models( model_onnx_path, sequence_length=128, input_ids_length=128 ) sess = onnxruntime.InferenceSession(model_onnx_path)
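A minimal usage sketch of the API this patch series converges on (the PATCH 9/9 state). The model path below is a placeholder and the local variable names are illustrative, not part of the diffs:

from deepsparse.utils import (
    has_model_kv_cache,
    model_to_path,
    overwrite_cache_model_inputs,
)

# Placeholder path; any ONNX model whose outputs include `present*` KV cache
# tensors (paired with `past_key_values*` inputs) is detected as a KV cache model.
model_path = model_to_path("/PATH/TO/model.onnx")

if has_model_kv_cache(model_path):
    # Freeze the dynamic dimensions before compiling: input_ids_length tokens
    # are processed per forward pass against a sequence_length-token context.
    # input_ids_length must be strictly less than sequence_length.
    model_path, cached_output_mask, kv_cache_dtype = overwrite_cache_model_inputs(
        model_path=model_path,
        input_ids_length=1,
        sequence_length=2048,
    )

The same path is exercised from the CLI with `deepsparse.benchmark /PATH/TO/model.onnx --input_ids_length 1 --sequence_length 2048`; after PATCH 8/9 those values are also the flag defaults.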