From 6c8dfb3675f94cf8abaffd8899305d76fbbd1380 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Mon, 25 Sep 2023 16:31:40 -0400 Subject: [PATCH 01/17] [deepsparse.infer] UX improvements, data only mode --- src/deepsparse/transformers/infer.py | 49 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index e4f8ad26f3..7332a77579 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -64,6 +64,7 @@ --task text-generation """ import click +from typing import Optional from deepsparse import Pipeline from deepsparse.tasks import SupportedTasks @@ -75,6 +76,14 @@ ) ) @click.argument("model_path", type=str) +@click.option( + "--data", + type=Optional[str], + default=None, + help="Path to .txt, .csv, .json, or .jsonl file to load data from" + "If provided, runs inference over the entire dataset. If not provided " + "runs an interactive inference session in the console. Default None.", +) @click.option( "--sequence_length", type=int, @@ -112,6 +121,7 @@ ) def main( model_path: str, + data: Optional[str], sequence_length: int, sampling_temperature: float, prompt_sequence_length: int, @@ -135,16 +145,23 @@ def main( prompt_sequence_length=prompt_sequence_length, ) + if data is not None: + for prompt in _iter_prompt_from_file(data): + # TODO: George run inference + pass + return + # continue prompts until a keyboard interrupt - while True: - input_text = input("User: ") + while data is None: # always True in interactive Mode + input_text = input(">>> ") pipeline_inputs = {"prompt": [input_text]} if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids response = pipeline(**pipeline_inputs) - print("Bot: ", response.generations[0].text) + print("\n", response.generations[0].text) + if show_tokens_per_sec: times = pipeline.timer_manager.times prefill_speed = ( @@ -158,5 +175,31 @@ def main( ) +def _iter_prompt_from_file(data: str): + """ + TODO: George + .txt - each line is a single prompt + .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs + i.e. 
+ prompt,sampling_temperature + prompt 1,0.9 + + this would make pipeline(prompt="prompt 1", temperature=0.9) + + .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs + + [ + {}, + {}, + ] + .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above + {} + {} + {} + {} + """ + pass + + if __name__ == "__main__": main() From b4b7ec65d6456c64461e2aef543b4f4e311ea4a1 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Mon, 25 Sep 2023 16:55:09 -0400 Subject: [PATCH 02/17] fix bug on main --- src/deepsparse/transformers/infer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 7332a77579..82e655649a 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -141,7 +141,6 @@ def main( task=task, # let pipeline determine if task is supported model_path=model_path, sequence_length=sequence_length, - sampling_temperature=sampling_temperature, prompt_sequence_length=prompt_sequence_length, ) @@ -153,8 +152,11 @@ def main( # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode - input_text = input(">>> ") - pipeline_inputs = {"prompt": [input_text]} + prompt_input = input(">>> ") + pipeline_inputs = dict( + prompt=[prompt_input], + sampling_temperature=sampling_temperature, + ) if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids From 17168f6a89453503b414886ff807945201e0b6be Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 04:13:35 +0000 Subject: [PATCH 03/17] draft, load files line by line, return iter, save up memory --- src/deepsparse/transformers/infer.py | 91 +++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 82e655649a..13f25f56c4 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -63,11 +63,95 @@ deepsparse.infer models/llama/deployment \ --task text-generation """ -import click +import csv +import json +from enum import Enum from typing import Optional +import click + +import Exception from deepsparse import Pipeline from deepsparse.tasks import SupportedTasks +from typing import Iterator + + + +class InvalidPromptSourceDirectoryException(Exception): + pass + + +class PromptParser: + class Extentions(Enum): + TEXT = ".txt" + CSV = ".csv" + JSON = ".json" + JSONL = ".jsonl" + + def __init__(self, filename: str): + self.extention = self._validate_and_return_extention(filename) + self.filename: str = filename + + def parse(self): + + if self.extention == self.Extentions.TEXT: + return self._parse_text() + if self.extention == self.Extentions.CSV: + return self._parse_csv() + if self.extention == self.Extentions.JSON or self.extention == self.Extentions.JSONL: + return self._parse_json_list() + + + def _parse_text(self): + try: + with open(self.filename, "r") as file: + for line in file: + yield line.strip() + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' not found.") + + def _parse_csv(self, column_name: str = "prompt"): + try: + with open(self.filename, "r", newline="") as file: + reader = csv.DictReader(file) + for row in reader: + yield row[column_name] + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' was not found.") + except KeyError: + 
raise + # print(f"Column '{column_name}' not found in the CSV.") + + def _parse_json(self, prompt_key: str = "prompt"): + try: + # with open(self.filename, "w") as file: + # json_list = json.dump(self.filename, file) + # for json_obj in json_list: + # yield json_obj[prompt_key] + with open(self.filename, 'r') as file: + json_list = json.load(file) + for json_object in json_list: + yield json_object[prompt_key] + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' was not found.") + except KeyError: + raise + # print(f"Column '{column_name}' not found in the CSV.") + + def _validate_and_return_extention(self, data: str): + for extention in self.Extentions: + if self.data.endswith(extention.value): + return extention.value + + raise InvalidPromptSourceDirectoryException( + f"{data} is not a valid source extract batched prompts" + ) + + # if not data.endswith(tuple(extension.value for extension in self.Extensions)): + # @click.command( @@ -177,7 +261,7 @@ def main( ) -def _iter_prompt_from_file(data: str): +def _iter_prompt_from_file(data: str) -> Iterator: """ TODO: George .txt - each line is a single prompt @@ -200,7 +284,8 @@ def _iter_prompt_from_file(data: str): {} {} """ - pass + parser = PromptParser(data) + return parser.parse_as_iterable() if __name__ == "__main__": From 84b03f82808361eb34714b84a7c7a5d4b51d6a11 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 14:57:17 +0000 Subject: [PATCH 04/17] add inference --- src/deepsparse/transformers/infer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 13f25f56c4..be2409b9d9 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -231,8 +231,28 @@ def main( if data is not None: for prompt in _iter_prompt_from_file(data): # TODO: George run inference - pass - return + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + return # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode From 8a69c5f39dcd05e7ead551ab5fdf09de409822ba Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:02:22 +0000 Subject: [PATCH 05/17] pass passing in files --- src/deepsparse/transformers/infer.py | 312 --------------------------- 1 file changed, 312 deletions(-) delete mode 100644 src/deepsparse/transformers/infer.py diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py deleted file mode 100644 index be2409b9d9..0000000000 --- a/src/deepsparse/transformers/infer.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Usage: deepsparse.infer [OPTIONS] MODEL_PATH - - Command Line utility to interact with a text genration LLM in a chatbot - style - - Example usage: - - deepsparse.infer [OPTIONS] - -Options: - --sequence_length INTEGER Sequence length to compile model and - tokenizer for.This controls the maximum - context length of the pipeline. [default: - 512] - --sampling_temperature FLOAT The temperature to use when samplingfrom the - probability distribution computed from the - logits.Higher values will result in more - random samples. Shouldbe greater than 0.0. - [default: 1.0] - --prompt_sequence_length INTEGER - Processed prompt in chunks of this length. - This is to maximize the inference speed - [default: 64] - --show_tokens_per_sec / --no_show_tokens_per_sec - Whether to display the token generation - speed or not [default: - no_show_tokens_per_sec] - --task TEXT The task to use for the pipeline. Choose any - of `chat`, `codegen`, `text-generation` - [default: chat] - --help Show this message and exit. - -Installation: pip install deepsparse[transformers] -Examples: - -1) Use a local deployment directory -deepsparse.infer models/llama/deployment - -2) Use a SparseZoo stub -deepsparse.infer \ - zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none # noqa: E501 - -3) Display token generation speed -deepsparse.infer models/llama/deployment \ - --show_tokens_per_sec - -4) Disable history -deepsparse.infer models/llama/deployment \ - --task text-generation -""" -import csv -import json -from enum import Enum -from typing import Optional - -import click - -import Exception -from deepsparse import Pipeline -from deepsparse.tasks import SupportedTasks -from typing import Iterator - - - -class InvalidPromptSourceDirectoryException(Exception): - pass - - -class PromptParser: - class Extentions(Enum): - TEXT = ".txt" - CSV = ".csv" - JSON = ".json" - JSONL = ".jsonl" - - def __init__(self, filename: str): - self.extention = self._validate_and_return_extention(filename) - self.filename: str = filename - - def parse(self): - - if self.extention == self.Extentions.TEXT: - return self._parse_text() - if self.extention == self.Extentions.CSV: - return self._parse_csv() - if self.extention == self.Extentions.JSON or self.extention == self.Extentions.JSONL: - return self._parse_json_list() - - - def _parse_text(self): - try: - with open(self.filename, "r") as file: - for line in file: - yield line.strip() - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' not found.") - - def _parse_csv(self, column_name: str = "prompt"): - try: - with open(self.filename, "r", newline="") as file: - reader = csv.DictReader(file) - for row in reader: - yield row[column_name] - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' was not found.") - except KeyError: - raise - # print(f"Column '{column_name}' not found in the CSV.") - - def _parse_json(self, prompt_key: str = "prompt"): - try: - # with open(self.filename, "w") as file: - # json_list = json.dump(self.filename, file) - # for json_obj in json_list: - # yield 
json_obj[prompt_key] - with open(self.filename, 'r') as file: - json_list = json.load(file) - for json_object in json_list: - yield json_object[prompt_key] - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' was not found.") - except KeyError: - raise - # print(f"Column '{column_name}' not found in the CSV.") - - def _validate_and_return_extention(self, data: str): - for extention in self.Extentions: - if self.data.endswith(extention.value): - return extention.value - - raise InvalidPromptSourceDirectoryException( - f"{data} is not a valid source extract batched prompts" - ) - - # if not data.endswith(tuple(extension.value for extension in self.Extensions)): - # - - -@click.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ) -) -@click.argument("model_path", type=str) -@click.option( - "--data", - type=Optional[str], - default=None, - help="Path to .txt, .csv, .json, or .jsonl file to load data from" - "If provided, runs inference over the entire dataset. If not provided " - "runs an interactive inference session in the console. Default None.", -) -@click.option( - "--sequence_length", - type=int, - default=512, - help="Sequence length to compile model and tokenizer for." - "This controls the maximum context length of the pipeline.", -) -@click.option( - "--sampling_temperature", - type=float, - default=1.0, - help="The temperature to use when sampling" - "from the probability distribution computed from the logits." - "Higher values will result in more random samples. Should" - "be greater than 0.0.", -) -@click.option( - "--prompt_sequence_length", - type=int, - default=64, - help="Processed prompt in chunks of this length. " - "This is to maximize the inference speed", -) -@click.option( - "--show_tokens_per_sec/--no_show_tokens_per_sec", - default=False, - help="Whether to display the token generation speed or not", -) -@click.option( - "--task", - default="chat", - type=str, - help="The task to use for the pipeline. 
Choose any of " - "`chat`, `codegen`, `text-generation`", -) -def main( - model_path: str, - data: Optional[str], - sequence_length: int, - sampling_temperature: float, - prompt_sequence_length: int, - show_tokens_per_sec: bool, - task: str, -): - """ - Command Line utility to interact with a text genration LLM in a chatbot style - - Example usage: - - deepsparse.infer [OPTIONS] - """ - session_ids = "chatbot_cli_session" - - pipeline = Pipeline.create( - task=task, # let pipeline determine if task is supported - model_path=model_path, - sequence_length=sequence_length, - prompt_sequence_length=prompt_sequence_length, - ) - - if data is not None: - for prompt in _iter_prompt_from_file(data): - # TODO: George run inference - pipeline_inputs = dict( - prompt=[prompt], - sampling_temperature=sampling_temperature, - ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - return - - # continue prompts until a keyboard interrupt - while data is None: # always True in interactive Mode - prompt_input = input(">>> ") - pipeline_inputs = dict( - prompt=[prompt_input], - sampling_temperature=sampling_temperature, - ) - - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - - -def _iter_prompt_from_file(data: str) -> Iterator: - """ - TODO: George - .txt - each line is a single prompt - .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs - i.e. 
- prompt,sampling_temperature - prompt 1,0.9 - - this would make pipeline(prompt="prompt 1", temperature=0.9) - - .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs - - [ - {}, - {}, - ] - .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above - {} - {} - {} - {} - """ - parser = PromptParser(data) - return parser.parse_as_iterable() - - -if __name__ == "__main__": - main() From 4091503f2c8115a12b673163dd4abea140ecf41c Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:28:01 +0000 Subject: [PATCH 06/17] latest changes' --- .../pipelines/test_text_generation.py | 229 +++++++++--------- 1 file changed, 115 insertions(+), 114 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index f781e4f119..e126385a14 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -82,7 +82,7 @@ def Fibonacci(n): ], scope="class", ) -@pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") +# @pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") class TestTextGenerationPipeline: """ This test suite is meant to test the main scenarios of @@ -154,119 +154,119 @@ def setup( yield model_name, uses_bos_token, torch_ground_truth - def test_freeze_first_position(self, setup): - # Test whether we should be "freezing" the first token after - # the kv cache is full - _, uses_bos_token, _ = setup - pipeline = self.get_pipeline() - assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token - - def test_ort_single_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by single-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=1, - engine_type="onnxruntime", - ) - pipeline._debug = True - - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - assert output.total_num_processed_tokens[0] < self.sequence_length - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - ) - - def test_ort_multi_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
- ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=self.prompt_sequence_length, - engine_type="onnxruntime", - ) - pipeline._debug = True - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - - assert output.total_num_processed_tokens[0] < self.sequence_length - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - ) - - def test_ort_generation_after_kv_cache_has_been_filled(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is filled up (old entries are removed) - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length_short, - prompt_sequence_length=self.prompt_sequence_length, - engine_type="onnxruntime", - ) - pipeline._debug = True - - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - - assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( - "for this scenario, the kv cache should be full: " - "the total number of processed tokens should be " - "greater than the sequence length" - ) - - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 - ) + # def test_freeze_first_position(self, setup): + # # Test whether we should be "freezing" the first token after + # # the kv cache is full + # _, uses_bos_token, _ = setup + # pipeline = self.get_pipeline() + # assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token + + # def test_ort_single_token_prefill(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. Prompt preprocessing is performed by single-token engine + # # 2. The KV Cache is never filled up + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length, + # prompt_sequence_length=1, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + # assert output.total_num_processed_tokens[0] < self.sequence_length + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # ) + + # def test_ort_multi_token_prefill(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. 
Prompt preprocessing is performed by multi-token engine + # # 2. The KV Cache is never filled up + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length, + # prompt_sequence_length=self.prompt_sequence_length, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + + # assert output.total_num_processed_tokens[0] < self.sequence_length + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # ) + + # def test_ort_generation_after_kv_cache_has_been_filled(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. Prompt preprocessing is performed by multi-token engine + # # 2. The KV Cache is filled up (old entries are removed) + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length_short, + # prompt_sequence_length=self.prompt_sequence_length, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + + # assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( + # "for this scenario, the kv cache should be full: " + # "the total number of processed tokens should be " + # "greater than the sequence length" + # ) + + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 + # ) def test_deepsparse_single_token_prefill(self, setup): # Test the pipeline that uses deepsparse engine. The test covers the @@ -276,6 +276,7 @@ def test_deepsparse_single_token_prefill(self, setup): # 3. 
KV Cache managed externally or internally _, _, torch_ground_truth = setup + breakpoint() pipeline = self.get_pipeline( task=self.pipeline_type, model_path=self.model_stub, From b0f65af956aea4949bdf8fdb9ee3c322d3f1d601 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:29:45 +0000 Subject: [PATCH 07/17] revert --- .../pipelines/test_text_generation.py | 229 +++++++++--------- 1 file changed, 114 insertions(+), 115 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index e126385a14..f781e4f119 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -82,7 +82,7 @@ def Fibonacci(n): ], scope="class", ) -# @pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") +@pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") class TestTextGenerationPipeline: """ This test suite is meant to test the main scenarios of @@ -154,119 +154,119 @@ def setup( yield model_name, uses_bos_token, torch_ground_truth - # def test_freeze_first_position(self, setup): - # # Test whether we should be "freezing" the first token after - # # the kv cache is full - # _, uses_bos_token, _ = setup - # pipeline = self.get_pipeline() - # assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token - - # def test_ort_single_token_prefill(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by single-token engine - # # 2. The KV Cache is never filled up - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." - # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length, - # prompt_sequence_length=1, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - # assert output.total_num_processed_tokens[0] < self.sequence_length - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # ) - - # def test_ort_multi_token_prefill(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by multi-token engine - # # 2. The KV Cache is never filled up - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
- # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length, - # prompt_sequence_length=self.prompt_sequence_length, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - - # assert output.total_num_processed_tokens[0] < self.sequence_length - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # ) - - # def test_ort_generation_after_kv_cache_has_been_filled(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by multi-token engine - # # 2. The KV Cache is filled up (old entries are removed) - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." - # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length_short, - # prompt_sequence_length=self.prompt_sequence_length, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - - # assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( - # "for this scenario, the kv cache should be full: " - # "the total number of processed tokens should be " - # "greater than the sequence length" - # ) - - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 - # ) + def test_freeze_first_position(self, setup): + # Test whether we should be "freezing" the first token after + # the kv cache is full + _, uses_bos_token, _ = setup + pipeline = self.get_pipeline() + assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token + + def test_ort_single_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + engine_type="onnxruntime", + ) + pipeline._debug = True + + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + assert output.total_num_processed_tokens[0] < self.sequence_length + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + ) + + def test_ort_multi_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. 
Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=self.prompt_sequence_length, + engine_type="onnxruntime", + ) + pipeline._debug = True + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + + assert output.total_num_processed_tokens[0] < self.sequence_length + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + ) + + def test_ort_generation_after_kv_cache_has_been_filled(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is filled up (old entries are removed) + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length_short, + prompt_sequence_length=self.prompt_sequence_length, + engine_type="onnxruntime", + ) + pipeline._debug = True + + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + + assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( + "for this scenario, the kv cache should be full: " + "the total number of processed tokens should be " + "greater than the sequence length" + ) + + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 + ) def test_deepsparse_single_token_prefill(self, setup): # Test the pipeline that uses deepsparse engine. The test covers the @@ -276,7 +276,6 @@ def test_deepsparse_single_token_prefill(self, setup): # 3. 
KV Cache managed externally or internally _, _, torch_ground_truth = setup - breakpoint() pipeline = self.get_pipeline( task=self.pipeline_type, model_path=self.model_stub, From 8ee765b2ccb1650bc70735755a5ca89a2c029b61 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:30:11 +0000 Subject: [PATCH 08/17] make new folder for inderence --- .../transformers/inference/__init__.py | 0 src/deepsparse/transformers/inference/main.py | 233 ++++++++++++++++++ .../transformers/inference/prompt_parser.py | 81 ++++++ 3 files changed, 314 insertions(+) create mode 100644 src/deepsparse/transformers/inference/__init__.py create mode 100644 src/deepsparse/transformers/inference/main.py create mode 100644 src/deepsparse/transformers/inference/prompt_parser.py diff --git a/src/deepsparse/transformers/inference/__init__.py b/src/deepsparse/transformers/inference/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/deepsparse/transformers/inference/main.py b/src/deepsparse/transformers/inference/main.py new file mode 100644 index 0000000000..9569aca3b0 --- /dev/null +++ b/src/deepsparse/transformers/inference/main.py @@ -0,0 +1,233 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage: deepsparse.infer [OPTIONS] MODEL_PATH + + Command Line utility to interact with a text genration LLM in a chatbot + style + + Example usage: + + deepsparse.infer [OPTIONS] + +Options: + --sequence_length INTEGER Sequence length to compile model and + tokenizer for.This controls the maximum + context length of the pipeline. [default: + 512] + --sampling_temperature FLOAT The temperature to use when samplingfrom the + probability distribution computed from the + logits.Higher values will result in more + random samples. Shouldbe greater than 0.0. + [default: 1.0] + --prompt_sequence_length INTEGER + Processed prompt in chunks of this length. + This is to maximize the inference speed + [default: 64] + --show_tokens_per_sec / --no_show_tokens_per_sec + Whether to display the token generation + speed or not [default: + no_show_tokens_per_sec] + --task TEXT The task to use for the pipeline. Choose any + of `chat`, `codegen`, `text-generation` + [default: chat] + --help Show this message and exit. 
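+  --data TEXT                   Path to a .txt, .csv, .json, or .jsonl file
+                                to load prompts from. If provided, runs
+                                inference over the entire dataset. If not
+                                provided, runs an interactive inference
+                                session in the console.  [default: None]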
+ +Installation: pip install deepsparse[transformers] +Examples: + +1) Use a local deployment directory +deepsparse.infer models/llama/deployment + +2) Use a SparseZoo stub +deepsparse.infer \ + zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none # noqa: E501 + +3) Display token generation speed +deepsparse.infer models/llama/deployment \ + --show_tokens_per_sec + +4) Disable history +deepsparse.infer models/llama/deployment \ + --task text-generation +""" +import csv +import json +from enum import Enum +from typing import Iterator, Optional + +import click + +from deepsparse import Pipeline +from deepsparse.tasks import SupportedTasks +from deepsparse.transformers.inference.prompt_parser import PromptParser + + +@click.command( + context_settings=dict( + token_normalize_func=lambda x: x.replace("-", "_"), show_default=True + ) +) +@click.argument("model_path", type=str) +@click.option( + "--data", + type=Optional[str], + default=None, + help="Path to .txt, .csv, .json, or .jsonl file to load data from" + "If provided, runs inference over the entire dataset. If not provided " + "runs an interactive inference session in the console. Default None.", +) +@click.option( + "--sequence_length", + type=int, + default=512, + help="Sequence length to compile model and tokenizer for." + "This controls the maximum context length of the pipeline.", +) +@click.option( + "--sampling_temperature", + type=float, + default=1.0, + help="The temperature to use when sampling" + "from the probability distribution computed from the logits." + "Higher values will result in more random samples. Should" + "be greater than 0.0.", +) +@click.option( + "--prompt_sequence_length", + type=int, + default=64, + help="Processed prompt in chunks of this length. " + "This is to maximize the inference speed", +) +@click.option( + "--show_tokens_per_sec/--no_show_tokens_per_sec", + default=False, + help="Whether to display the token generation speed or not", +) +@click.option( + "--task", + default="chat", + type=str, + help="The task to use for the pipeline. 
Choose any of " + "`chat`, `codegen`, `text-generation`", +) +def main( + model_path: str, + data: Optional[str], + sequence_length: int, + sampling_temperature: float, + prompt_sequence_length: int, + show_tokens_per_sec: bool, + task: str, +): + """ + Command Line utility to interact with a text genration LLM in a chatbot style + + Example usage: + + deepsparse.infer [OPTIONS] + """ + session_ids = "chatbot_cli_session" + + pipeline = Pipeline.create( + task=task, # let pipeline determine if task is supported + model_path=model_path, + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + + if data is not None: + for prompt in _iter_prompt_from_file(data): + # TODO: George run inference + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + return + + # continue prompts until a keyboard interrupt + while data is None: # always True in interactive Mode + prompt_input = input(">>> ") + pipeline_inputs = dict( + prompt=[prompt_input], + sampling_temperature=sampling_temperature, + ) + + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + + +def _iter_prompt_from_file(data: str) -> Iterator: + """ + TODO: George + .txt - each line is a single prompt + .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs + i.e. + prompt,sampling_temperature + prompt 1,0.9 + + this would make pipeline(prompt="prompt 1", temperature=0.9) + + .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs + + [ + {}, + {}, + ] + .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above + {} + {} + {} + {} + """ + parser = PromptParser(data) + return parser.parse_as_iterable() + + +if __name__ == "__main__": + main() diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py new file mode 100644 index 0000000000..6b45e23917 --- /dev/null +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import csv +import json +import os +from enum import Enum +from typing import Iterator, Optional + + +class InvalidPromptSourceDirectoryException(Exception): + pass + + +class PromptParser: + class Extentions(Enum): + TEXT = ".txt" + CSV = ".csv" + JSON = ".json" + JSONL = ".jsonl" + + def __init__(self, filename: str): + self.extention = self._validate_and_return_extention(filename) + self.filename: str = filename + + def parse_as_iterable(self): + + if self.extention == self.Extentions.TEXT: + return self._parse_text() + if self.extention == self.Extentions.CSV: + return self._parse_csv() + if self.extention == self.Extentions.JSON: + return self._parse_json_list() + if self.extention == self.Extentions.JSONL: + return self._parse_jsonl() + + def _parse_text(self): + with open(self.filename, "r") as file: + for line in file: + yield line.strip() + + def _parse_csv(self, column_name: str = "prompt"): + with open(self.filename, "r", newline="", encoding="utf-8-sig") as file: + reader = csv.DictReader(file) + for row in reader: + yield row + + def _parse_json_list(self): + with open(self.filename, "r") as file: + json_list = json.load(file) + for json_object in json_list: + yield json_object + + def _parse_jsonl(self): + with open(self.filename, "r") as file: + for jsonl in file: + yield json.loads(jsonl) + + def _validate_and_return_extention(self, filename: str): + if os.path.exists(filename): + + for extention in self.Extentions: + if filename.endswith(extention.value): + return extention + + raise InvalidPromptSourceDirectoryException( + f"{filename} is not a valid source extract batched prompts" + ) + raise FileNotFoundError From 8a47e011291653611024a0219507ad829112a049 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 17:30:46 +0000 Subject: [PATCH 09/17] allow input to pass thru cli --- setup.py | 2 +- .../transformers/inference/__init__.py | 13 ++ .../inference/{main.py => infer.py} | 128 ++++++++---------- .../transformers/inference/prompt_parser.py | 14 +- 4 files changed, 78 insertions(+), 79 deletions(-) rename src/deepsparse/transformers/inference/{main.py => infer.py} (66%) diff --git a/setup.py b/setup.py index d61c1fa312..6e7ca1bb96 100644 --- a/setup.py +++ b/setup.py @@ -298,7 +298,7 @@ def _setup_entry_points() -> Dict: "console_scripts": [ f"deepsparse.transformers.run_inference={data_api_entrypoint}", f"deepsparse.transformers.eval_downstream={eval_downstream}", - "deepsparse.infer=deepsparse.transformers.infer:main", + "deepsparse.infer=deepsparse.transformers.inference.infer:main", "deepsparse.debug_analysis=deepsparse.debug_analysis:main", "deepsparse.analyze=deepsparse.analyze:main", "deepsparse.check_hardware=deepsparse.cpu:print_hardware_capability", diff --git a/src/deepsparse/transformers/inference/__init__.py b/src/deepsparse/transformers/inference/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/src/deepsparse/transformers/inference/__init__.py +++ b/src/deepsparse/transformers/inference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/transformers/inference/main.py b/src/deepsparse/transformers/inference/infer.py similarity index 66% rename from src/deepsparse/transformers/inference/main.py rename to src/deepsparse/transformers/inference/infer.py index 9569aca3b0..a0c11aa189 100644 --- a/src/deepsparse/transformers/inference/main.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -63,9 +63,7 @@ deepsparse.infer models/llama/deployment \ --task text-generation """ -import csv -import json -from enum import Enum + from typing import Iterator, Optional import click @@ -83,7 +81,7 @@ @click.argument("model_path", type=str) @click.option( "--data", - type=Optional[str], + type=str, default=None, help="Path to .txt, .csv, .json, or .jsonl file to load data from" "If provided, runs inference over the entire dataset. If not provided " @@ -149,85 +147,73 @@ def main( prompt_sequence_length=prompt_sequence_length, ) - if data is not None: - for prompt in _iter_prompt_from_file(data): - # TODO: George run inference - pipeline_inputs = dict( - prompt=[prompt], - sampling_temperature=sampling_temperature, + if data: + for prompt, prompt_kwargs in _iter_prompt_from_file(data): + prompt_kwargs = {} + _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt, + **prompt_kwargs, ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - return + return # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode prompt_input = input(">>> ") - pipeline_inputs = dict( - prompt=[prompt_input], - sampling_temperature=sampling_temperature, + _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt_input, ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - def _iter_prompt_from_file(data: str) -> Iterator: - """ - TODO: George - .txt - each line is a single prompt - .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only 
look at values in that column, can treat other columns as kwargs - i.e. - prompt,sampling_temperature - prompt 1,0.9 - - this would make pipeline(prompt="prompt 1", temperature=0.9) - - .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs - - [ - {}, - {}, - ] - .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above - {} - {} - {} - {} - """ parser = PromptParser(data) return parser.parse_as_iterable() +def _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt, + **kwargs, +): + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + # **kwargs, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + + if __name__ == "__main__": main() diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 6b45e23917..009cd26d6e 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -17,7 +17,6 @@ import json import os from enum import Enum -from typing import Iterator, Optional class InvalidPromptSourceDirectoryException(Exception): @@ -32,7 +31,7 @@ class Extentions(Enum): JSONL = ".jsonl" def __init__(self, filename: str): - self.extention = self._validate_and_return_extention(filename) + self.extention: self.Extentions = self._validate_and_return_extention(filename) self.filename: str = filename def parse_as_iterable(self): @@ -49,24 +48,25 @@ def parse_as_iterable(self): def _parse_text(self): with open(self.filename, "r") as file: for line in file: - yield line.strip() + yield line.strip(), {} - def _parse_csv(self, column_name: str = "prompt"): + def _parse_csv(self): with open(self.filename, "r", newline="", encoding="utf-8-sig") as file: reader = csv.DictReader(file) for row in reader: - yield row + yield row.get("prompt"), row def _parse_json_list(self): with open(self.filename, "r") as file: json_list = json.load(file) for json_object in json_list: - yield json_object + yield json_object.get("prompt"), json_object def _parse_jsonl(self): with open(self.filename, "r") as file: for jsonl in file: - yield json.loads(jsonl) + jsonl_object = json.loads(jsonl) + yield jsonl_object.get("prompt"), jsonl_object def _validate_and_return_extention(self, filename: str): if os.path.exists(filename): From b429917dd55a236f24e001658a048c209c3135e2 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 09:56:58 -0400 Subject: [PATCH 10/17] Update src/deepsparse/transformers/inference/infer.py Co-authored-by: Rahul Tuli --- src/deepsparse/transformers/inference/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index a0c11aa189..e3929e9c6c 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ 
b/src/deepsparse/transformers/inference/infer.py @@ -193,7 +193,7 @@ def _run_inference( ): pipeline_inputs = dict( prompt=[prompt], - sampling_temperature=sampling_temperature, + temperature=sampling_temperature, # **kwargs, ) if SupportedTasks.is_chat(task): From 1dc2ee378e81ad3fc164ac2a2453689608987ead Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 09:57:47 -0400 Subject: [PATCH 11/17] remove hardcoded --- src/deepsparse/transformers/inference/infer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index e3929e9c6c..8abaa1b1e0 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -149,7 +149,6 @@ def main( if data: for prompt, prompt_kwargs in _iter_prompt_from_file(data): - prompt_kwargs = {} _run_inference( pipeline, sampling_temperature, From 86a2daf626f7057a707dbb1b2300f4b0fc5f177c Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 10:01:09 -0400 Subject: [PATCH 12/17] better error message --- src/deepsparse/transformers/inference/prompt_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 009cd26d6e..34c53ec896 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -76,6 +76,6 @@ def _validate_and_return_extention(self, filename: str): return extention raise InvalidPromptSourceDirectoryException( - f"{filename} is not a valid source extract batched prompts" + f"{filename} is not a parsable data for inference" ) raise FileNotFoundError From 939c6bc820ce167952545842826a85e08a85d48a Mon Sep 17 00:00:00 2001 From: horheynm Date: Wed, 27 Sep 2023 14:40:43 +0000 Subject: [PATCH 13/17] clean up --- .../transformers/inference/infer.py | 4 +-- .../transformers/inference/prompt_parser.py | 26 +++++++++++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index 8abaa1b1e0..aa2cab3e36 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -141,7 +141,7 @@ def main( session_ids = "chatbot_cli_session" pipeline = Pipeline.create( - task=task, # let pipeline determine if task is supported + task=task, # let the pipeline determine if task is supported model_path=model_path, sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, @@ -193,7 +193,7 @@ def _run_inference( pipeline_inputs = dict( prompt=[prompt], temperature=sampling_temperature, - # **kwargs, + **kwargs, ) if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 34c53ec896..92bcd145d8 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -23,28 +23,36 @@ class InvalidPromptSourceDirectoryException(Exception): pass +class UnableToParseExtentionException(Exception): + pass + + class PromptParser: - class Extentions(Enum): + class Extensions(Enum): TEXT = ".txt" CSV = ".csv" JSON = ".json" JSONL = ".jsonl" def __init__(self, filename: str): - self.extention: self.Extentions = 
+        self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

     def parse_as_iterable(self):
-        if self.extention == self.Extentions.TEXT:
+        if self.extention == self.Extensions.TEXT:
             return self._parse_text()
-        if self.extention == self.Extentions.CSV:
+        if self.extention == self.Extensions.CSV:
             return self._parse_csv()
-        if self.extention == self.Extentions.JSON:
+        if self.extention == self.Extensions.JSON:
             return self._parse_json_list()
-        if self.extention == self.Extentions.JSONL:
+        if self.extention == self.Extensions.JSONL:
             return self._parse_jsonl()

+        raise UnableToParseExtentionException(
+            f"Parser for {self.extention} does not exist"
+        )
+
     def _parse_text(self):
         with open(self.filename, "r") as file:
             for line in file:
@@ -71,11 +79,13 @@ def _parse_jsonl(self):

     def _validate_and_return_extention(self, filename: str):
         if os.path.exists(filename):
-            for extention in self.Extentions:
+            for extention in self.Extensions:
                 if filename.endswith(extention.value):
                     return extention

             raise InvalidPromptSourceDirectoryException(
-                f"{filename} is not a parsable data for inference"
+                f"{filename} is not compatible. Select file that has "
+                "extension from "
+                f"{[key.name for key in self.Extensions]}"
             )
         raise FileNotFoundError

From 7b1edfa9ecaa3b079ac24c896d4c48720c909a15 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:09:43 +0000
Subject: [PATCH 14/17] clean up, check kwargs

---
 .../transformers/inference/infer.py           | 31 +++++++------
 .../transformers/inference/prompt_parser.py   | 45 +++++++++++++------
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index aa2cab3e36..460b8499c4 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -64,7 +64,7 @@

     --task text-generation
 """
-from typing import Iterator, Optional
+from typing import Optional

 import click

@@ -148,22 +148,26 @@ def main(
     )

     if data:
-        for prompt, prompt_kwargs in _iter_prompt_from_file(data):
+        prompt_parser = PromptParser(data)
+        default_prompt_kwargs = {
+            "sequence_length": sequence_length,
+            "sampling_temperature": sampling_temperature,
+            "prompt_sequence_length": prompt_sequence_length,
+            "show_tokens_per_sec": show_tokens_per_sec,
+        }
+
+        for prompt_kwargs in prompt_parser.parse_as_iterable(**default_prompt_kwargs):
             _run_inference(
-                pipeline,
-                sampling_temperature,
-                task,
-                session_ids,
-                show_tokens_per_sec,
-                prompt_sequence_length,
-                prompt,
+                task=task,
+                pipeline=pipeline,
+                session_ids=session_ids,
                 **prompt_kwargs,
             )
         return

     # continue prompts until a keyboard interrupt
     while data is None:  # always True in interactive Mode
-        prompt_input = input(">>> ")
+        prompt = input(">>> ")
         _run_inference(
             pipeline,
             sampling_temperature,
@@ -171,15 +175,10 @@ def main(
             session_ids,
             show_tokens_per_sec,
             prompt_sequence_length,
-            prompt_input,
+            prompt,
         )


-def _iter_prompt_from_file(data: str) -> Iterator:
-    parser = PromptParser(data)
-    return parser.parse_as_iterable()
-
-
 def _run_inference(
     pipeline,
     sampling_temperature,
diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 92bcd145d8..7a344039c1 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -17,6 +17,7 @@
 import json
 import os
 from enum import Enum
+from typing import Iterator, Tuple


 class InvalidPromptSourceDirectoryException(Exception):
@@ -27,6 +28,18 @@ class UnableToParseExtentionException(Exception):
     pass


+def parse_value_to_appropriate_type(value: str):
+    if value.isdigit():
+        return int(value)
+    if "." in str(value) and all(part.isdigit() for part in value.split(".", 1)):
+        return float(value)
+    if value.lower() == "true":
+        return True
+    if value.lower() == "false":
+        return False
+    return value
+
+
 class PromptParser:
     class Extensions(Enum):
         TEXT = ".txt"
@@ -38,43 +51,49 @@ def __init__(self, filename: str):
         self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

-    def parse_as_iterable(self):
+    def parse_as_iterable(self, **kwargs) -> Iterator[Tuple]:

         if self.extention == self.Extensions.TEXT:
-            return self._parse_text()
+            return self._parse_text(**kwargs)
         if self.extention == self.Extensions.CSV:
-            return self._parse_csv()
+            return self._parse_csv(**kwargs)
         if self.extention == self.Extensions.JSON:
-            return self._parse_json_list()
+            return self._parse_json_list(**kwargs)
         if self.extention == self.Extensions.JSONL:
-            return self._parse_jsonl()
+            return self._parse_jsonl(**kwargs)

         raise UnableToParseExtentionException(
             f"Parser for {self.extention} does not exist"
         )

-    def _parse_text(self):
+    def _parse_text(self, **kwargs):
         with open(self.filename, "r") as file:
             for line in file:
-                yield line.strip(), {}
+                kwargs["prompt"] = line.strip()
+                yield kwargs

-    def _parse_csv(self):
+    def _parse_csv(self, **kwargs):
         with open(self.filename, "r", newline="", encoding="utf-8-sig") as file:
             reader = csv.DictReader(file)
             for row in reader:
-                yield row.get("prompt"), row
+                for key, value in row.items():
+                    kwargs.update({key: parse_value_to_appropriate_type(value)})
+                yield kwargs

-    def _parse_json_list(self):
+    def _parse_json_list(self, **kwargs):
         with open(self.filename, "r") as file:
             json_list = json.load(file)
             for json_object in json_list:
-                yield json_object.get("prompt"), json_object
+                kwargs.update(json_object)
+                yield kwargs

-    def _parse_jsonl(self):
+    def _parse_jsonl(self, **kwargs):
         with open(self.filename, "r") as file:
             for jsonl in file:
                 jsonl_object = json.loads(jsonl)
-                yield jsonl_object.get("prompt"), jsonl_object
+                breakpoint()
+                kwargs.update(jsonl_object)
+                yield kwargs

     def _validate_and_return_extention(self, filename: str):
         if os.path.exists(filename):

From 46998376109ae1f1d416499f69892f33d1f01f27 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:12:32 +0000
Subject: [PATCH 15/17] get rid of breakpoint()

---
 src/deepsparse/transformers/inference/prompt_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 7a344039c1..1ebedcffd1 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -91,7 +91,6 @@ def _parse_jsonl(self, **kwargs):
         with open(self.filename, "r") as file:
             for jsonl in file:
                 jsonl_object = json.loads(jsonl)
-                breakpoint()
                 kwargs.update(jsonl_object)
                 yield kwargs

From ff4b48f0629a68097e32075da2fe76b89f4af07e Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:15:09 +0000
Subject: [PATCH 16/17] return type

---
 src/deepsparse/transformers/inference/prompt_parser.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 1ebedcffd1..ce2f6986cb 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -17,7 +17,7 @@
 import json
 import os
 from enum import Enum
-from typing import Iterator, Tuple
+from typing import Iterator


 class InvalidPromptSourceDirectoryException(Exception):
@@ -51,8 +51,7 @@ def __init__(self, filename: str):
         self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

-    def parse_as_iterable(self, **kwargs) -> Iterator[Tuple]:
-
+    def parse_as_iterable(self, **kwargs) -> Iterator:
         if self.extention == self.Extensions.TEXT:
             return self._parse_text(**kwargs)
         if self.extention == self.Extensions.CSV:

From 2a4b972effea6bacecc7ea54702ec5927d168070 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Mon, 2 Oct 2023 19:18:48 +0000
Subject: [PATCH 17/17] typo

---
 src/deepsparse/transformers/inference/prompt_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index ce2f6986cb..35c433b11f 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -24,7 +24,7 @@ class InvalidPromptSourceDirectoryException(Exception):
     pass


-class UnableToParseExtentionException(Exception):
+class UnableToParseExtentsonException(Exception):
     pass


@@ -61,7 +61,7 @@ def parse_as_iterable(self, **kwargs) -> Iterator:
         if self.extention == self.Extensions.JSONL:
             return self._parse_jsonl(**kwargs)

-        raise UnableToParseExtentionException(
+        raise UnableToParseExtentsonException(
             f"Parser for {self.extention} does not exist"
         )
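
For reference, a minimal usage sketch of the --data path these patches build up: a made-up prompts.csv is fed through PromptParser, and each yielded kwargs dict is what main() splats into _run_inference after PATCH 14. The file name, prompt strings, and numeric values below are illustrative assumptions, not part of the series; the import path and parse_as_iterable(**kwargs) behavior follow PATCH 13-16.

# Illustrative sketch only -- not a diff from the series above.
# Assumes this branch is installed so the module added in these patches,
# deepsparse.transformers.inference.prompt_parser, is importable.
from deepsparse.transformers.inference.prompt_parser import PromptParser

# Hypothetical data file; column layout follows the docstring example
# (a "prompt" column plus extra columns treated as pipeline kwargs).
with open("prompts.csv", "w", encoding="utf-8") as data_file:
    data_file.write("prompt,sampling_temperature\n")
    data_file.write("prompt 1,0.9\n")
    data_file.write("prompt 2,0.7\n")

# __init__ checks the file exists and matches a supported extension.
parser = PromptParser("prompts.csv")

# main() seeds the iterator with the values of its CLI options; the numbers
# here are placeholders, not the CLI defaults.
default_prompt_kwargs = {
    "sequence_length": 512,
    "sampling_temperature": 1.0,
    "prompt_sequence_length": 16,
    "show_tokens_per_sec": False,
}

for prompt_kwargs in parser.parse_as_iterable(**default_prompt_kwargs):
    # Each CSV row overrides the defaults: "0.9" is coerced to a float by
    # parse_value_to_appropriate_type while "prompt 1" stays a string, so
    # _run_inference(**prompt_kwargs) ends up calling the pipeline with
    # prompt=["prompt 1"] and temperature=0.9.
    print(prompt_kwargs)

The equivalent end-to-end invocation would be along the lines of deepsparse.infer <model_path> --data prompts.csv --task text-generation, with <model_path> standing in for a real deployment directory.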