From 6c8dfb3675f94cf8abaffd8899305d76fbbd1380 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Mon, 25 Sep 2023 16:31:40 -0400 Subject: [PATCH 01/17] [deepsparse.infer] UX improvements, data only mode --- src/deepsparse/transformers/infer.py | 49 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index e4f8ad26f3..7332a77579 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -64,6 +64,7 @@ --task text-generation """ import click +from typing import Optional from deepsparse import Pipeline from deepsparse.tasks import SupportedTasks @@ -75,6 +76,14 @@ ) ) @click.argument("model_path", type=str) +@click.option( + "--data", + type=Optional[str], + default=None, + help="Path to .txt, .csv, .json, or .jsonl file to load data from" + "If provided, runs inference over the entire dataset. If not provided " + "runs an interactive inference session in the console. Default None.", +) @click.option( "--sequence_length", type=int, @@ -112,6 +121,7 @@ ) def main( model_path: str, + data: Optional[str], sequence_length: int, sampling_temperature: float, prompt_sequence_length: int, @@ -135,16 +145,23 @@ def main( prompt_sequence_length=prompt_sequence_length, ) + if data is not None: + for prompt in _iter_prompt_from_file(data): + # TODO: George run inference + pass + return + # continue prompts until a keyboard interrupt - while True: - input_text = input("User: ") + while data is None: # always True in interactive Mode + input_text = input(">>> ") pipeline_inputs = {"prompt": [input_text]} if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids response = pipeline(**pipeline_inputs) - print("Bot: ", response.generations[0].text) + print("\n", response.generations[0].text) + if show_tokens_per_sec: times = pipeline.timer_manager.times prefill_speed = ( @@ -158,5 +175,31 @@ def main( ) +def _iter_prompt_from_file(data: str): + """ + TODO: George + .txt - each line is a single prompt + .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs + i.e. 
+ prompt,sampling_temperature + prompt 1,0.9 + + this would make pipeline(prompt="prompt 1", temperature=0.9) + + .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs + + [ + {}, + {}, + ] + .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above + {} + {} + {} + {} + """ + pass + + if __name__ == "__main__": main() From b4b7ec65d6456c64461e2aef543b4f4e311ea4a1 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Mon, 25 Sep 2023 16:55:09 -0400 Subject: [PATCH 02/17] fix bug on main --- src/deepsparse/transformers/infer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 7332a77579..82e655649a 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -141,7 +141,6 @@ def main( task=task, # let pipeline determine if task is supported model_path=model_path, sequence_length=sequence_length, - sampling_temperature=sampling_temperature, prompt_sequence_length=prompt_sequence_length, ) @@ -153,8 +152,11 @@ def main( # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode - input_text = input(">>> ") - pipeline_inputs = {"prompt": [input_text]} + prompt_input = input(">>> ") + pipeline_inputs = dict( + prompt=[prompt_input], + sampling_temperature=sampling_temperature, + ) if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids From 17168f6a89453503b414886ff807945201e0b6be Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 04:13:35 +0000 Subject: [PATCH 03/17] draft, load files line by line, return iter, save up memory --- src/deepsparse/transformers/infer.py | 91 +++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 82e655649a..13f25f56c4 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -63,11 +63,95 @@ deepsparse.infer models/llama/deployment \ --task text-generation """ -import click +import csv +import json +from enum import Enum from typing import Optional +import click + +import Exception from deepsparse import Pipeline from deepsparse.tasks import SupportedTasks +from typing import Iterator + + + +class InvalidPromptSourceDirectoryException(Exception): + pass + + +class PromptParser: + class Extentions(Enum): + TEXT = ".txt" + CSV = ".csv" + JSON = ".json" + JSONL = ".jsonl" + + def __init__(self, filename: str): + self.extention = self._validate_and_return_extention(filename) + self.filename: str = filename + + def parse(self): + + if self.extention == self.Extentions.TEXT: + return self._parse_text() + if self.extention == self.Extentions.CSV: + return self._parse_csv() + if self.extention == self.Extentions.JSON or self.extention == self.Extentions.JSONL: + return self._parse_json_list() + + + def _parse_text(self): + try: + with open(self.filename, "r") as file: + for line in file: + yield line.strip() + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' not found.") + + def _parse_csv(self, column_name: str = "prompt"): + try: + with open(self.filename, "r", newline="") as file: + reader = csv.DictReader(file) + for row in reader: + yield row[column_name] + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' was not found.") + except KeyError: + 
raise + # print(f"Column '{column_name}' not found in the CSV.") + + def _parse_json(self, prompt_key: str = "prompt"): + try: + # with open(self.filename, "w") as file: + # json_list = json.dump(self.filename, file) + # for json_obj in json_list: + # yield json_obj[prompt_key] + with open(self.filename, 'r') as file: + json_list = json.load(file) + for json_object in json_list: + yield json_object[prompt_key] + except FileNotFoundError: + raise + # print(f"The file '{self.filename}' was not found.") + except KeyError: + raise + # print(f"Column '{column_name}' not found in the CSV.") + + def _validate_and_return_extention(self, data: str): + for extention in self.Extentions: + if self.data.endswith(extention.value): + return extention.value + + raise InvalidPromptSourceDirectoryException( + f"{data} is not a valid source extract batched prompts" + ) + + # if not data.endswith(tuple(extension.value for extension in self.Extensions)): + # @click.command( @@ -177,7 +261,7 @@ def main( ) -def _iter_prompt_from_file(data: str): +def _iter_prompt_from_file(data: str) -> Iterator: """ TODO: George .txt - each line is a single prompt @@ -200,7 +284,8 @@ def _iter_prompt_from_file(data: str): {} {} """ - pass + parser = PromptParser(data) + return parser.parse_as_iterable() if __name__ == "__main__": From 84b03f82808361eb34714b84a7c7a5d4b51d6a11 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 14:57:17 +0000 Subject: [PATCH 04/17] add inference --- src/deepsparse/transformers/infer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py index 13f25f56c4..be2409b9d9 100644 --- a/src/deepsparse/transformers/infer.py +++ b/src/deepsparse/transformers/infer.py @@ -231,8 +231,28 @@ def main( if data is not None: for prompt in _iter_prompt_from_file(data): # TODO: George run inference - pass - return + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + return # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode From 8a69c5f39dcd05e7ead551ab5fdf09de409822ba Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:02:22 +0000 Subject: [PATCH 05/17] pass passing in files --- src/deepsparse/transformers/infer.py | 312 --------------------------- 1 file changed, 312 deletions(-) delete mode 100644 src/deepsparse/transformers/infer.py diff --git a/src/deepsparse/transformers/infer.py b/src/deepsparse/transformers/infer.py deleted file mode 100644 index be2409b9d9..0000000000 --- a/src/deepsparse/transformers/infer.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Usage: deepsparse.infer [OPTIONS] MODEL_PATH - - Command Line utility to interact with a text genration LLM in a chatbot - style - - Example usage: - - deepsparse.infer [OPTIONS] - -Options: - --sequence_length INTEGER Sequence length to compile model and - tokenizer for.This controls the maximum - context length of the pipeline. [default: - 512] - --sampling_temperature FLOAT The temperature to use when samplingfrom the - probability distribution computed from the - logits.Higher values will result in more - random samples. Shouldbe greater than 0.0. - [default: 1.0] - --prompt_sequence_length INTEGER - Processed prompt in chunks of this length. - This is to maximize the inference speed - [default: 64] - --show_tokens_per_sec / --no_show_tokens_per_sec - Whether to display the token generation - speed or not [default: - no_show_tokens_per_sec] - --task TEXT The task to use for the pipeline. Choose any - of `chat`, `codegen`, `text-generation` - [default: chat] - --help Show this message and exit. - -Installation: pip install deepsparse[transformers] -Examples: - -1) Use a local deployment directory -deepsparse.infer models/llama/deployment - -2) Use a SparseZoo stub -deepsparse.infer \ - zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none # noqa: E501 - -3) Display token generation speed -deepsparse.infer models/llama/deployment \ - --show_tokens_per_sec - -4) Disable history -deepsparse.infer models/llama/deployment \ - --task text-generation -""" -import csv -import json -from enum import Enum -from typing import Optional - -import click - -import Exception -from deepsparse import Pipeline -from deepsparse.tasks import SupportedTasks -from typing import Iterator - - - -class InvalidPromptSourceDirectoryException(Exception): - pass - - -class PromptParser: - class Extentions(Enum): - TEXT = ".txt" - CSV = ".csv" - JSON = ".json" - JSONL = ".jsonl" - - def __init__(self, filename: str): - self.extention = self._validate_and_return_extention(filename) - self.filename: str = filename - - def parse(self): - - if self.extention == self.Extentions.TEXT: - return self._parse_text() - if self.extention == self.Extentions.CSV: - return self._parse_csv() - if self.extention == self.Extentions.JSON or self.extention == self.Extentions.JSONL: - return self._parse_json_list() - - - def _parse_text(self): - try: - with open(self.filename, "r") as file: - for line in file: - yield line.strip() - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' not found.") - - def _parse_csv(self, column_name: str = "prompt"): - try: - with open(self.filename, "r", newline="") as file: - reader = csv.DictReader(file) - for row in reader: - yield row[column_name] - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' was not found.") - except KeyError: - raise - # print(f"Column '{column_name}' not found in the CSV.") - - def _parse_json(self, prompt_key: str = "prompt"): - try: - # with open(self.filename, "w") as file: - # json_list = json.dump(self.filename, file) - # for json_obj in json_list: - # yield 
json_obj[prompt_key] - with open(self.filename, 'r') as file: - json_list = json.load(file) - for json_object in json_list: - yield json_object[prompt_key] - except FileNotFoundError: - raise - # print(f"The file '{self.filename}' was not found.") - except KeyError: - raise - # print(f"Column '{column_name}' not found in the CSV.") - - def _validate_and_return_extention(self, data: str): - for extention in self.Extentions: - if self.data.endswith(extention.value): - return extention.value - - raise InvalidPromptSourceDirectoryException( - f"{data} is not a valid source extract batched prompts" - ) - - # if not data.endswith(tuple(extension.value for extension in self.Extensions)): - # - - -@click.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ) -) -@click.argument("model_path", type=str) -@click.option( - "--data", - type=Optional[str], - default=None, - help="Path to .txt, .csv, .json, or .jsonl file to load data from" - "If provided, runs inference over the entire dataset. If not provided " - "runs an interactive inference session in the console. Default None.", -) -@click.option( - "--sequence_length", - type=int, - default=512, - help="Sequence length to compile model and tokenizer for." - "This controls the maximum context length of the pipeline.", -) -@click.option( - "--sampling_temperature", - type=float, - default=1.0, - help="The temperature to use when sampling" - "from the probability distribution computed from the logits." - "Higher values will result in more random samples. Should" - "be greater than 0.0.", -) -@click.option( - "--prompt_sequence_length", - type=int, - default=64, - help="Processed prompt in chunks of this length. " - "This is to maximize the inference speed", -) -@click.option( - "--show_tokens_per_sec/--no_show_tokens_per_sec", - default=False, - help="Whether to display the token generation speed or not", -) -@click.option( - "--task", - default="chat", - type=str, - help="The task to use for the pipeline. 
Choose any of " - "`chat`, `codegen`, `text-generation`", -) -def main( - model_path: str, - data: Optional[str], - sequence_length: int, - sampling_temperature: float, - prompt_sequence_length: int, - show_tokens_per_sec: bool, - task: str, -): - """ - Command Line utility to interact with a text genration LLM in a chatbot style - - Example usage: - - deepsparse.infer [OPTIONS] - """ - session_ids = "chatbot_cli_session" - - pipeline = Pipeline.create( - task=task, # let pipeline determine if task is supported - model_path=model_path, - sequence_length=sequence_length, - prompt_sequence_length=prompt_sequence_length, - ) - - if data is not None: - for prompt in _iter_prompt_from_file(data): - # TODO: George run inference - pipeline_inputs = dict( - prompt=[prompt], - sampling_temperature=sampling_temperature, - ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - return - - # continue prompts until a keyboard interrupt - while data is None: # always True in interactive Mode - prompt_input = input(">>> ") - pipeline_inputs = dict( - prompt=[prompt_input], - sampling_temperature=sampling_temperature, - ) - - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - - -def _iter_prompt_from_file(data: str) -> Iterator: - """ - TODO: George - .txt - each line is a single prompt - .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs - i.e. 
- prompt,sampling_temperature - prompt 1,0.9 - - this would make pipeline(prompt="prompt 1", temperature=0.9) - - .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs - - [ - {}, - {}, - ] - .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above - {} - {} - {} - {} - """ - parser = PromptParser(data) - return parser.parse_as_iterable() - - -if __name__ == "__main__": - main() From 4091503f2c8115a12b673163dd4abea140ecf41c Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:28:01 +0000 Subject: [PATCH 06/17] latest changes' --- .../pipelines/test_text_generation.py | 229 +++++++++--------- 1 file changed, 115 insertions(+), 114 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index f781e4f119..e126385a14 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -82,7 +82,7 @@ def Fibonacci(n): ], scope="class", ) -@pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") +# @pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") class TestTextGenerationPipeline: """ This test suite is meant to test the main scenarios of @@ -154,119 +154,119 @@ def setup( yield model_name, uses_bos_token, torch_ground_truth - def test_freeze_first_position(self, setup): - # Test whether we should be "freezing" the first token after - # the kv cache is full - _, uses_bos_token, _ = setup - pipeline = self.get_pipeline() - assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token - - def test_ort_single_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by single-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=1, - engine_type="onnxruntime", - ) - pipeline._debug = True - - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - assert output.total_num_processed_tokens[0] < self.sequence_length - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - ) - - def test_ort_multi_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
- ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=self.prompt_sequence_length, - engine_type="onnxruntime", - ) - pipeline._debug = True - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - - assert output.total_num_processed_tokens[0] < self.sequence_length - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - ) - - def test_ort_generation_after_kv_cache_has_been_filled(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is filled up (old entries are removed) - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - _, _, torch_ground_truth = setup - pipeline = self.get_pipeline( - task=self.pipeline_type, - model_path=self.model_stub, - sequence_length=self.sequence_length_short, - prompt_sequence_length=self.prompt_sequence_length, - engine_type="onnxruntime", - ) - pipeline._debug = True - - config = GenerationConfig( - output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - ) - output = pipeline( - sequences=self.prompt, include_prompt_logits=True, generation_config=config - ) - - assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( - "for this scenario, the kv cache should be full: " - "the total number of processed tokens should be " - "greater than the sequence length" - ) - - self._test_output( - output=output, - torch_ground_truth=torch_ground_truth, - max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 - ) + # def test_freeze_first_position(self, setup): + # # Test whether we should be "freezing" the first token after + # # the kv cache is full + # _, uses_bos_token, _ = setup + # pipeline = self.get_pipeline() + # assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token + + # def test_ort_single_token_prefill(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. Prompt preprocessing is performed by single-token engine + # # 2. The KV Cache is never filled up + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length, + # prompt_sequence_length=1, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + # assert output.total_num_processed_tokens[0] < self.sequence_length + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # ) + + # def test_ort_multi_token_prefill(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. 
Prompt preprocessing is performed by multi-token engine + # # 2. The KV Cache is never filled up + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length, + # prompt_sequence_length=self.prompt_sequence_length, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + + # assert output.total_num_processed_tokens[0] < self.sequence_length + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # ) + + # def test_ort_generation_after_kv_cache_has_been_filled(self, setup): + # # Test the pipeline that uses ORT engine. The test covers the + # # following scenario: + # # 1. Prompt preprocessing is performed by multi-token engine + # # 2. The KV Cache is filled up (old entries are removed) + # # 3. KV Cache managed externally + + # if self.internal_kv_cache: + # pytest.skip( + # "Cannot run ORT pipeline with the internal deepsparse cache enabled." + # ) + # _, _, torch_ground_truth = setup + # pipeline = self.get_pipeline( + # task=self.pipeline_type, + # model_path=self.model_stub, + # sequence_length=self.sequence_length_short, + # prompt_sequence_length=self.prompt_sequence_length, + # engine_type="onnxruntime", + # ) + # pipeline._debug = True + + # config = GenerationConfig( + # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + # ) + # output = pipeline( + # sequences=self.prompt, include_prompt_logits=True, generation_config=config + # ) + + # assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( + # "for this scenario, the kv cache should be full: " + # "the total number of processed tokens should be " + # "greater than the sequence length" + # ) + + # self._test_output( + # output=output, + # torch_ground_truth=torch_ground_truth, + # max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 + # ) def test_deepsparse_single_token_prefill(self, setup): # Test the pipeline that uses deepsparse engine. The test covers the @@ -276,6 +276,7 @@ def test_deepsparse_single_token_prefill(self, setup): # 3. 
KV Cache managed externally or internally _, _, torch_ground_truth = setup + breakpoint() pipeline = self.get_pipeline( task=self.pipeline_type, model_path=self.model_stub, From b0f65af956aea4949bdf8fdb9ee3c322d3f1d601 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:29:45 +0000 Subject: [PATCH 07/17] revert --- .../pipelines/test_text_generation.py | 229 +++++++++--------- 1 file changed, 114 insertions(+), 115 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index e126385a14..f781e4f119 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -82,7 +82,7 @@ def Fibonacci(n): ], scope="class", ) -# @pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") +@pytest.mark.skip(reason="Those tests are too heavy to run as a normal part of the CI.") class TestTextGenerationPipeline: """ This test suite is meant to test the main scenarios of @@ -154,119 +154,119 @@ def setup( yield model_name, uses_bos_token, torch_ground_truth - # def test_freeze_first_position(self, setup): - # # Test whether we should be "freezing" the first token after - # # the kv cache is full - # _, uses_bos_token, _ = setup - # pipeline = self.get_pipeline() - # assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token - - # def test_ort_single_token_prefill(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by single-token engine - # # 2. The KV Cache is never filled up - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." - # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length, - # prompt_sequence_length=1, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - # assert output.total_num_processed_tokens[0] < self.sequence_length - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # ) - - # def test_ort_multi_token_prefill(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by multi-token engine - # # 2. The KV Cache is never filled up - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
- # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length, - # prompt_sequence_length=self.prompt_sequence_length, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - - # assert output.total_num_processed_tokens[0] < self.sequence_length - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # ) - - # def test_ort_generation_after_kv_cache_has_been_filled(self, setup): - # # Test the pipeline that uses ORT engine. The test covers the - # # following scenario: - # # 1. Prompt preprocessing is performed by multi-token engine - # # 2. The KV Cache is filled up (old entries are removed) - # # 3. KV Cache managed externally - - # if self.internal_kv_cache: - # pytest.skip( - # "Cannot run ORT pipeline with the internal deepsparse cache enabled." - # ) - # _, _, torch_ground_truth = setup - # pipeline = self.get_pipeline( - # task=self.pipeline_type, - # model_path=self.model_stub, - # sequence_length=self.sequence_length_short, - # prompt_sequence_length=self.prompt_sequence_length, - # engine_type="onnxruntime", - # ) - # pipeline._debug = True - - # config = GenerationConfig( - # output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 - # ) - # output = pipeline( - # sequences=self.prompt, include_prompt_logits=True, generation_config=config - # ) - - # assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( - # "for this scenario, the kv cache should be full: " - # "the total number of processed tokens should be " - # "greater than the sequence length" - # ) - - # self._test_output( - # output=output, - # torch_ground_truth=torch_ground_truth, - # max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 - # ) + def test_freeze_first_position(self, setup): + # Test whether we should be "freezing" the first token after + # the kv cache is full + _, uses_bos_token, _ = setup + pipeline = self.get_pipeline() + assert prepends_bos_token(pipeline.tokenizer) == uses_bos_token + + def test_ort_single_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + engine_type="onnxruntime", + ) + pipeline._debug = True + + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + assert output.total_num_processed_tokens[0] < self.sequence_length + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + ) + + def test_ort_multi_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. 
Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=self.prompt_sequence_length, + engine_type="onnxruntime", + ) + pipeline._debug = True + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + + assert output.total_num_processed_tokens[0] < self.sequence_length + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + ) + + def test_ort_generation_after_kv_cache_has_been_filled(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is filled up (old entries are removed) + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + _, _, torch_ground_truth = setup + pipeline = self.get_pipeline( + task=self.pipeline_type, + model_path=self.model_stub, + sequence_length=self.sequence_length_short, + prompt_sequence_length=self.prompt_sequence_length, + engine_type="onnxruntime", + ) + pipeline._debug = True + + config = GenerationConfig( + output_scores=True, max_length=self.num_tokens_generate, top_k=0, top_p=0.0 + ) + output = pipeline( + sequences=self.prompt, include_prompt_logits=True, generation_config=config + ) + + assert output.total_num_processed_tokens[0] > self.sequence_length_short, ( + "for this scenario, the kv cache should be full: " + "the total number of processed tokens should be " + "greater than the sequence length" + ) + + self._test_output( + output=output, + torch_ground_truth=torch_ground_truth, + max_logits_difference_threshold=self.logits_max_diff_kv_cache_has_been_filled, # noqa E501 + ) def test_deepsparse_single_token_prefill(self, setup): # Test the pipeline that uses deepsparse engine. The test covers the @@ -276,7 +276,6 @@ def test_deepsparse_single_token_prefill(self, setup): # 3. 
KV Cache managed externally or internally _, _, torch_ground_truth = setup - breakpoint() pipeline = self.get_pipeline( task=self.pipeline_type, model_path=self.model_stub, From 8ee765b2ccb1650bc70735755a5ca89a2c029b61 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 16:30:11 +0000 Subject: [PATCH 08/17] make new folder for inderence --- .../transformers/inference/__init__.py | 0 src/deepsparse/transformers/inference/main.py | 233 ++++++++++++++++++ .../transformers/inference/prompt_parser.py | 81 ++++++ 3 files changed, 314 insertions(+) create mode 100644 src/deepsparse/transformers/inference/__init__.py create mode 100644 src/deepsparse/transformers/inference/main.py create mode 100644 src/deepsparse/transformers/inference/prompt_parser.py diff --git a/src/deepsparse/transformers/inference/__init__.py b/src/deepsparse/transformers/inference/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/deepsparse/transformers/inference/main.py b/src/deepsparse/transformers/inference/main.py new file mode 100644 index 0000000000..9569aca3b0 --- /dev/null +++ b/src/deepsparse/transformers/inference/main.py @@ -0,0 +1,233 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage: deepsparse.infer [OPTIONS] MODEL_PATH + + Command Line utility to interact with a text genration LLM in a chatbot + style + + Example usage: + + deepsparse.infer [OPTIONS] + +Options: + --sequence_length INTEGER Sequence length to compile model and + tokenizer for.This controls the maximum + context length of the pipeline. [default: + 512] + --sampling_temperature FLOAT The temperature to use when samplingfrom the + probability distribution computed from the + logits.Higher values will result in more + random samples. Shouldbe greater than 0.0. + [default: 1.0] + --prompt_sequence_length INTEGER + Processed prompt in chunks of this length. + This is to maximize the inference speed + [default: 64] + --show_tokens_per_sec / --no_show_tokens_per_sec + Whether to display the token generation + speed or not [default: + no_show_tokens_per_sec] + --task TEXT The task to use for the pipeline. Choose any + of `chat`, `codegen`, `text-generation` + [default: chat] + --help Show this message and exit. 
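+  --data TEXT                   Path to a .txt, .csv, .json, or .jsonl file
+                                to load prompts from. If provided, runs
+                                inference over the entire dataset. If not
+                                provided, runs an interactive inference
+                                session in the console.  [default: None]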
+ +Installation: pip install deepsparse[transformers] +Examples: + +1) Use a local deployment directory +deepsparse.infer models/llama/deployment + +2) Use a SparseZoo stub +deepsparse.infer \ + zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none # noqa: E501 + +3) Display token generation speed +deepsparse.infer models/llama/deployment \ + --show_tokens_per_sec + +4) Disable history +deepsparse.infer models/llama/deployment \ + --task text-generation +""" +import csv +import json +from enum import Enum +from typing import Iterator, Optional + +import click + +from deepsparse import Pipeline +from deepsparse.tasks import SupportedTasks +from deepsparse.transformers.inference.prompt_parser import PromptParser + + +@click.command( + context_settings=dict( + token_normalize_func=lambda x: x.replace("-", "_"), show_default=True + ) +) +@click.argument("model_path", type=str) +@click.option( + "--data", + type=Optional[str], + default=None, + help="Path to .txt, .csv, .json, or .jsonl file to load data from" + "If provided, runs inference over the entire dataset. If not provided " + "runs an interactive inference session in the console. Default None.", +) +@click.option( + "--sequence_length", + type=int, + default=512, + help="Sequence length to compile model and tokenizer for." + "This controls the maximum context length of the pipeline.", +) +@click.option( + "--sampling_temperature", + type=float, + default=1.0, + help="The temperature to use when sampling" + "from the probability distribution computed from the logits." + "Higher values will result in more random samples. Should" + "be greater than 0.0.", +) +@click.option( + "--prompt_sequence_length", + type=int, + default=64, + help="Processed prompt in chunks of this length. " + "This is to maximize the inference speed", +) +@click.option( + "--show_tokens_per_sec/--no_show_tokens_per_sec", + default=False, + help="Whether to display the token generation speed or not", +) +@click.option( + "--task", + default="chat", + type=str, + help="The task to use for the pipeline. 
Choose any of " + "`chat`, `codegen`, `text-generation`", +) +def main( + model_path: str, + data: Optional[str], + sequence_length: int, + sampling_temperature: float, + prompt_sequence_length: int, + show_tokens_per_sec: bool, + task: str, +): + """ + Command Line utility to interact with a text genration LLM in a chatbot style + + Example usage: + + deepsparse.infer [OPTIONS] + """ + session_ids = "chatbot_cli_session" + + pipeline = Pipeline.create( + task=task, # let pipeline determine if task is supported + model_path=model_path, + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + + if data is not None: + for prompt in _iter_prompt_from_file(data): + # TODO: George run inference + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + return + + # continue prompts until a keyboard interrupt + while data is None: # always True in interactive Mode + prompt_input = input(">>> ") + pipeline_inputs = dict( + prompt=[prompt_input], + sampling_temperature=sampling_temperature, + ) + + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + + +def _iter_prompt_from_file(data: str) -> Iterator: + """ + TODO: George + .txt - each line is a single prompt + .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only look at values in that column, can treat other columns as kwargs + i.e. + prompt,sampling_temperature + prompt 1,0.9 + + this would make pipeline(prompt="prompt 1", temperature=0.9) + + .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs + + [ + {}, + {}, + ] + .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above + {} + {} + {} + {} + """ + parser = PromptParser(data) + return parser.parse_as_iterable() + + +if __name__ == "__main__": + main() diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py new file mode 100644 index 0000000000..6b45e23917 --- /dev/null +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import csv +import json +import os +from enum import Enum +from typing import Iterator, Optional + + +class InvalidPromptSourceDirectoryException(Exception): + pass + + +class PromptParser: + class Extentions(Enum): + TEXT = ".txt" + CSV = ".csv" + JSON = ".json" + JSONL = ".jsonl" + + def __init__(self, filename: str): + self.extention = self._validate_and_return_extention(filename) + self.filename: str = filename + + def parse_as_iterable(self): + + if self.extention == self.Extentions.TEXT: + return self._parse_text() + if self.extention == self.Extentions.CSV: + return self._parse_csv() + if self.extention == self.Extentions.JSON: + return self._parse_json_list() + if self.extention == self.Extentions.JSONL: + return self._parse_jsonl() + + def _parse_text(self): + with open(self.filename, "r") as file: + for line in file: + yield line.strip() + + def _parse_csv(self, column_name: str = "prompt"): + with open(self.filename, "r", newline="", encoding="utf-8-sig") as file: + reader = csv.DictReader(file) + for row in reader: + yield row + + def _parse_json_list(self): + with open(self.filename, "r") as file: + json_list = json.load(file) + for json_object in json_list: + yield json_object + + def _parse_jsonl(self): + with open(self.filename, "r") as file: + for jsonl in file: + yield json.loads(jsonl) + + def _validate_and_return_extention(self, filename: str): + if os.path.exists(filename): + + for extention in self.Extentions: + if filename.endswith(extention.value): + return extention + + raise InvalidPromptSourceDirectoryException( + f"{filename} is not a valid source extract batched prompts" + ) + raise FileNotFoundError From 8a47e011291653611024a0219507ad829112a049 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 26 Sep 2023 17:30:46 +0000 Subject: [PATCH 09/17] allow input to pass thru cli --- setup.py | 2 +- .../transformers/inference/__init__.py | 13 ++ .../inference/{main.py => infer.py} | 128 ++++++++---------- .../transformers/inference/prompt_parser.py | 14 +- 4 files changed, 78 insertions(+), 79 deletions(-) rename src/deepsparse/transformers/inference/{main.py => infer.py} (66%) diff --git a/setup.py b/setup.py index d61c1fa312..6e7ca1bb96 100644 --- a/setup.py +++ b/setup.py @@ -298,7 +298,7 @@ def _setup_entry_points() -> Dict: "console_scripts": [ f"deepsparse.transformers.run_inference={data_api_entrypoint}", f"deepsparse.transformers.eval_downstream={eval_downstream}", - "deepsparse.infer=deepsparse.transformers.infer:main", + "deepsparse.infer=deepsparse.transformers.inference.infer:main", "deepsparse.debug_analysis=deepsparse.debug_analysis:main", "deepsparse.analyze=deepsparse.analyze:main", "deepsparse.check_hardware=deepsparse.cpu:print_hardware_capability", diff --git a/src/deepsparse/transformers/inference/__init__.py b/src/deepsparse/transformers/inference/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/src/deepsparse/transformers/inference/__init__.py +++ b/src/deepsparse/transformers/inference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/transformers/inference/main.py b/src/deepsparse/transformers/inference/infer.py similarity index 66% rename from src/deepsparse/transformers/inference/main.py rename to src/deepsparse/transformers/inference/infer.py index 9569aca3b0..a0c11aa189 100644 --- a/src/deepsparse/transformers/inference/main.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -63,9 +63,7 @@ deepsparse.infer models/llama/deployment \ --task text-generation """ -import csv -import json -from enum import Enum + from typing import Iterator, Optional import click @@ -83,7 +81,7 @@ @click.argument("model_path", type=str) @click.option( "--data", - type=Optional[str], + type=str, default=None, help="Path to .txt, .csv, .json, or .jsonl file to load data from" "If provided, runs inference over the entire dataset. If not provided " @@ -149,85 +147,73 @@ def main( prompt_sequence_length=prompt_sequence_length, ) - if data is not None: - for prompt in _iter_prompt_from_file(data): - # TODO: George run inference - pipeline_inputs = dict( - prompt=[prompt], - sampling_temperature=sampling_temperature, + if data: + for prompt, prompt_kwargs in _iter_prompt_from_file(data): + prompt_kwargs = {} + _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt, + **prompt_kwargs, ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - return + return # continue prompts until a keyboard interrupt while data is None: # always True in interactive Mode prompt_input = input(">>> ") - pipeline_inputs = dict( - prompt=[prompt_input], - sampling_temperature=sampling_temperature, + _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt_input, ) - if SupportedTasks.is_chat(task): - pipeline_inputs["session_ids"] = session_ids - - response = pipeline(**pipeline_inputs) - print("\n", response.generations[0].text) - - if show_tokens_per_sec: - times = pipeline.timer_manager.times - prefill_speed = ( - 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] - ) - generation_speed = 1.0 / times["engine_token_generation_single"] - print( - f"[prefill: {prefill_speed:.2f} tokens/sec]", - f"[decode: {generation_speed:.2f} tokens/sec]", - sep="\n", - ) - def _iter_prompt_from_file(data: str) -> Iterator: - """ - TODO: George - .txt - each line is a single prompt - .csv - match first column with name in [text, prompt, sequence, sentence, sentence1], only 
look at values in that column, can treat other columns as kwargs - i.e. - prompt,sampling_temperature - prompt 1,0.9 - - this would make pipeline(prompt="prompt 1", temperature=0.9) - - .json - expect json file to be a single list of objects where each obj can be passed directly as kwarg inputs - - [ - {}, - {}, - ] - .jsonl - load as a text file and then each line is a json object (use json.loads) treated the same as the objects above - {} - {} - {} - {} - """ parser = PromptParser(data) return parser.parse_as_iterable() +def _run_inference( + pipeline, + sampling_temperature, + task, + session_ids, + show_tokens_per_sec, + prompt_sequence_length, + prompt, + **kwargs, +): + pipeline_inputs = dict( + prompt=[prompt], + sampling_temperature=sampling_temperature, + # **kwargs, + ) + if SupportedTasks.is_chat(task): + pipeline_inputs["session_ids"] = session_ids + + response = pipeline(**pipeline_inputs) + print("\n", response.generations[0].text) + + if show_tokens_per_sec: + times = pipeline.timer_manager.times + prefill_speed = ( + 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"] + ) + generation_speed = 1.0 / times["engine_token_generation_single"] + print( + f"[prefill: {prefill_speed:.2f} tokens/sec]", + f"[decode: {generation_speed:.2f} tokens/sec]", + sep="\n", + ) + + if __name__ == "__main__": main() diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 6b45e23917..009cd26d6e 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -17,7 +17,6 @@ import json import os from enum import Enum -from typing import Iterator, Optional class InvalidPromptSourceDirectoryException(Exception): @@ -32,7 +31,7 @@ class Extentions(Enum): JSONL = ".jsonl" def __init__(self, filename: str): - self.extention = self._validate_and_return_extention(filename) + self.extention: self.Extentions = self._validate_and_return_extention(filename) self.filename: str = filename def parse_as_iterable(self): @@ -49,24 +48,25 @@ def parse_as_iterable(self): def _parse_text(self): with open(self.filename, "r") as file: for line in file: - yield line.strip() + yield line.strip(), {} - def _parse_csv(self, column_name: str = "prompt"): + def _parse_csv(self): with open(self.filename, "r", newline="", encoding="utf-8-sig") as file: reader = csv.DictReader(file) for row in reader: - yield row + yield row.get("prompt"), row def _parse_json_list(self): with open(self.filename, "r") as file: json_list = json.load(file) for json_object in json_list: - yield json_object + yield json_object.get("prompt"), json_object def _parse_jsonl(self): with open(self.filename, "r") as file: for jsonl in file: - yield json.loads(jsonl) + jsonl_object = json.loads(jsonl) + yield jsonl_object.get("prompt"), jsonl_object def _validate_and_return_extention(self, filename: str): if os.path.exists(filename): From b429917dd55a236f24e001658a048c209c3135e2 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 09:56:58 -0400 Subject: [PATCH 10/17] Update src/deepsparse/transformers/inference/infer.py Co-authored-by: Rahul Tuli --- src/deepsparse/transformers/inference/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index a0c11aa189..e3929e9c6c 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ 
b/src/deepsparse/transformers/inference/infer.py @@ -193,7 +193,7 @@ def _run_inference( ): pipeline_inputs = dict( prompt=[prompt], - sampling_temperature=sampling_temperature, + temperature=sampling_temperature, # **kwargs, ) if SupportedTasks.is_chat(task): From 1dc2ee378e81ad3fc164ac2a2453689608987ead Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 09:57:47 -0400 Subject: [PATCH 11/17] remove hardcoded --- src/deepsparse/transformers/inference/infer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index e3929e9c6c..8abaa1b1e0 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -149,7 +149,6 @@ def main( if data: for prompt, prompt_kwargs in _iter_prompt_from_file(data): - prompt_kwargs = {} _run_inference( pipeline, sampling_temperature, From 86a2daf626f7057a707dbb1b2300f4b0fc5f177c Mon Sep 17 00:00:00 2001 From: George Date: Wed, 27 Sep 2023 10:01:09 -0400 Subject: [PATCH 12/17] better error message --- src/deepsparse/transformers/inference/prompt_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 009cd26d6e..34c53ec896 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -76,6 +76,6 @@ def _validate_and_return_extention(self, filename: str): return extention raise InvalidPromptSourceDirectoryException( - f"{filename} is not a valid source extract batched prompts" + f"{filename} is not a parsable data for inference" ) raise FileNotFoundError From 939c6bc820ce167952545842826a85e08a85d48a Mon Sep 17 00:00:00 2001 From: horheynm Date: Wed, 27 Sep 2023 14:40:43 +0000 Subject: [PATCH 13/17] clean up --- .../transformers/inference/infer.py | 4 +-- .../transformers/inference/prompt_parser.py | 26 +++++++++++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py index 8abaa1b1e0..aa2cab3e36 100644 --- a/src/deepsparse/transformers/inference/infer.py +++ b/src/deepsparse/transformers/inference/infer.py @@ -141,7 +141,7 @@ def main( session_ids = "chatbot_cli_session" pipeline = Pipeline.create( - task=task, # let pipeline determine if task is supported + task=task, # let the pipeline determine if task is supported model_path=model_path, sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, @@ -193,7 +193,7 @@ def _run_inference( pipeline_inputs = dict( prompt=[prompt], temperature=sampling_temperature, - # **kwargs, + **kwargs, ) if SupportedTasks.is_chat(task): pipeline_inputs["session_ids"] = session_ids diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py index 34c53ec896..92bcd145d8 100644 --- a/src/deepsparse/transformers/inference/prompt_parser.py +++ b/src/deepsparse/transformers/inference/prompt_parser.py @@ -23,28 +23,36 @@ class InvalidPromptSourceDirectoryException(Exception): pass +class UnableToParseExtentionException(Exception): + pass + + class PromptParser: - class Extentions(Enum): + class Extensions(Enum): TEXT = ".txt" CSV = ".csv" JSON = ".json" JSONL = ".jsonl" def __init__(self, filename: str): - self.extention: self.Extentions = 
+        self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

     def parse_as_iterable(self):
-        if self.extention == self.Extentions.TEXT:
+        if self.extention == self.Extensions.TEXT:
             return self._parse_text()
-        if self.extention == self.Extentions.CSV:
+        if self.extention == self.Extensions.CSV:
             return self._parse_csv()
-        if self.extention == self.Extentions.JSON:
+        if self.extention == self.Extensions.JSON:
             return self._parse_json_list()
-        if self.extention == self.Extentions.JSONL:
+        if self.extention == self.Extensions.JSONL:
             return self._parse_jsonl()

+        raise UnableToParseExtentionException(
+            f"Parser for {self.extention} does not exist"
+        )
+
     def _parse_text(self):
         with open(self.filename, "r") as file:
             for line in file:
@@ -71,11 +79,13 @@ def _parse_jsonl(self):

     def _validate_and_return_extention(self, filename: str):
         if os.path.exists(filename):
-            for extention in self.Extentions:
+            for extention in self.Extensions:
                 if filename.endswith(extention.value):
                     return extention

             raise InvalidPromptSourceDirectoryException(
-                f"{filename} is not a parsable data for inference"
+                f"{filename} is not compatible. Select file that has "
+                "extension from "
+                f"{[key.name for key in self.Extensions]}"
             )
         raise FileNotFoundError

From 7b1edfa9ecaa3b079ac24c896d4c48720c909a15 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:09:43 +0000
Subject: [PATCH 14/17] clean up, check kwargs

---
 .../transformers/inference/infer.py           | 31 +++++++------
 .../transformers/inference/prompt_parser.py   | 45 +++++++++++++------
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/src/deepsparse/transformers/inference/infer.py b/src/deepsparse/transformers/inference/infer.py
index aa2cab3e36..460b8499c4 100644
--- a/src/deepsparse/transformers/inference/infer.py
+++ b/src/deepsparse/transformers/inference/infer.py
@@ -64,7 +64,7 @@

     --task text-generation
 """
-from typing import Iterator, Optional
+from typing import Optional

 import click

@@ -148,22 +148,26 @@ def main(
     )

     if data:
-        for prompt, prompt_kwargs in _iter_prompt_from_file(data):
+        prompt_parser = PromptParser(data)
+        default_prompt_kwargs = {
+            "sequence_length": sequence_length,
+            "sampling_temperature": sampling_temperature,
+            "prompt_sequence_length": prompt_sequence_length,
+            "show_tokens_per_sec": show_tokens_per_sec,
+        }
+
+        for prompt_kwargs in prompt_parser.parse_as_iterable(**default_prompt_kwargs):
             _run_inference(
-                pipeline,
-                sampling_temperature,
-                task,
-                session_ids,
-                show_tokens_per_sec,
-                prompt_sequence_length,
-                prompt,
+                task=task,
+                pipeline=pipeline,
+                session_ids=session_ids,
                 **prompt_kwargs,
             )
         return

     # continue prompts until a keyboard interrupt
     while data is None:  # always True in interactive Mode
-        prompt_input = input(">>> ")
+        prompt = input(">>> ")
         _run_inference(
             pipeline,
             sampling_temperature,
@@ -171,15 +175,10 @@ def main(
             session_ids,
             show_tokens_per_sec,
             prompt_sequence_length,
-            prompt_input,
+            prompt,
         )


-def _iter_prompt_from_file(data: str) -> Iterator:
-    parser = PromptParser(data)
-    return parser.parse_as_iterable()
-
-
 def _run_inference(
     pipeline,
     sampling_temperature,
diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 92bcd145d8..7a344039c1 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -17,6 +17,7 @@
 import json
 import os
 from enum import Enum
+from typing import Iterator, Tuple


 class InvalidPromptSourceDirectoryException(Exception):
@@ -27,6 +28,18 @@ class UnableToParseExtentionException(Exception):
     pass


+def parse_value_to_appropriate_type(value: str):
+    if value.isdigit():
+        return int(value)
+    if "." in str(value) and all(part.isdigit() for part in value.split(".", 1)):
+        return float(value)
+    if value.lower() == "true":
+        return True
+    if value.lower() == "false":
+        return False
+    return value
+
+
 class PromptParser:
     class Extensions(Enum):
         TEXT = ".txt"
@@ -38,43 +51,49 @@ def __init__(self, filename: str):
         self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

-    def parse_as_iterable(self):
+    def parse_as_iterable(self, **kwargs) -> Iterator[Tuple]:

         if self.extention == self.Extensions.TEXT:
-            return self._parse_text()
+            return self._parse_text(**kwargs)
         if self.extention == self.Extensions.CSV:
-            return self._parse_csv()
+            return self._parse_csv(**kwargs)
         if self.extention == self.Extensions.JSON:
-            return self._parse_json_list()
+            return self._parse_json_list(**kwargs)
         if self.extention == self.Extensions.JSONL:
-            return self._parse_jsonl()
+            return self._parse_jsonl(**kwargs)

         raise UnableToParseExtentionException(
             f"Parser for {self.extention} does not exist"
         )

-    def _parse_text(self):
+    def _parse_text(self, **kwargs):
         with open(self.filename, "r") as file:
             for line in file:
-                yield line.strip(), {}
+                kwargs["prompt"] = line.strip()
+                yield kwargs

-    def _parse_csv(self):
+    def _parse_csv(self, **kwargs):
         with open(self.filename, "r", newline="", encoding="utf-8-sig") as file:
             reader = csv.DictReader(file)
             for row in reader:
-                yield row.get("prompt"), row
+                for key, value in row.items():
+                    kwargs.update({key: parse_value_to_appropriate_type(value)})
+                yield kwargs

-    def _parse_json_list(self):
+    def _parse_json_list(self, **kwargs):
         with open(self.filename, "r") as file:
             json_list = json.load(file)
             for json_object in json_list:
-                yield json_object.get("prompt"), json_object
+                kwargs.update(json_object)
+                yield kwargs

-    def _parse_jsonl(self):
+    def _parse_jsonl(self, **kwargs):
         with open(self.filename, "r") as file:
             for jsonl in file:
                 jsonl_object = json.loads(jsonl)
-                yield jsonl_object.get("prompt"), jsonl_object
+                breakpoint()
+                kwargs.update(jsonl_object)
+                yield kwargs

     def _validate_and_return_extention(self, filename: str):
         if os.path.exists(filename):

From 46998376109ae1f1d416499f69892f33d1f01f27 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:12:32 +0000
Subject: [PATCH 15/17] get rid of breakpoint()

---
 src/deepsparse/transformers/inference/prompt_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 7a344039c1..1ebedcffd1 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -91,7 +91,6 @@ def _parse_jsonl(self, **kwargs):
         with open(self.filename, "r") as file:
             for jsonl in file:
                 jsonl_object = json.loads(jsonl)
-                breakpoint()
                 kwargs.update(jsonl_object)
                 yield kwargs

From ff4b48f0629a68097e32075da2fe76b89f4af07e Mon Sep 17 00:00:00 2001
From: horheynm
Date: Wed, 27 Sep 2023 21:15:09 +0000
Subject: [PATCH 16/17] return type

---
 src/deepsparse/transformers/inference/prompt_parser.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index 1ebedcffd1..ce2f6986cb 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -17,7 +17,7 @@
 import json
 import os
 from enum import Enum
-from typing import Iterator, Tuple
+from typing import Iterator


 class InvalidPromptSourceDirectoryException(Exception):
@@ -51,8 +51,7 @@ def __init__(self, filename: str):
         self.extention: self.Extensions = self._validate_and_return_extention(filename)
         self.filename: str = filename

-    def parse_as_iterable(self, **kwargs) -> Iterator[Tuple]:
-
+    def parse_as_iterable(self, **kwargs) -> Iterator:
         if self.extention == self.Extensions.TEXT:
             return self._parse_text(**kwargs)
         if self.extention == self.Extensions.CSV:

From 2a4b972effea6bacecc7ea54702ec5927d168070 Mon Sep 17 00:00:00 2001
From: horheynm
Date: Mon, 2 Oct 2023 19:18:48 +0000
Subject: [PATCH 17/17] typo

---
 src/deepsparse/transformers/inference/prompt_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/deepsparse/transformers/inference/prompt_parser.py b/src/deepsparse/transformers/inference/prompt_parser.py
index ce2f6986cb..35c433b11f 100644
--- a/src/deepsparse/transformers/inference/prompt_parser.py
+++ b/src/deepsparse/transformers/inference/prompt_parser.py
@@ -24,7 +24,7 @@ class InvalidPromptSourceDirectoryException(Exception):
     pass


-class UnableToParseExtentionException(Exception):
+class UnableToParseExtentsonException(Exception):
     pass


@@ -61,7 +61,7 @@ def parse_as_iterable(self, **kwargs) -> Iterator:
         if self.extention == self.Extensions.JSONL:
             return self._parse_jsonl(**kwargs)

-        raise UnableToParseExtentionException(
+        raise UnableToParseExtentsonException(
             f"Parser for {self.extention} does not exist"
         )
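
For reference, a minimal usage sketch of the --data path these patches build up: a made-up prompts.csv is fed through PromptParser, and each yielded kwargs dict is what main() splats into _run_inference after PATCH 14. The file name, prompt strings, and numeric values below are illustrative assumptions, not part of the series; the import path and parse_as_iterable(**kwargs) behavior follow PATCH 13-16.

# Illustrative sketch only -- not a diff from the series above.
# Assumes this branch is installed so the module added in these patches,
# deepsparse.transformers.inference.prompt_parser, is importable.
from deepsparse.transformers.inference.prompt_parser import PromptParser

# Hypothetical data file; column layout follows the docstring example
# (a "prompt" column plus extra columns treated as pipeline kwargs).
with open("prompts.csv", "w", encoding="utf-8") as data_file:
    data_file.write("prompt,sampling_temperature\n")
    data_file.write("prompt 1,0.9\n")
    data_file.write("prompt 2,0.7\n")

# __init__ checks the file exists and matches a supported extension.
parser = PromptParser("prompts.csv")

# main() seeds the iterator with the values of its CLI options; the numbers
# here are placeholders, not the CLI defaults.
default_prompt_kwargs = {
    "sequence_length": 512,
    "sampling_temperature": 1.0,
    "prompt_sequence_length": 16,
    "show_tokens_per_sec": False,
}

for prompt_kwargs in parser.parse_as_iterable(**default_prompt_kwargs):
    # Each CSV row overrides the defaults: "0.9" is coerced to a float by
    # parse_value_to_appropriate_type while "prompt 1" stays a string, so
    # _run_inference(**prompt_kwargs) ends up calling the pipeline with
    # prompt=["prompt 1"] and temperature=0.9.
    print(prompt_kwargs)

The equivalent end-to-end invocation would be along the lines of deepsparse.infer <model_path> --data prompts.csv --task text-generation, with <model_path> standing in for a real deployment directory.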