From ef213837408953b6d91a4806c761e34ff0a523e1 Mon Sep 17 00:00:00 2001 From: horheynm Date: Fri, 1 Sep 2023 13:45:38 +0000 Subject: [PATCH 01/11] draft --- .../transformers/engines/nl_decoder_engine.py | 4 +- .../transformers/token_generator.py | 87 +++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 src/deepsparse/transformers/token_generator.py diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index d5f5dfa91b..ad642d75b3 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -176,6 +176,7 @@ def run(self, inputs: List[numpy.ndarray], val_inp: bool) -> List[numpy.ndarray] def __call__( self, inp: List[numpy.ndarray], + token_generator: TokenGenerator, val_inp: bool = True, ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ @@ -203,7 +204,8 @@ def __call__( logits = out[0] # select batch idx 0, batch is always 1 - token = self.generate_token(logits=logits[0, -1, :]) + # token = self.generate_token(logits=logits[0, -1, :]) + token = token_generator.generate(logits=logits[0, -1, :]) return token, logits diff --git a/src/deepsparse/transformers/token_generator.py b/src/deepsparse/transformers/token_generator.py new file mode 100644 index 0000000000..e10ea89402 --- /dev/null +++ b/src/deepsparse/transformers/token_generator.py @@ -0,0 +1,87 @@ +import numpy +from deepsparse.utils.data import numpy_softmax + +class TokenGenerator: + def __init__( + self, + logits: numpy.ndarray, + deterministic: bool = True, + sampling_temperature: float = 1.0, + top_k: int=0, + top_p: float=0.0, + frequency_penalty: float=0.0, + presence_penalty: float=0.0, + ): + self.token_frequencies = numpy.zeros(logits.shape) + + self.deterministic = deterministic + self.sampling_termperature = sampling_temperature + self.top_k = top_k + self.top_p = top_p + self.frequency_penalty = frequency_penalty + 
self.presence_penalty = presence_penalty + + + def update_frequences(self, token: numpy.ndarray): + for tk in token: + self.token_frequencies[0][tk] += 1 + + + def generate(self, logits: numpy.ndarray) -> numpy.ndarray: + """ + Samples a token from the logits using the sampling temperature. + + :param logits: the logits from the model with shape (vocab_size,) + :return: the sampled token + """ + if self.deterministic: + return numpy.argmax(logits) + + if self.sampling_temperature != 1.0: + logits /= self.sampling_temperature + + if self.top_k: + logits = self.apply_top_k(logits) + if self.top_p: + logits = self.apply_top_p(logits) + + # penalties here + if self.frequency_penalty != 0.0: + logits = self.apply_frequency_penalty(logits) + if self.presence_penalty != 0.0: + logits = self.apply_presence_penalty(logits) + + probs = self.numpy_softmax(logits) + + token = numpy.random.choice(len(probs), p=probs) + self.update_frequencies(token) + + return token + + + # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + def apply_top_k( + self, + logits: numpy.ndarray, top_k: int, filter_value=-float("Inf") + ): + indices_to_remove = ( + logits < numpy.partition(logits, -top_k, axis=1)[:, -top_k][:, None] + ) + logits[indices_to_remove] = filter_value + return logits + + # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + def apply_top_p( + self, + logits: numpy.ndarray, top_p: float, filter_value=-float("Inf") + ): + sorted_indices = numpy.argsort(logits) + sorted_logits = logits[sorted_indices] + cumulative_probs = numpy_softmax(sorted_logits) + sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits = numpy.where(indices_to_remove, filter_value, logits) + return logits \ No newline at end of file From c7c308f1b719cb3a9dd1918dd9de382968acc075 Mon Sep 17 00:00:00 2001 From: horheynm Date: Fri, 1 Sep 2023 
13:46:50 +0000 Subject: [PATCH 02/11] draft --- src/deepsparse/transformers/pipelines/text_generation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 4417ff033e..33edc2b9e6 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -482,11 +482,15 @@ def engine_forward( ) callback = context.get("callback") stop = context.get("stop") + + token_generator = TokenGenerator(**token_generator_kwargs) with timer.time(_TextGenerationTimings.TOKEN_GENERATION): while len(generated_tokens) < max_tokens: with timer.time(_TextGenerationTimings.TOKEN_GENERATION_SINGLE): + token, logits = self.autoregressive_inference(tokens) + tokens.append(token) generated_tokens.append(token) generated_logits.append(logits) From 65857c84d3afba0efcac0d6fcbad4110851cb106 Mon Sep 17 00:00:00 2001 From: horheynm Date: Tue, 5 Sep 2023 17:55:27 +0000 Subject: [PATCH 03/11] draft --- .../transformers/engines/nl_decoder_engine.py | 10 +-- .../transformers/pipelines/text_generation.py | 35 ++++++--- .../pipelines/token_classification.py | 1 - .../{ => utils}/token_generator.py | 75 ++++++++++++------- 4 files changed, 78 insertions(+), 43 deletions(-) rename src/deepsparse/transformers/{ => utils}/token_generator.py (57%) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index ad642d75b3..a4fca87540 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -24,6 +24,7 @@ generate_session_id, overwrite_onnx_model_inputs_for_kv_cache_models, ) +from deepsparse.transformers.utils.token_generator import TokenGenerator from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX @@ -176,7 
+177,6 @@ def run(self, inputs: List[numpy.ndarray], val_inp: bool) -> List[numpy.ndarray] def __call__( self, inp: List[numpy.ndarray], - token_generator: TokenGenerator, val_inp: bool = True, ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ @@ -203,11 +203,11 @@ def __call__( else: logits = out[0] - # select batch idx 0, batch is always 1 - # token = self.generate_token(logits=logits[0, -1, :]) - token = token_generator.generate(logits=logits[0, -1, :]) + # # select batch idx 0, batch is always 1 + # # token = self.generate_token(logits=logits[0, -1, :]) + # token = token_generator.generate(logits=logits[0, -1, :]) - return token, logits + return logits def __str__(self): return f"{self.__class__.__name__}: {self.engine}" diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 33edc2b9e6..88f56e10c1 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -42,6 +42,7 @@ create_causal_mask, pad_to_fixed_length, ) +from deepsparse.transformers.utils.token_generator import TokenGenerator from deepsparse.utils.onnx import default_cached_outputs @@ -115,6 +116,21 @@ class Config: " tokens is generated). Set to `None` to ignore this parameter." 
" Default is `None`.", ) + top_p: Optional[float] = Field( + default=0, + description="Select the tokens with cumulative probability sum" + " higher than the given top_p", + ) + top_k: Optional[int] = Field( + default=0.0, + description="Select the tokens with top_k values", + ) + presence_penalty: Optional[float] = Field( + default=0.0, + ) + frquency_peanlty: Optional[float] = Field( + default=0.0, + ) class TextGenerationOutput(BaseModel): @@ -290,7 +306,6 @@ def initialize_engines( if ( self.cache_support_enabled and self.enable_multitoken_prefill ) or not self.cache_support_enabled: - multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, engine_type=self.engine_type, @@ -450,6 +465,8 @@ def engine_forward( # as such, a new context needs to be created since we are no longer in the # main thread. That is why `engine_` is prepended to each of the timer phase # names in this context + token_generator = TokenGenerator(**context) + with self.timer_manager.new_timer_context(total_inference=False) as timer: streamer = context.get("streamer") @@ -482,15 +499,14 @@ def engine_forward( ) callback = context.get("callback") stop = context.get("stop") - - token_generator = TokenGenerator(**token_generator_kwargs) + with timer.time(_TextGenerationTimings.TOKEN_GENERATION): while len(generated_tokens) < max_tokens: with timer.time(_TextGenerationTimings.TOKEN_GENERATION_SINGLE): - - token, logits = self.autoregressive_inference(tokens) - + logits = self.autoregressive_inference(tokens=tokens) + token = token_generator.generate(logits=logits[0, -1, :]) + tokens.append(token) generated_tokens.append(token) generated_logits.append(logits) @@ -572,7 +588,7 @@ def prompt_inference( new_token, new_logits = self.autoregressive_inference(run_tokens) prompt_logits.append(new_logits) - + tokens.append(new_token) return tokens, prompt_logits @@ -612,9 +628,10 @@ def autoregressive_inference( engine_inputs_map[name] for name in self.engine.onnx_input_names_no_cache ] - 
generated_token, generated_logits = self.engine(engine_inputs) + # generated_token, generated_logits = self.engine(engine_inputs, token_generator=self.token_generator) + generated_logits = self.engine(engine_inputs) - return generated_token, generated_logits + return generated_logits def engine_inputs_for_prefill( self, tokens: List[int] diff --git a/src/deepsparse/transformers/pipelines/token_classification.py b/src/deepsparse/transformers/pipelines/token_classification.py index e94fa51dfc..9924d9ea42 100644 --- a/src/deepsparse/transformers/pipelines/token_classification.py +++ b/src/deepsparse/transformers/pipelines/token_classification.py @@ -525,7 +525,6 @@ def _get_tag(self, entity_name: str) -> Tuple[str, str]: return bi, tag def _group_entities(self, entities: List[dict]) -> List[dict]: - entity_groups = [] entity_group_disagg = [] diff --git a/src/deepsparse/transformers/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py similarity index 57% rename from src/deepsparse/transformers/token_generator.py rename to src/deepsparse/transformers/utils/token_generator.py index e10ea89402..ae8484d31b 100644 --- a/src/deepsparse/transformers/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -1,31 +1,46 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy + from deepsparse.utils.data import numpy_softmax + class TokenGenerator: def __init__( - self, + self, logits: numpy.ndarray, deterministic: bool = True, sampling_temperature: float = 1.0, - top_k: int=0, - top_p: float=0.0, - frequency_penalty: float=0.0, - presence_penalty: float=0.0, + top_k: int = 0, + top_p: float = 0.0, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + **kwargs, ): - self.token_frequencies = numpy.zeros(logits.shape) - + self.token_frequencies = numpy.zeros(logits.shape[-1]) + self.deterministic = deterministic self.sampling_termperature = sampling_temperature self.top_k = top_k self.top_p = top_p self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty - - + def update_frequences(self, token: numpy.ndarray): for tk in token: - self.token_frequencies[0][tk] += 1 - + self.token_frequencies[tk] += 1 def generate(self, logits: numpy.ndarray) -> numpy.ndarray: """ @@ -39,12 +54,12 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: if self.sampling_temperature != 1.0: logits /= self.sampling_temperature - + if self.top_k: logits = self.apply_top_k(logits) - if self.top_p: + if self.top_p: logits = self.apply_top_p(logits) - + # penalties here if self.frequency_penalty != 0.0: logits = self.apply_frequency_penalty(logits) @@ -55,33 +70,37 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: token = numpy.random.choice(len(probs), p=probs) self.update_frequencies(token) - + return token - - + + def apply_frequency_penalty(self, logits: numpy.ndarray): + logits -= self.frequency_penalty * self.token_frequencies + return logits + + def apply_presence_penalty(self, logits: numpy.ndarray): + logits -= self.presence_penalty * (self.frequency_penalty > 0) + return logits + # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - def apply_top_k( - self, - logits: numpy.ndarray, top_k: int, filter_value=-float("Inf") - ): + def apply_top_k(self, logits: 
numpy.ndarray, filter_value=-float("Inf")): indices_to_remove = ( - logits < numpy.partition(logits, -top_k, axis=1)[:, -top_k][:, None] + logits + < numpy.partition(logits, -self.top_k, axis=1)[:, -self.top_k][:, None] ) logits[indices_to_remove] = filter_value return logits # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - def apply_top_p( - self, - logits: numpy.ndarray, top_p: float, filter_value=-float("Inf") - ): + def apply_top_p(self, logits: numpy.ndarray, filter_value=-float("Inf")): sorted_indices = numpy.argsort(logits) sorted_logits = logits[sorted_indices] cumulative_probs = numpy_softmax(sorted_logits) - sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p) indices_to_remove = sorted_indices_to_remove.scatter( 1, sorted_indices, sorted_indices_to_remove ) logits = numpy.where(indices_to_remove, filter_value, logits) - return logits \ No newline at end of file + return logits + + From 542bf27d2adf9f98e7c13d746ce2b3674d75b0e7 Mon Sep 17 00:00:00 2001 From: horheynm Date: Wed, 6 Sep 2023 04:27:34 +0000 Subject: [PATCH 04/11] impleentation --- .../transformers/engines/nl_decoder_engine.py | 18 --------- .../transformers/pipelines/text_generation.py | 40 ++++++++++--------- .../transformers/utils/token_generator.py | 19 +++++---- 3 files changed, 34 insertions(+), 43 deletions(-) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index a4fca87540..b6e21f020a 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -24,8 +24,6 @@ generate_session_id, overwrite_onnx_model_inputs_for_kv_cache_models, ) -from deepsparse.transformers.utils.token_generator import TokenGenerator -from deepsparse.utils.data import numpy_softmax from deepsparse.utils.onnx import CACHE_INPUT_PREFIX, CACHE_OUTPUT_PREFIX @@ -230,22 
+228,6 @@ def transfer_cache_state(self, cache: DecoderKVCache): cache.set_capacity(self.cache_length) self.kv_cache = cache - def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: - """ - Samples a token from the logits using the sampling temperature. - - :param logits: the logits from the model with shape (vocab_size,) - :return: the sampled token - """ - if self.deterministic: - return numpy.argmax(logits) - - logits /= self.sampling_temperature - - probs = numpy_softmax(logits) - - return numpy.random.choice(len(probs), p=probs) - def reset_kv_cache(self): """ Resets the kv cache state. diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 88f56e10c1..2c52878fa2 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -465,22 +465,29 @@ def engine_forward( # as such, a new context needs to be created since we are no longer in the # main thread. 
That is why `engine_` is prepended to each of the timer phase # names in this context - token_generator = TokenGenerator(**context) - + with self.timer_manager.new_timer_context(total_inference=False) as timer: streamer = context.get("streamer") if not self.cache_support_enabled: - tokens, prompt_logits = self.multitoken_engine(engine_inputs) - return numpy.array([tokens]), prompt_logits + prompt_logits = self.multitoken_engine(engine_inputs) + token_generator = TokenGenerator(prompt_logits[0]) + for prompt_logit in prompt_logits: + token = token_generator.generate(prompt_logit) + return numpy.array([self.tokens]), prompt_logits else: # run the prompt through with timer.time(_TextGenerationTimings.PROMPT_PREFILL): - tokens, prompt_logits = self.prompt_inference(engine_inputs) + prompt_logits = self.prompt_inference(engine_inputs) + + tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() + token_generator = TokenGenerator(logits=prompt_logits[-1], tokens=tokens) + token_generator.generate(prompt_logits[-1]) + tokens = [] if streamer is not None: - streamer.put(numpy.array(tokens)) + streamer.put(numpy.array(token_generator.tokens)) # create the generated output max_tokens = ( @@ -491,7 +498,7 @@ def engine_forward( # last prompt token is the first generated token # add it to generated tokens, and the logits - generated_tokens = [tokens[-1]] + generated_tokens = [token_generator.tokens[-1]] generated_logits = ( prompt_logits if context.get("include_prompt_logits") @@ -500,14 +507,14 @@ def engine_forward( callback = context.get("callback") stop = context.get("stop") - with timer.time(_TextGenerationTimings.TOKEN_GENERATION): while len(generated_tokens) < max_tokens: with timer.time(_TextGenerationTimings.TOKEN_GENERATION_SINGLE): - logits = self.autoregressive_inference(tokens=tokens) + logits = self.autoregressive_inference( + tokens=token_generator.tokens + ) token = token_generator.generate(logits=logits[0, -1, :]) - tokens.append(token) 
generated_tokens.append(token) generated_logits.append(logits) @@ -542,7 +549,8 @@ def engine_forward( ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, + engine_inputs: List[numpy.ndarray], ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -559,7 +567,6 @@ def prompt_inference( tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() prompt_logits = [] - new_token = None num_tokens_processed = 0 if ( @@ -568,7 +575,7 @@ def prompt_inference( ): self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): - new_token, new_logits = self.multitoken_engine(engine_inputs) + new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed += self.prompt_processing_sequence_length prompt_logits.append(new_logits) @@ -585,13 +592,11 @@ def prompt_inference( with self.timer_manager.current.time( _TextGenerationTimings.PROMPT_PREFILL_SINGLE ): - new_token, new_logits = self.autoregressive_inference(run_tokens) + new_logits = self.autoregressive_inference(run_tokens) prompt_logits.append(new_logits) - - tokens.append(new_token) - return tokens, prompt_logits + return prompt_logits def autoregressive_inference( self, @@ -628,7 +633,6 @@ def autoregressive_inference( engine_inputs_map[name] for name in self.engine.onnx_input_names_no_cache ] - # generated_token, generated_logits = self.engine(engine_inputs, token_generator=self.token_generator) generated_logits = self.engine(engine_inputs) return generated_logits diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index ae8484d31b..01b4de057d 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List + import numpy from deepsparse.utils.data import numpy_softmax @@ -21,6 +23,7 @@ class TokenGenerator: def __init__( self, logits: numpy.ndarray, + tokens: List[int] = [], deterministic: bool = True, sampling_temperature: float = 1.0, top_k: int = 0, @@ -37,10 +40,13 @@ def __init__( self.top_p = top_p self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty + self.tokens = [] + for token in tokens: + self.update_frequencies(token) - def update_frequences(self, token: numpy.ndarray): - for tk in token: - self.token_frequencies[tk] += 1 + def update_frequencies(self, token: numpy.ndarray): + self.tokens.append(token) + self.token_frequencies[token] += 1 def generate(self, logits: numpy.ndarray) -> numpy.ndarray: """ @@ -50,7 +56,9 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: :return: the sampled token """ if self.deterministic: - return numpy.argmax(logits) + token = numpy.argmax(logits) + self.tokens.append(token) + return token if self.sampling_temperature != 1.0: logits /= self.sampling_temperature @@ -67,7 +75,6 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: logits = self.apply_presence_penalty(logits) probs = self.numpy_softmax(logits) - token = numpy.random.choice(len(probs), p=probs) self.update_frequencies(token) @@ -102,5 +109,3 @@ def apply_top_p(self, logits: numpy.ndarray, filter_value=-float("Inf")): ) logits = numpy.where(indices_to_remove, filter_value, logits) return logits - - From 5b7c97e091f43fc9c7937bcdc29c0f10c3b08b2a Mon Sep 17 00:00:00 2001 From: horheynm Date: Wed, 6 Sep 2023 04:30:18 +0000 Subject: [PATCH 05/11] delete commented line --- src/deepsparse/transformers/engines/nl_decoder_engine.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index b6e21f020a..25b1c51fcd 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py 
+++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -201,10 +201,6 @@ def __call__( else: logits = out[0] - # # select batch idx 0, batch is always 1 - # # token = self.generate_token(logits=logits[0, -1, :]) - # token = token_generator.generate(logits=logits[0, -1, :]) - return logits def __str__(self): From 23d35d45e56fa18490a78a29cb332bbfa37fd8b1 Mon Sep 17 00:00:00 2001 From: horheynm Date: Thu, 7 Sep 2023 06:15:41 +0000 Subject: [PATCH 06/11] tests, update sampling calculation --- .../transformers/utils/token_generator.py | 39 +++-- .../utils/test_token_generator.py | 157 ++++++++++++++++++ 2 files changed, 181 insertions(+), 15 deletions(-) create mode 100644 tests/deepsparse/transformers/utils/test_token_generator.py diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 01b4de057d..ee6554fb71 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -85,27 +85,36 @@ def apply_frequency_penalty(self, logits: numpy.ndarray): return logits def apply_presence_penalty(self, logits: numpy.ndarray): - logits -= self.presence_penalty * (self.frequency_penalty > 0) + logits -= self.presence_penalty * (self.token_frequencies > 0) return logits # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 def apply_top_k(self, logits: numpy.ndarray, filter_value=-float("Inf")): - indices_to_remove = ( - logits - < numpy.partition(logits, -self.top_k, axis=1)[:, -self.top_k][:, None] - ) - logits[indices_to_remove] = filter_value - return logits + logits_shape = logits.shape + logits = logits.reshape(logits.shape[-1]) + top_k_indices = numpy.argpartition(logits, -self.top_k)[-self.top_k :] + logits[~numpy.isin(numpy.arange(len(logits)), top_k_indices)] = filter_value + + return logits.reshape(logits_shape) # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - def apply_top_p(self, 
logits: numpy.ndarray, filter_value=-float("Inf")): + def apply_top_p( + self, logits: numpy.ndarray, filter_value=-float("Inf"), min_tokens_to_keep=1 + ): + logits_shape = logits.shape + logits = logits.reshape(logits.shape[-1]) + sorted_indices = numpy.argsort(logits) sorted_logits = logits[sorted_indices] - cumulative_probs = numpy_softmax(sorted_logits) - sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p) + logit_cumulative_probs = numpy.cumsum(numpy_softmax(sorted_logits)) - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove - ) - logits = numpy.where(indices_to_remove, filter_value, logits) - return logits + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = logit_cumulative_probs > self.top_p + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + + return logits.reshape(logits_shape) diff --git a/tests/deepsparse/transformers/utils/test_token_generator.py b/tests/deepsparse/transformers/utils/test_token_generator.py new file mode 100644 index 0000000000..cf303d5858 --- /dev/null +++ b/tests/deepsparse/transformers/utils/test_token_generator.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import List, Tuple, Union + +import numpy + +import pytest +from deepsparse.transformers.utils.token_generator import TokenGenerator + + +@pytest.fixture(scope="function") +def logits_fixture() -> numpy.array: + def get(shape: Tuple = (1, 1, 51200), token_max_thresh: int = 30, low: int = -30): + return numpy.random.uniform(low, token_max_thresh, size=shape) + + return get + + +@pytest.fixture(scope="function") +def token_fixture() -> List[int]: + def get(shape: Union[int, Tuple] = 5, token_max_thresh: int = 51200): + return numpy.random.randint(0, token_max_thresh, size=shape).tolist() + + return get + + +class TestTokenGenerator: + def test_update_frequencies( + self, logits_fixture, token_fixture, token_max_thresh: int = 51200 + ): + logits, tokens = logits_fixture(), token_fixture( + token_max_thresh=token_max_thresh + ) + token_generator = TokenGenerator(logits=logits, tokens=tokens) + + assert token_generator.tokens == tokens + + freq = defaultdict(int) + for token in token_generator.tokens: + freq[token] += 1 + + for key, value in freq.items(): + assert token_generator.token_frequencies[key] == value + + # test TokenGenerator.update_frequencies + new_token = numpy.random.randint(0, token_max_thresh) + token_generator.update_frequencies(new_token) + + assert token_generator.tokens == tokens + [new_token] + freq[new_token] += 1 + for key, value in freq.items(): + assert token_generator.token_frequencies[key] == value + + def test_apply_frequency_penalty( + self, + logits_fixture, + token_fixture, + ): + logits, tokens = logits_fixture(), token_fixture() + frequency_penalty = 1.0 + token_generator = TokenGenerator( + logits=logits, tokens=(tokens + tokens), frequency_penalty=frequency_penalty + ) + + test_logits = token_generator.token_frequencies + # numpy arrays by default are pass by ref + new_logits = 
token_generator.apply_frequency_penalty(test_logits.copy()) + assert new_logits.shape == test_logits.shape + assert numpy.sum(new_logits) == 0 + + def test_apply_presence_penalty( + self, + logits_fixture, + token_fixture, + ): + logits, tokens = logits_fixture(), token_fixture() + presence_penalty = 1.0 + token_generator = TokenGenerator( + logits=logits, tokens=(tokens + tokens), presence_penalty=presence_penalty + ) + test_logits = token_generator.token_frequencies + # numpy arrays by default are pass by ref + new_logits = token_generator.apply_presence_penalty(test_logits.copy()) + assert new_logits.shape == test_logits.shape + assert numpy.sum(new_logits) == 0.5 * numpy.sum(test_logits) + + def test_apply_topk( + self, + ): + # logits for opt usually have shape (1,1,51200) + logits = numpy.linspace(0, 1, 11).reshape((1, 1, 11)) + + token_generator = TokenGenerator( + logits=logits, + top_k=3, + ) + + filter_value = -float("Inf") + new_logits = token_generator.apply_top_k( + logits.copy(), filter_value=filter_value + ) + + for _ in range(token_generator.top_k): + curr_max, idx = numpy.max(new_logits), numpy.argmax(new_logits) + assert curr_max > filter_value + new_logits = numpy.delete(new_logits, idx) + + assert numpy.all(new_logits == filter_value) + + def test_apply_top_p( + self, + ): + # logits for opt usually have shape (1,1,51200) + logits = 0.1 * numpy.ones(10).reshape((1, 1, 10)) + + token_generator = TokenGenerator( + logits=logits, + top_p=0.89, + ) + + filter_value = -float("Inf") + new_logits = token_generator.apply_top_p( + logits.copy(), filter_value=filter_value + ) + for _ in range(1): + curr_min, idx = numpy.min(new_logits), numpy.argmin(new_logits) + assert curr_min == filter_value + new_logits = numpy.delete(new_logits, idx) + + assert numpy.all(new_logits != filter_value) + + def test_generate_token( + self, + logits_fixture, + token_fixture, + ): + logits, tokens = logits_fixture(), token_fixture() + token_generator = TokenGenerator( + 
logits=logits, + tokens=(tokens + tokens), + ) + new_token = token_generator.generate(logits=logits) + assert new_token == token_generator.tokens[-1] + assert len(token_generator.tokens) == len(tokens + tokens) + 1 From f5c2a3199f0d40a99ee752921ff9986b3eb43f0e Mon Sep 17 00:00:00 2001 From: horheynm Date: Fri, 8 Sep 2023 05:06:06 +0000 Subject: [PATCH 07/11] comments and bug fixes --- src/deepsparse/pipeline.py | 6 +- .../transformers/engines/nl_decoder_engine.py | 5 +- .../transformers/pipelines/text_generation.py | 46 +++++++--- .../transformers/utils/token_generator.py | 87 +++++++++++++++---- .../utils/test_token_generator.py | 28 +++--- 5 files changed, 126 insertions(+), 46 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 2b6b6c9854..1fb09cca0f 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -232,7 +232,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: f"Inputs parsed to {type(pipeline_inputs)}" ) # batch size of the inputs may be `> self._batch_size` at this point - engine_inputs: List[numpy.ndarray] = self.process_inputs(pipeline_inputs) + engine_inputs = self.process_inputs(pipeline_inputs) if isinstance(engine_inputs, tuple): engine_inputs, context = engine_inputs else: @@ -494,7 +494,9 @@ def split_engine_inputs( return split_engine_inputs(items, batch_size) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict = {} + self, + engine_inputs: List[numpy.ndarray], + context: Dict = {}, ) -> List[numpy.ndarray]: """ :param engine_inputs: list of numpy inputs to Pipeline engine forward diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 25b1c51fcd..ec35abf5f1 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # 
limitations under the License. import logging -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import numpy from transformers import AutoTokenizer @@ -176,7 +176,7 @@ def __call__( self, inp: List[numpy.ndarray], val_inp: bool = True, - ) -> Tuple[numpy.ndarray, numpy.ndarray]: + ) -> numpy.ndarray: """ The main entry point for running the engine. @@ -192,7 +192,6 @@ def __call__( inp = self.add_kv_cache_to_input(inp) out = self.run(inp, val_inp) - if self.kv_cache: logits, *kv_cache_state = out self.update_kv_cache( diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 2c52878fa2..ab0de0c517 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -117,19 +117,27 @@ class Config: " Default is `None`.", ) top_p: Optional[float] = Field( - default=0, - description="Select the tokens with cumulative probability sum" - " higher than the given top_p", + default=0.0, + description="Used for filtering generated tokens. Keep the" + " tokens where its cumulative probability is >= top_p" + " Default set to 0.0", ) top_k: Optional[int] = Field( - default=0.0, - description="Select the tokens with top_k values", + default=0, + description="Used for filtering generated tokens. Keep" + " top_k generated tokens. Default set to 0", ) presence_penalty: Optional[float] = Field( default=0.0, + description="Penalty applied for generating new token. Any existing" + " token results in the subtraction of its corresponding logit value." + " Default set to 0.0", ) frquency_peanlty: Optional[float] = Field( default=0.0, + description="Penalty applied for generating new token. Existing" + " token frequencies summed to subtraction the logit of its" + " corresponding logit value. 
Default set to 0.0.", ) @@ -429,7 +437,12 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: include_prompt_logits=inputs.include_prompt_logits, callback=inputs.callback, stop=inputs.stop, + top_p=inputs.top_p, + top_k=inputs.top_k, + presence_penalty=inputs.presence_penalty, + frequency_penalty=inputs.presence_penalty, ) + return engine_input, postprocessing_kwargs def process_engine_outputs( @@ -450,7 +463,9 @@ def process_engine_outputs( return TextGenerationOutput(sequences=sequences, logits=logits) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict + self, + engine_inputs: List[numpy.ndarray], + context: Dict, ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. @@ -471,9 +486,13 @@ def engine_forward( if not self.cache_support_enabled: prompt_logits = self.multitoken_engine(engine_inputs) - token_generator = TokenGenerator(prompt_logits[0]) + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + deterministic=self.deterministic, + **context, + ) for prompt_logit in prompt_logits: - token = token_generator.generate(prompt_logit) + token_generator.generate(prompt_logit) return numpy.array([self.tokens]), prompt_logits else: @@ -482,10 +501,14 @@ def engine_forward( prompt_logits = self.prompt_inference(engine_inputs) tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() - token_generator = TokenGenerator(logits=prompt_logits[-1], tokens=tokens) - token_generator.generate(prompt_logits[-1]) + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + tokens=tokens, + deterministic=self.deterministic, + **context, + ) + token_generator.generate(prompt_logits[-1][0, -1, :]) - tokens = [] if streamer is not None: streamer.put(numpy.array(token_generator.tokens)) @@ -514,7 +537,6 @@ def engine_forward( tokens=token_generator.tokens ) token = token_generator.generate(logits=logits[0, -1, :]) - generated_tokens.append(token) 
generated_logits.append(logits) diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index ee6554fb71..6a54486278 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -20,9 +20,15 @@ class TokenGenerator: + """ + Responsible for generating tokens, and contains functions that + token generation depends on including different sampling and + filtering methods + """ + def __init__( self, - logits: numpy.ndarray, + logits_shape: int, tokens: List[int] = [], deterministic: bool = True, sampling_temperature: float = 1.0, @@ -32,25 +38,38 @@ def __init__( presence_penalty: float = 0.0, **kwargs, ): - self.token_frequencies = numpy.zeros(logits.shape[-1]) + """ + :param logits_shape: int representing the size/length of the logit + to be used. Note that generated token will have the upper bound of + this value + :param tokens: Any previously generated tokens. 
Used to keep frequency counts + to be used for penalty calculations + :param deterministic: set to True will always return the same output with the + same inputs + :param sampling_temperature: used to add randomness to the generated token + :param top_k: select top_k logit values + :param top_p: select the cumulative sum of the logits values outside of top_p + :param frequency_penalty: subtract its value and its token frequency count + to the logit + :param presence_penalty: subtract any corresponding logit with existing tokens + """ + self.token_frequencies = numpy.zeros(logits_shape) self.deterministic = deterministic - self.sampling_termperature = sampling_temperature + self.sampling_temperature = sampling_temperature self.top_k = top_k self.top_p = top_p self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty - self.tokens = [] + self.tokens = tokens for token in tokens: - self.update_frequencies(token) - - def update_frequencies(self, token: numpy.ndarray): - self.tokens.append(token) - self.token_frequencies[token] += 1 + self._update_frequencies(token) def generate(self, logits: numpy.ndarray) -> numpy.ndarray: """ - Samples a token from the logits using the sampling temperature. + Samples a token from the logits. If non-deterministic, logits that tokens + get generated from will be a function of sampling_temperature, top_k, top_p, + frequency_penalty and presence_penalty.
:param logits: the logits from the model with shape (vocab_size,) :return: the sampled token @@ -74,22 +93,37 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: if self.presence_penalty != 0.0: logits = self.apply_presence_penalty(logits) - probs = self.numpy_softmax(logits) + probs = numpy_softmax(logits) token = numpy.random.choice(len(probs), p=probs) - self.update_frequencies(token) + + self.tokens.append(token) + self._update_frequencies(token) return token - def apply_frequency_penalty(self, logits: numpy.ndarray): + def apply_frequency_penalty(self, logits: numpy.ndarray) -> numpy.ndarray: + """Apply frequency_penalty based on the token frequency count""" logits -= self.frequency_penalty * self.token_frequencies return logits - def apply_presence_penalty(self, logits: numpy.ndarray): + def apply_presence_penalty(self, logits: numpy.ndarray) -> numpy.ndarray: + """ + Apply presence_penalty to any logits where there exists + a token + """ logits -= self.presence_penalty * (self.token_frequencies > 0) return logits - # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - def apply_top_k(self, logits: numpy.ndarray, filter_value=-float("Inf")): + # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + def apply_top_k( + self, logits: numpy.ndarray, filter_value=-float("Inf") + ) -> numpy.ndarray: + """ + Keep top_k logits based on its value.
All other values + will be overwritten to filter_value + + :param filter_value: value to overwrite non-top_k values + """ logits_shape = logits.shape logits = logits.reshape(logits.shape[-1]) top_k_indices = numpy.argpartition(logits, -self.top_k)[-self.top_k :] @@ -99,8 +133,19 @@ def apply_top_k(self, logits: numpy.ndarray, filter_value=-float("Inf")): # from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 def apply_top_p( - self, logits: numpy.ndarray, filter_value=-float("Inf"), min_tokens_to_keep=1 - ): + self, + logits: numpy.ndarray, + filter_value=-float("Inf"), + min_tokens_to_keep: int = 1, + ) -> numpy.ndarray: + """ + Keep any logits' cumulative sum <= top_p. non top_p logits will be + overwritten to filter_value + + :param filter_value: value to overwrite non-top_p values + :param min_tokens_to_keep: number of logit values to keep to avoid + zero valued logits + """ logits_shape = logits.shape logits = logits.reshape(logits.shape[-1]) @@ -108,7 +153,8 @@ def apply_top_p( sorted_logits = logits[sorted_indices] logit_cumulative_probs = numpy.cumsum(numpy_softmax(sorted_logits)) - # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + # Remove tokens with cumulative top_p above the threshold + # (token with 0 are kept) sorted_indices_to_remove = logit_cumulative_probs > self.top_p # Keep at least min_tokens_to_keep sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0 @@ -118,3 +164,6 @@ def apply_top_p( logits[indices_to_remove] = filter_value return logits.reshape(logits_shape) + + def _update_frequencies(self, token: numpy.ndarray): + self.token_frequencies[token] += 1 diff --git a/tests/deepsparse/transformers/utils/test_token_generator.py b/tests/deepsparse/transformers/utils/test_token_generator.py index cf303d5858..7f5e4b0751 100644 --- a/tests/deepsparse/transformers/utils/test_token_generator.py +++ b/tests/deepsparse/transformers/utils/test_token_generator.py @@ -44,7 +44,9 @@ def 
test_update_frequencies( logits, tokens = logits_fixture(), token_fixture( token_max_thresh=token_max_thresh ) - token_generator = TokenGenerator(logits=logits, tokens=tokens) + token_generator = TokenGenerator( + logits_shape=logits[-1].shape[-1], tokens=tokens.copy() + ) assert token_generator.tokens == tokens @@ -55,9 +57,10 @@ def test_update_frequencies( for key, value in freq.items(): assert token_generator.token_frequencies[key] == value - # test TokenGenerator.update_frequencies - new_token = numpy.random.randint(0, token_max_thresh) - token_generator.update_frequencies(new_token) + # test TokenGenerator._update_frequencies + new_token = token_fixture(shape=1)[0] + token_generator.tokens.append(new_token) + token_generator._update_frequencies(new_token) assert token_generator.tokens == tokens + [new_token] freq[new_token] += 1 @@ -72,7 +75,9 @@ def test_apply_frequency_penalty( logits, tokens = logits_fixture(), token_fixture() frequency_penalty = 1.0 token_generator = TokenGenerator( - logits=logits, tokens=(tokens + tokens), frequency_penalty=frequency_penalty + logits_shape=logits[-1].shape[-1], + tokens=(tokens + tokens), + frequency_penalty=frequency_penalty, ) test_logits = token_generator.token_frequencies @@ -89,7 +94,9 @@ def test_apply_presence_penalty( logits, tokens = logits_fixture(), token_fixture() presence_penalty = 1.0 token_generator = TokenGenerator( - logits=logits, tokens=(tokens + tokens), presence_penalty=presence_penalty + logits_shape=logits[-1].shape[-1], + tokens=(tokens + tokens), + presence_penalty=presence_penalty, ) test_logits = token_generator.token_frequencies # numpy arrays by default are pass by ref @@ -104,7 +111,7 @@ def test_apply_topk( logits = numpy.linspace(0, 1, 11).reshape((1, 1, 11)) token_generator = TokenGenerator( - logits=logits, + logits_shape=logits[-1].shape[-1], top_k=3, ) @@ -127,7 +134,7 @@ def test_apply_top_p( logits = 0.1 * numpy.ones(10).reshape((1, 1, 10)) token_generator = TokenGenerator( - 
logits=logits, + logits_shape=logits[-1].shape[-1], top_p=0.89, ) @@ -149,9 +156,10 @@ def test_generate_token( ): logits, tokens = logits_fixture(), token_fixture() token_generator = TokenGenerator( - logits=logits, + logits_shape=logits[-1].shape[-1], tokens=(tokens + tokens), + deterministic=False, ) - new_token = token_generator.generate(logits=logits) + new_token = token_generator.generate(logits=logits[0, -1, :]) assert new_token == token_generator.tokens[-1] assert len(token_generator.tokens) == len(tokens + tokens) + 1 From 3e4ff9e7589af08086d62756c8f9d0f2569a1fa9 Mon Sep 17 00:00:00 2001 From: horheynm Date: Mon, 11 Sep 2023 20:15:36 +0000 Subject: [PATCH 08/11] commnets --- src/deepsparse/transformers/pipelines/text_generation.py | 4 ++-- src/deepsparse/transformers/utils/token_generator.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index ab0de0c517..60478c3bf2 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -133,7 +133,7 @@ class Config: " token results in the subtraction of its corresponding logit value." " Default set to 0.0", ) - frquency_peanlty: Optional[float] = Field( + frequency_penalty: Optional[float] = Field( default=0.0, description="Penalty applied for generating new token. 
Existing" " token frequencies summed to subtraction the logit of its" @@ -440,7 +440,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: top_p=inputs.top_p, top_k=inputs.top_k, presence_penalty=inputs.presence_penalty, - frequency_penalty=inputs.presence_penalty, + frequency_penalty=inputs.frequency_penalty, ) return engine_input, postprocessing_kwargs diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 6a54486278..9d25559227 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -62,8 +62,8 @@ def __init__( self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty self.tokens = tokens - for token in tokens: - self._update_frequencies(token) + + self._initialize_token_frequencies() def generate(self, logits: numpy.ndarray) -> numpy.ndarray: """ @@ -167,3 +167,8 @@ def apply_top_p( def _update_frequencies(self, token: numpy.ndarray): self.token_frequencies[token] += 1 + + def _initialize_token_frequencies(self): + unique_tokens, frequencies = numpy.unique(self.tokens, return_counts=True) + for token, frequnecies in zip(unique_tokens, frequencies): + self.token_frequencies[token] += frequnecies From 57ab9ddbcdbe405a56497b8da45d5c47568f61f6 Mon Sep 17 00:00:00 2001 From: horheynm Date: Mon, 11 Sep 2023 21:21:29 +0000 Subject: [PATCH 09/11] remove generted tokent est form nldecoder engine --- .../transformers/engine/test_nl_decoder_engine.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/deepsparse/transformers/engine/test_nl_decoder_engine.py b/tests/deepsparse/transformers/engine/test_nl_decoder_engine.py index f4c8cc2f97..7d80aa6ada 100644 --- a/tests/deepsparse/transformers/engine/test_nl_decoder_engine.py +++ b/tests/deepsparse/transformers/engine/test_nl_decoder_engine.py @@ -17,7 +17,6 @@ import numpy as np from 
deepsparse.transformers.engines import NLDecoderEngine -from flaky import flaky class DummyKVCacheDecoder: @@ -32,20 +31,6 @@ class DummyEngine: input_names = ["input_1", "input_2", "past_key_values_1", "past_key_values_2"] -@flaky(max_runs=10, min_passes=1) -def test_generate_token(): - logits = np.array([1.0, 11, 0.9, 0.8]) - expected_token = 1 - - with patch.object(NLDecoderEngine, "__init__", lambda x, y, z: None): - engine = NLDecoderEngine(None, None) - engine.deterministic = False - engine.sampling_temperature = 1.0 - token = engine.generate_token(logits) - - assert expected_token == token - - def test_add_kv_cache_to_input(): # keep only the first two inputs # (corresponding to "input_1" and "input_2") From bad29f47db9ba9d6dc6878bb4b0190933d5bf600 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 13 Sep 2023 09:14:23 -0400 Subject: [PATCH 10/11] update prompt seq len name Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> --- src/deepsparse/transformers/pipelines/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index e757def0f2..b6fffe7af6 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -630,7 +630,7 @@ def prompt_inference( self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): new_logits = self.multitoken_engine(engine_inputs) - num_tokens_processed += self.prompt_processing_sequence_length + num_tokens_processed += self.prompt_sequence_length prompt_logits.append(new_logits) if num_tokens_processed: From e0ab60690a2355b2e83206789759b4c7e50eeb33 Mon Sep 17 00:00:00 2001 From: horheynm Date: Thu, 14 Sep 2023 21:18:01 +0000 Subject: [PATCH 11/11] readd missing code --- src/deepsparse/transformers/pipelines/text_generation.py | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index b6fffe7af6..f9ccb4b78a 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -471,6 +471,8 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: max_tokens=inputs.max_tokens, ) + return engine_input, context + def process_engine_outputs( self, engine_outputs: List[numpy.ndarray], **kwargs ) -> TextGenerationOutput: