Commit
[Feature Branch][LLM Testing] Create GroundTruthSource objects (#1219)
* initial commit

* finish creation of helper objects

* Update tests/conftest.py

* small refactor

* [Feature Branch][LLM Testing] LLM Testing Suite (#1227)

* Update README.md

* Update src/deepsparse/yolov8/README.md

* Update text_generation.py

* quality

* readability

* all tests passing

* added some full kv cache tests

* initial commit

* ready for review

* Delete tests/deepsparse/transformers/pipelines/proposal_text_generation_tests.md
dbogunowicz authored Sep 7, 2023
1 parent afb2a2a commit 0477551
Showing 4 changed files with 570 additions and 261 deletions.
17 changes: 15 additions & 2 deletions src/deepsparse/transformers/pipelines/text_generation.py
@@ -291,6 +291,18 @@ def initialize_engines(
             self.cache_support_enabled and self.enable_multitoken_prefill
         ) or not self.cache_support_enabled:

+            # input_ids_length for the multitoken engine is either:
+            # - the prompt_sequence_length if cache support is enabled
+            #   (the prompt is processed sequentially at a predefined processing length)
+            # - the full sequence_length if cache support is disabled
+            #   (the prompt is processed in a single pass; prompt length is fixed at
+            #   sequence_length)
+            input_ids_length = (
+                self.prompt_sequence_length
+                if self.cache_support_enabled
+                else self.sequence_length
+            )
+
             multitoken_engine = NLDecoderEngine(
                 onnx_file_path=self.onnx_file_path,
                 engine_type=self.engine_type,
@@ -299,7 +311,7 @@ def initialize_engines(
                 sampling_temperature=self.sampling_temperature,
                 deterministic=self.deterministic,
                 sequence_length=self.sequence_length,
-                input_ids_length=self.prompt_sequence_length,
+                input_ids_length=input_ids_length,
                 tokenizer=self.tokenizer,
                 internal_kv_cache=self.internal_kv_cache,
             )
@@ -549,10 +561,11 @@ def prompt_inference(
                 num_tokens_processed += self.prompt_sequence_length
                 prompt_logits.append(new_logits)

-        self.engine.reset_kv_cache()
         if num_tokens_processed:
             # transfer the cache state from the multi-token engine to the main engine
             self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache)
+        else:
+            self.engine.reset_kv_cache()

         # prompt size is small, run autoregressive inference to populate kv cache
         run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]
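
For reference, a condensed sketch of the two behavioral changes above, written as standalone functions (the function names are illustrative and not part of the pipeline's API; the engine methods mirror those used in the diff):

def choose_input_ids_length(
    cache_support_enabled: bool, prompt_sequence_length: int, sequence_length: int
) -> int:
    # with KV-cache support the prompt is processed in chunks of
    # prompt_sequence_length; without it the whole prompt must fit in one pass
    return prompt_sequence_length if cache_support_enabled else sequence_length


def finalize_prompt_cache(engine, multitoken_engine, num_tokens_processed: int) -> None:
    if num_tokens_processed:
        # the multi-token engine already holds the prompt's KV cache;
        # hand it over to the engine used for token-by-token generation
        engine.transfer_cache_state(cache=multitoken_engine.kv_cache)
    else:
        # the prompt was too short for multi-token processing; start from an empty cache
        engine.reset_kv_cache()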
210 changes: 210 additions & 0 deletions tests/deepsparse/transformers/pipelines/helpers.py
@@ -0,0 +1,210 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple

import numpy
import onnx
import onnxruntime
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepsparse.transformers.utils.helpers import (
    create_causal_mask,
    overwrite_onnx_model_inputs_for_kv_cache_models,
)
from deepsparse.utils.onnx import CACHE_INPUT_PREFIX
from sparsezoo import Model


class GroundTruthSource(ABC):
    def __init__(self, model_name: str):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        self.tokenizer = tokenizer

    @abstractmethod
    def tokenize(self, prompt: str) -> Dict[str, Any]:
        """
        :param prompt: The prompt to tokenize
        :return: A dictionary of tokenized inputs
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, prompt: str) -> Any:
        """
        :param prompt: The prompt to generate from
        :return: Ground truth logits / cache state
        """
        raise NotImplementedError()


class ORTGroundTruthSource(GroundTruthSource):
"""
An object that generates ground truth logits and
cache states from a prompt. This object cannot
generate tokens in an autoregressive manner, and thus
will only output prompt logits and prompt cache state
"""

def __init__(
self,
model_stub: str,
model_name: str,
sequence_length: int = 256,
):
super().__init__(model_name)

self.model_onnx_path = Model(model_stub).deployment.get_file("model.onnx").path
overwrite_onnx_model_inputs_for_kv_cache_models(
self.model_onnx_path,
sequence_length=sequence_length,
input_ids_length=sequence_length,
)
self.sequence_length = sequence_length
self.session = onnxruntime.InferenceSession(self.model_onnx_path)
self.model_inputs = [
x.name
for x in onnx.load(
self.model_onnx_path, load_external_data=False
).graph.input
]

def tokenize(self, prompt: str):
return self.tokenizer(
prompt,
return_tensors="np",
padding="max_length",
max_length=self.sequence_length,
)

    def __call__(self, prompt: str) -> Tuple[numpy.ndarray, List[numpy.ndarray]]:
        inputs = self.tokenize(prompt)
        kv_cache = self._initialize_kv_cache_state()

        onnxruntime_inputs = dict(
            attention_mask=inputs["attention_mask"],
            input_ids=inputs["input_ids"],
            **kv_cache,
        )

        if "positions" in self.model_inputs:
            attention_mask = inputs["attention_mask"]
            positions = attention_mask.cumsum(1) * attention_mask - 1
            onnxruntime_inputs["positions"] = positions

        if "causal_mask" in self.model_inputs:
            causal_mask = create_causal_mask(
                input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
            )
            onnxruntime_inputs["causal_mask"] = causal_mask

        # run inference and return the cache state
        outputs = self.session.run(None, onnxruntime_inputs)
        prompt_logits, *prompt_cache = outputs

        # remove logits that correspond to padding tokens
        prompt_logits = numpy.compress(
            onnxruntime_inputs["attention_mask"].flatten(), prompt_logits, axis=1
        )  # (1, prompt_length, vocab_size)
        prompt_logits = prompt_logits[:, :-1, :]  # (1, prompt_length, vocab_size)

        # remove cache that corresponds to padding tokens
        prompt_cache = [
            numpy.compress(
                onnxruntime_inputs["attention_mask"].flatten(), cache, axis=2
            )
            for cache in prompt_cache
        ]  # List[(1, num_heads, past_length, head_dim)]

        return prompt_logits, prompt_cache

    def _initialize_kv_cache_state(self, length: int = 0) -> Dict[str, numpy.ndarray]:
        model = onnx.load(self.model_onnx_path, load_external_data=False)

        cache_input = next(
            input
            for input in model.graph.input
            if input.name.startswith(CACHE_INPUT_PREFIX)
        )
        # read the shape of the cache input
        batch_size = cache_input.type.tensor_type.shape.dim[0].dim_value
        num_attention_heads = cache_input.type.tensor_type.shape.dim[1].dim_value
        hidden_dims = cache_input.type.tensor_type.shape.dim[3].dim_value

        # create a kv cache dictionary
        kv_cache = {
            input_.name: numpy.zeros(
                (batch_size, num_attention_heads, length, hidden_dims),
                dtype=numpy.float32,
            )
            for input_ in model.graph.input
            if input_.name.startswith(CACHE_INPUT_PREFIX)
        }
        return kv_cache


class TorchGroundTruthSource(GroundTruthSource):
"""
An object that generates ground truth logits and
cache states from a prompt. This object can
generate tokens in an autoregressive manner, and thus
will output prompt logits, generated logits, generated
sequence and prompt cache state
"""

def __init__(self, num_tokens_to_generate: int, model_name: str):
super().__init__(model_name)
self.model = AutoModelForCausalLM.from_pretrained(model_name)
self.num_tokens_to_generate = num_tokens_to_generate

def tokenize(self, prompt: str):
return self.tokenizer(prompt, return_tensors="pt")

def __call__(
self, prompt: str
) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]:
# afaik it is not possible to get 'past_key_values' from
# the generate method, so we have to run the model twice
out = self.model.generate(
self.tokenize(prompt).input_ids,
max_new_tokens=self.num_tokens_to_generate,
output_scores=True,
return_dict_in_generate=True,
use_cache=True,
)
generated_text = self.tokenizer.decode(
out.sequences[0], skip_special_tokens=True
)
generated_logits = numpy.concatenate(
[[score.numpy() for score in out.scores]]
).transpose(
1, 0, 2
) # (1, num_tokens_to_generate, vocab_size)

out = self.model(**self.tokenize(prompt))
prompt_logits = out.logits.detach().numpy()[
:, :-1, :
] # (1, prompt_length, vocab_size)
prompt_cache = [
entry.detach().numpy()
for key_value_tuple in out.past_key_values
for entry in key_value_tuple
] # List[(1, num_heads, past_length, head_dim)]

return generated_logits, prompt_logits, prompt_cache, generated_text
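
A minimal sketch of how a test might consume these ground-truth sources (the model name, SparseZoo stub, prompt, and tolerance below are illustrative assumptions, not values taken from this commit):

import numpy

MODEL_NAME = "facebook/opt-1.3b"  # hypothetical HF model name
MODEL_STUB = "zoo:some/opt-stub"  # hypothetical SparseZoo stub for the same model
PROMPT = "Who is the president of the United States?"

torch_source = TorchGroundTruthSource(num_tokens_to_generate=16, model_name=MODEL_NAME)
generated_logits, prompt_logits, prompt_cache, generated_text = torch_source(PROMPT)

ort_source = ORTGroundTruthSource(model_stub=MODEL_STUB, model_name=MODEL_NAME)
ort_prompt_logits, ort_prompt_cache = ort_source(PROMPT)

# prompt logits from both backends should agree within a small numerical tolerance
numpy.testing.assert_allclose(prompt_logits, ort_prompt_logits, atol=1e-3)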

This file was deleted: tests/deepsparse/transformers/pipelines/proposal_text_generation_tests.md

