Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature Branch][LLM Testing] Full Testing Harness for LLMs #1216

Merged
merged 20 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b7133a0
initial commit
dbogunowicz Aug 28, 2023
6c9ab1d
Merge branch 'main' of https://github.com/neuralmagic/deepsparse into…
dbogunowicz Aug 29, 2023
a47f977
initial commit
dbogunowicz Aug 29, 2023
90382bd
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Aug 31, 2023
37116cb
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 5, 2023
87fe78b
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 6, 2023
afb2a2a
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 7, 2023
0477551
[Feature Branch][LLM Testing] Create GroundTruthSource objects (#1219)
dbogunowicz Sep 7, 2023
a13d7a8
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 8, 2023
92163b9
fix tests
dbogunowicz Sep 8, 2023
567337f
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 8, 2023
bfe1b62
Dipika's comments plus adjusting the script to renamed variables
dbogunowicz Sep 11, 2023
27b45e6
Merge branch 'feature/damian/llm_testing_feature_branch' of https://g…
dbogunowicz Sep 11, 2023
f2693ff
remove ORT ground truth
dbogunowicz Sep 12, 2023
dd270a2
add OPT tests
dbogunowicz Sep 12, 2023
c22f5f2
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 12, 2023
a975402
Merge remote-tracking branch 'origin/main' into feature/damian/llm_te…
dbogunowicz Sep 13, 2023
646d6f5
rebase and disable tests in GHA
dbogunowicz Sep 13, 2023
17e63ee
quality
dbogunowicz Sep 13, 2023
d1379b7
Merge branch 'main' into feature/damian/llm_testing_feature_branch
dbogunowicz Sep 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 210 additions & 0 deletions tests/deepsparse/transformers/pipelines/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple

import numpy
import onnx
import onnxruntime
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepsparse.transformers.utils.helpers import create_causal_mask
from deepsparse.utils.onnx import (
CACHE_INPUT_PREFIX,
overwrite_onnx_model_inputs_for_kv_cache_models,
)
from sparsezoo import Model


class GroundTruthSource(ABC):
def __init__(self, model_name: str):
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

self.tokenizer = tokenizer

@abstractmethod
def tokenize(self, prompt: str) -> Dict[str, Any]:
"""
:param prompt: The prompt to tokenize
:return: A dictionary of tokenized inputs
"""
raise NotImplementedError()

@abstractmethod
def __call__(self, prompt: str) -> Any:
"""
:param prompt: The prompt to generate from
:return: Ground truth logits / cache state
"""
raise NotImplementedError()


class ORTGroundTruthSource(GroundTruthSource):
dbogunowicz marked this conversation as resolved.
Show resolved Hide resolved
"""
An object that generates ground truth logits and
cache states from a prompt. This object cannot
generate tokens in an autoregressive manner, and thus
will only output prompt logits and prompt cache state
"""

def __init__(
self,
model_stub: str,
model_name: str,
sequence_length: int = 256,
):
super().__init__(model_name)

self.model_onnx_path = Model(model_stub).deployment.get_file("model.onnx").path
overwrite_onnx_model_inputs_for_kv_cache_models(
self.model_onnx_path,
sequence_length=sequence_length,
input_ids_length=sequence_length,
)
self.sequence_length = sequence_length
self.session = onnxruntime.InferenceSession(self.model_onnx_path)
self.model_inputs = [
x.name
for x in onnx.load(
self.model_onnx_path, load_external_data=False
).graph.input
]

def tokenize(self, prompt: str):
return self.tokenizer(
dbogunowicz marked this conversation as resolved.
Show resolved Hide resolved
prompt,
return_tensors="np",
padding="max_length",
max_length=self.sequence_length,
)

def __call__(self, prompt: str) -> Tuple[numpy.ndarray, List[numpy.ndarray]]:
inputs = self.tokenize(prompt)
kv_cache = self._initialize_kv_cache_state()

onnxruntime_inputs = dict(
attention_mask=inputs["attention_mask"],
input_ids=inputs["input_ids"],
**kv_cache,
)

if "positions" in self.model_inputs:
attention_mask = inputs["attention_mask"]
positions = attention_mask.cumsum(1) * attention_mask - 1
onnxruntime_inputs["positions"] = positions

if "causal_mask" in self.model_inputs:
causal_mask = create_causal_mask(
input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
)
onnxruntime_inputs["causal_mask"] = causal_mask

# run inference and return the cache state
outputs = self.session.run(None, onnxruntime_inputs)
prompt_logits, *prompt_cache = outputs

# remove logits that correspond to padding tokens
prompt_logits = numpy.compress(
onnxruntime_inputs["attention_mask"].flatten(), prompt_logits, axis=1
) # (1, prompt_length, vocab_size)
prompt_logits = prompt_logits[:, :-1, :] # (1, prompt_length, vocab_size)

# remove cache that corresponds to padding tokens
prompt_cache = [
numpy.compress(
onnxruntime_inputs["attention_mask"].flatten(), cache, axis=2
)
for cache in prompt_cache
] # List[(1, num_heads, past_length, head_dim)]

return prompt_logits, prompt_cache

def _initialize_kv_cache_state(self, length: int = 0) -> Dict[str, numpy.ndarray]:
model = onnx.load(self.model_onnx_path, load_external_data=False)

cache_input = next(
input
for input in model.graph.input
if input.name.startswith(CACHE_INPUT_PREFIX)
)
# read the shape of the cache input
batch_size = cache_input.type.tensor_type.shape.dim[0].dim_value
num_attention_heads = cache_input.type.tensor_type.shape.dim[1].dim_value
hidden_dims = cache_input.type.tensor_type.shape.dim[3].dim_value

# create a kv cache dictionary
kv_cache = {
input_.name: numpy.zeros(
(batch_size, num_attention_heads, length, hidden_dims),
dtype=numpy.float32,
)
for input_ in model.graph.input
if input_.name.startswith(CACHE_INPUT_PREFIX)
}
return kv_cache


class TorchGroundTruthSource(GroundTruthSource):
"""
An object that generates ground truth logits and
cache states from a prompt. This object can
generate tokens in an autoregressive manner, and thus
will output prompt logits, generated logits, generated
sequence and prompt cache state
"""

def __init__(self, num_tokens_to_generate: int, model_name: str):
super().__init__(model_name)
self.model = AutoModelForCausalLM.from_pretrained(model_name)
self.num_tokens_to_generate = num_tokens_to_generate

def tokenize(self, prompt: str):
return self.tokenizer(prompt, return_tensors="pt")

def __call__(
self, prompt: str
) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]:
# afaik it is not possible to get 'past_key_values' from
# the generate method, so we have to run the model twice
out = self.model.generate(
self.tokenize(prompt).input_ids,
max_new_tokens=self.num_tokens_to_generate,
output_scores=True,
return_dict_in_generate=True,
use_cache=True,
)
generated_text = self.tokenizer.decode(
out.sequences[0], skip_special_tokens=True
)
generated_logits = numpy.concatenate(
[[score.numpy() for score in out.scores]]
).transpose(
1, 0, 2
) # (1, num_tokens_to_generate, vocab_size)

out = self.model(**self.tokenize(prompt))
prompt_logits = out.logits.detach().numpy()[
:, :-1, :
] # (1, prompt_length, vocab_size)
prompt_cache = [
entry.detach().numpy()
for key_value_tuple in out.past_key_values
for entry in key_value_tuple
] # List[(1, num_heads, past_length, head_dim)]

return generated_logits, prompt_logits, prompt_cache, generated_text
Loading