Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add long context tests and fix tokenizer truncation for activated rope_scaling #524

Merged
merged 25 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
23b7782
Add long context tests and fix tokenizer truncation for activated rope_scaling
arnocandel Jul 21, 2023
db8e125
Speed up long context tests.
arnocandel Jul 21, 2023
e99e0b7
Use xfail.
arnocandel Jul 21, 2023
9afcbe0
Constrain tests for now.
arnocandel Jul 21, 2023
7cbab04
Only do one long context test.
arnocandel Jul 22, 2023
80bd4d1
Use max_position_embeddings instead of max_length. Add alpha_value.
arnocandel Jul 24, 2023
936339c
Remove exllama parameter.
arnocandel Jul 24, 2023
ea2e49d
Merge remote-tracking branch 'origin/main' into long-context
arnocandel Jul 24, 2023
dd98629
Avoid corner case.
arnocandel Jul 24, 2023
137d66f
Show that even orig case isn't perfect.
arnocandel Jul 24, 2023
1fb6e8c
Merge remote-tracking branch 'origin/main' into long-context
arnocandel Jul 25, 2023
6263df7
Use uuid based key/values. Use the actual tokenizer to properly count…
arnocandel Jul 25, 2023
bc25e1a
Add long context test from transformers PR.
arnocandel Jul 25, 2023
81460eb
Add llama2 7b
arnocandel Jul 26, 2023
abf14e9
Fork test.
arnocandel Jul 26, 2023
2ee23d1
Increase test coverage.
arnocandel Jul 26, 2023
d8d4dc2
Cosmetics for easier debugging.
arnocandel Jul 26, 2023
4566265
Only enable passing tests with dynamic for now, don't need linear.
arnocandel Jul 26, 2023
9672d12
Expect scaled max_seq_len
arnocandel Jul 26, 2023
1d183c4
Merge remote-tracking branch 'origin/main' into long-context
arnocandel Jul 26, 2023
c348663
Fix merge conflict.
arnocandel Jul 26, 2023
45e40aa
Remove unused arg.
arnocandel Jul 26, 2023
d98cf3c
Merge remote-tracking branch 'origin/main' into long-context
arnocandel Jul 27, 2023
112eec9
Merge remote-tracking branch 'origin/main' into long-context
arnocandel Jul 27, 2023
76cefa7
Cleanup
arnocandel Jul 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def get_client(serialize=True):
return client


def get_args(prompt, prompt_type, chat=False, stream_output=False,
def get_args(prompt, prompt_type=None, chat=False, stream_output=False,
max_new_tokens=50,
top_k_docs=3,
langchain_mode='Disabled',
Expand Down Expand Up @@ -245,9 +245,9 @@ def test_client_chat_stream(prompt_type='human_bot'):
langchain_agents=[])


def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens,
def run_client_chat(prompt, stream_output, max_new_tokens,
langchain_mode, langchain_action, langchain_agents,
prompt_dict=None):
prompt_type=None, prompt_dict=None):
client = get_client(serialize=False)

kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
Expand Down
24 changes: 15 additions & 9 deletions src/gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,7 +1071,7 @@ def get_model(
else:
tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, **tokenizer_kwargs)
# sets raw (no cushion) limit
set_model_max_len(config, tokenizer, verbose=False)
set_model_max_len(config, tokenizer, verbose=False, rope_scaling=rope_scaling)
# if using fake tokenizer, not really accurate when lots of numbers, give a bit of buffer, else get:
# Generation Failed: Input validation error: `inputs` must have less than 2048 tokens. Given: 2233
tokenizer.model_max_length = tokenizer.model_max_length - 50
Expand Down Expand Up @@ -1313,27 +1313,33 @@ def get_hf_model(load_8bit: bool = False,
if torch.__version__ >= "2" and sys.platform != "win32" and compile_model:
model = torch.compile(model)

set_model_max_len(config, tokenizer, verbose=False, reward_type=reward_type)
set_model_max_len(config, tokenizer, verbose=False, reward_type=reward_type, rope_scaling=rope_scaling)

return model, tokenizer, device


def set_model_max_len(config, tokenizer, verbose=False, reward_type=False):
def set_model_max_len(config, tokenizer, verbose=False, reward_type=False, rope_scaling=None):
rope_scaling_factor = 1
if rope_scaling:
rope_scaling_factor = rope_scaling.get('factor')
assert isinstance(rope_scaling_factor, int)
if reward_type:
# limit deberta, else uses too much memory and not worth response score
tokenizer.model_max_length = 512
if hasattr(config, 'max_seq_len') and isinstance(config.max_seq_len, int):
tokenizer.model_max_length = config.max_seq_len
tokenizer.model_max_length = config.max_seq_len * rope_scaling_factor
elif hasattr(config, 'max_position_embeddings') and isinstance(config.max_position_embeddings, int):
# help automatically limit inputs to generate
tokenizer.model_max_length = config.max_position_embeddings
tokenizer.model_max_length = config.max_position_embeddings * rope_scaling_factor
else:
if verbose:
print("Could not determine model_max_length, setting to 2048", flush=True)
tokenizer.model_max_length = 2048
print(f"Could not determine model_max_length, setting to {2048 * rope_scaling_factor}", flush=True)
# hopefully not for Llama2 models
tokenizer.model_max_length = 2048 * rope_scaling_factor
# for bug in HF transformers
if tokenizer.model_max_length > 100000000:
tokenizer.model_max_length = 2048
# hopefully not for Llama2 models
tokenizer.model_max_length = 2048 * rope_scaling_factor


def pop_unused_model_kwargs(model_kwargs):
Expand Down Expand Up @@ -2007,7 +2013,7 @@ def evaluate(
where_from=where_from, extra_dict=extra_dict)
return
else:
assert not inference_server, "inferene_server=%s not supported" % inference_server
assert not inference_server, "inference_server=%s not supported" % inference_server

if isinstance(tokenizer, str):
# pipeline
Expand Down
Loading