forked from HabanaAI/vllm-fork
Commit
Signed-off-by: Chendi Xue <[email protected]>
Showing 1 changed file with 87 additions and 0 deletions.
@@ -0,0 +1,87 @@
from vllm import LLM, SamplingParams

import argparse
import os
import json

# Default model path; override with --model.
model_path = "/data/models/DeepSeek-R1/"
# model_path = "/mnt/workdisk/dohayon/Projects/R1/DeepSeek-R1-fp8/"
# model_path = "deepseek-ai/DeepSeek-V2-Lite"

# Parse the command-line arguments.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default=model_path, help="The model path.")
parser.add_argument("--task", type=str, default="gsm8k", help="The evaluation task: gsm8k or hellaswag.")
parser.add_argument("--tokenizer", type=str, default=model_path, help="The tokenizer path.")
parser.add_argument("--tp_size", type=int, default=8, help="Tensor Parallelism size.")
parser.add_argument("--ep_size", type=int, default=4, help="Expert Parallelism size.")
parser.add_argument("-l", "--limit", type=int, default=16, help="Number of test requests per task.")
args = parser.parse_args()
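
# Example invocations (the script name "run_lm_eval.py" is illustrative):
#   python run_lm_eval.py --task gsm8k --tp_size 8 --ep_size 4 --limit 16
#   python run_lm_eval.py --task hellaswag --tp_size 8 --ep_size 1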

# HPU/Gaudi environment configuration. These are set before the engine is
# created so that vLLM and the Habana runtime pick them up.
os.environ["VLLM_SKIP_WARMUP"] = "true"
os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
if args.ep_size > 1:
    # With expert parallelism, keep the MoE computation in a single slice
    # and spread the experts across ranks.
    os.environ["VLLM_MOE_N_SLICE"] = "1"
    os.environ["VLLM_EP_SIZE"] = f"{args.ep_size}"
else:
    # Without expert parallelism, slice the MoE computation instead.
    os.environ["VLLM_MOE_N_SLICE"] = "4"
    os.environ["VLLM_EP_SIZE"] = "1"

os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["PT_HPU_WEIGHT_SHARING"] = "0"

if __name__ == "__main__":

    from lm_eval.models.vllm_causallms import VLLM
    from lm_eval import simple_evaluate

    model = args.model
    if args.tp_size == 1:
        llm = VLLM(
            pretrained=model,
            tokenizer=args.tokenizer,
            trust_remote_code=True,
            dtype="bfloat16",
            max_model_len=4096,
            gpu_memory_utilization=0.8,
        )
    else:
        llm = VLLM(
            pretrained=model,
            tokenizer=args.tokenizer,
            tensor_parallel_size=args.tp_size,
            distributed_executor_backend='mp',
            trust_remote_code=True,
            max_model_len=4096,
            dtype="bfloat16",
            gpu_memory_utilization=0.8,
        )
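
    # Note: lm_eval's VLLM wrapper forwards these arguments to the underlying
    # vllm.LLM engine; the HPU-specific behavior comes from the environment
    # variables set above, not from this call.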

    # Run the evaluation; adjust num_fewshot and batch_size as needed.
    if args.task == "gsm8k":
        results = simple_evaluate(model=llm, tasks=["gsm8k"], num_fewshot=5, batch_size=8, limit=args.limit)
        # Save the aggregate results followed by the per-sample records as JSONL.
        with open(f"gsm8k_ep{args.ep_size}_result_samples.jsonl", "w") as f:
            json.dump(results['results'], f)
            f.write("\n")
            for sample in results['samples']['gsm8k']:
                json.dump(sample, f)
                f.write("\n")
    elif args.task == "hellaswag":
        results = simple_evaluate(model=llm, tasks=["hellaswag"], num_fewshot=0, batch_size=8, limit=args.limit)
        with open(f"hellaswag_ep{args.ep_size}_result_samples.jsonl", "w") as f:
            json.dump(results['results'], f)
            f.write("\n")
            for sample in results['samples']['hellaswag']:
                json.dump(sample, f)
                f.write("\n")
    else:
        raise ValueError(f"Unsupported task: {args.task}")
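
    # Reading the output back later (a sketch; assumes the gsm8k file name):
    #   with open(f"gsm8k_ep{args.ep_size}_result_samples.jsonl") as f:
    #       aggregate = json.loads(f.readline())
    #       samples = [json.loads(line) for line in f]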

    del llm
    print("============ Completed ============")

    # Print out the results.
    print("Evaluation Results:")
    for task, metrics in results['results'].items():
        print(f"{task}: {metrics}")