
Commit 32c1ad4

remove ray; use vllm tensor_parallel_size
HuanzhiMao committed Jul 20, 2024
1 parent: 5ff790e
Showing 2 changed files with 23 additions and 35 deletions.
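In short: instead of sharding the benchmark questions across GPUs with Ray remote workers, the handler now hands the whole batch to a single vLLM engine and lets tensor_parallel_size split the model across the available GPUs. A minimal sketch of that pattern, separate from the commit itself (the model path and GPU count are placeholders):

# Minimal sketch of vLLM tensor parallelism; the model path and num_gpus
# values are illustrative placeholders, not taken from this commit.
from vllm import LLM, SamplingParams

num_gpus = 2
llm = LLM(
    model="some-org/some-oss-model",  # hypothetical Hugging Face model path
    dtype="float16",
    trust_remote_code=True,
    tensor_parallel_size=num_gpus,  # shard the model weights across GPUs
    disable_custom_all_reduce=True,
)
sampling_params = SamplingParams(temperature=0.7, top_p=1, max_tokens=1000)
outputs = llm.generate(["USER: What is 2 + 2?\nASSISTANT: "], sampling_params)
print(outputs[0].outputs[0].text)

Because the engine itself spans the GPUs, the handler no longer needs to chunk the dataset or collect results from Ray workers; one generate call covers the full question set.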
berkeley-function-call-leaderboard/model_handler/oss_handler.py (54 changes: 21 additions & 33 deletions)
@@ -1,7 +1,6 @@
 import json
 import os
 
-import ray
 import shortuuid
 from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING
 from model_handler.handler import BaseHandler
@@ -12,14 +11,11 @@
     language_specific_pre_processing,
 )
 
+
 class OSSHandler(BaseHandler):
     def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OSSMODEL
-        self._init_model()
-
-    def _init_model(self):
-        ray.init(ignore_reinit_error=True, num_cpus=8)
 
     def _format_prompt(prompt, function, test_category):
         SYSTEM_PROMPT = """
@@ -33,7 +29,7 @@ def _format_prompt(prompt, function, test_category):
         functions += "\n" + str(function)
         return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: "
 
-    @ray.remote(num_gpus=1)
+    @staticmethod
     def _batch_generate(
         question_jsons,
         test_category,
@@ -42,18 +38,13 @@ def _batch_generate(
         max_tokens,
         top_p,
         format_prompt_func,
-        index,
+        num_gpus,
     ):
         from vllm import LLM, SamplingParams
 
         prompts = []
         ans_jsons = []
         for line in question_jsons:
-            for key, value in FILENAME_INDEX_MAPPING.items():
-                start, end = value
-                if index >= start and index < end:
-                    test_category = key
-                    break
             ques_json = line
             prompt = augment_prompt_by_languge(ques_json["question"], test_category)
             functions = language_specific_pre_processing(
@@ -72,38 +63,35 @@ def _batch_generate(
         sampling_params = SamplingParams(
             temperature=temperature, max_tokens=max_tokens, top_p=top_p
         )
-        llm = LLM(model=model_path, dtype="float16", trust_remote_code=True)
+        llm = LLM(
+            model=model_path,
+            dtype="float16",
+            trust_remote_code=True,
+            tensor_parallel_size=num_gpus,
+            disable_custom_all_reduce=True,
+        )
         outputs = llm.generate(prompts, sampling_params)
         final_ans_jsons = []
         for output, ans_json in zip(outputs, ans_jsons):
             text = output.outputs[0].text
-            ans_json["text"] = text
+            ans_json["result"] = text
             final_ans_jsons.append(ans_json)
         return final_ans_jsons
 
     def inference(
         self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
 
-
-        chunk_size = len(test_question) // num_gpus
-        ans_handles = []
-        for i in range(0, len(test_question), chunk_size):
-            ans_handles.append(
-                self._batch_generate.remote(
-                    test_question[i : i + chunk_size],
-                    test_category,
-                    self.model_name,
-                    self.temperature,
-                    self.max_tokens,
-                    self.top_p,
-                    format_prompt_func,
-                    i,
-                )
-            )
-        ans_jsons = []
-        for ans_handle in ans_handles:
-            ans_jsons.extend(ray.get(ans_handle))
+        ans_jsons = self._batch_generate(
+            test_question,
+            test_category,
+            self.model_name,
+            self.temperature,
+            self.max_tokens,
+            self.top_p,
+            format_prompt_func,
+            num_gpus,
+        )
 
         return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}
 

Second changed file (2 additions & 2 deletions)
@@ -95,8 +95,8 @@ def load_file(test_categories):
             test_category = test_category,
             num_gpus = args.num_gpus,
         )
-        for index, res in enumerate(result[0]):
-            result_to_write = {"id": index, "result": res["text"]}
+        for index, res in enumerate(result):
+            result_to_write = {"id": index, "result": res["result"]}
             handler.write(result_to_write, file_to_open)
     else:
         for index, test_case in enumerate(tqdm(test_cases)):
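For reference, a rough sketch of how the refactored handler ends up being driven, based only on the signatures visible in this diff; the model name, test category, and question text are illustrative assumptions:

# Hypothetical usage of the class edited above; the model name, category,
# and question below are made up for illustration.
from model_handler.oss_handler import OSSHandler

handler = OSSHandler(
    model_name="some-org/some-oss-model", temperature=0.7, top_p=1, max_tokens=1000
)
test_questions = [{"question": "What's the weather like in Berkeley?"}]
results, token_stats = handler.inference(
    test_questions,  # full question list; no per-GPU chunking anymore
    "simple",        # test_category (assumed value)
    2,               # num_gpus, forwarded to vLLM's tensor_parallel_size
)
print(results[0]["result"])  # generations are now stored under "result"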
