
Commit 32c1ad4

remove ray; use vllm tensor_parallel_size
HuanzhiMao committed Jul 20, 2024
1 parent: 5ff790e
Showing 2 changed files with 23 additions and 35 deletions.
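In short: instead of sharding the benchmark questions across GPUs with Ray remote workers, the handler now hands the whole batch to a single vLLM engine and lets tensor_parallel_size split the model across the available GPUs. A minimal sketch of that pattern, separate from the commit itself (the model path and GPU count are placeholders):

# Minimal sketch of vLLM tensor parallelism; the model path and num_gpus
# values are illustrative placeholders, not taken from this commit.
from vllm import LLM, SamplingParams

num_gpus = 2
llm = LLM(
    model="some-org/some-oss-model",  # hypothetical Hugging Face model path
    dtype="float16",
    trust_remote_code=True,
    tensor_parallel_size=num_gpus,  # shard the model weights across GPUs
    disable_custom_all_reduce=True,
)
sampling_params = SamplingParams(temperature=0.7, top_p=1, max_tokens=1000)
outputs = llm.generate(["USER: What is 2 + 2?\nASSISTANT: "], sampling_params)
print(outputs[0].outputs[0].text)

Because the engine itself spans the GPUs, the handler no longer needs to chunk the dataset or collect results from Ray workers; one generate call covers the full question set.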
berkeley-function-call-leaderboard/model_handler/oss_handler.py (54 changes: 21 additions & 33 deletions)
@@ -1,7 +1,6 @@
 import json
 import os
 
-import ray
 import shortuuid
 from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING
 from model_handler.handler import BaseHandler
@@ -12,14 +11,11 @@
     language_specific_pre_processing,
 )
 
+
 class OSSHandler(BaseHandler):
     def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OSSMODEL
-        self._init_model()
-
-    def _init_model(self):
-        ray.init(ignore_reinit_error=True, num_cpus=8)
 
     def _format_prompt(prompt, function, test_category):
         SYSTEM_PROMPT = """
@@ -33,7 +29,7 @@ def _format_prompt(prompt, function, test_category):
         functions += "\n" + str(function)
         return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: "
 
-    @ray.remote(num_gpus=1)
+    @staticmethod
     def _batch_generate(
         question_jsons,
         test_category,
@@ -42,18 +38,13 @@ def _batch_generate(
         max_tokens,
         top_p,
         format_prompt_func,
-        index,
+        num_gpus,
     ):
         from vllm import LLM, SamplingParams
 
         prompts = []
         ans_jsons = []
         for line in question_jsons:
-            for key, value in FILENAME_INDEX_MAPPING.items():
-                start, end = value
-                if index >= start and index < end:
-                    test_category = key
-                    break
             ques_json = line
             prompt = augment_prompt_by_languge(ques_json["question"], test_category)
             functions = language_specific_pre_processing(
@@ -72,38 +63,35 @@ def _batch_generate(
         sampling_params = SamplingParams(
             temperature=temperature, max_tokens=max_tokens, top_p=top_p
         )
-        llm = LLM(model=model_path, dtype="float16", trust_remote_code=True)
+        llm = LLM(
+            model=model_path,
+            dtype="float16",
+            trust_remote_code=True,
+            tensor_parallel_size=num_gpus,
+            disable_custom_all_reduce=True,
+        )
         outputs = llm.generate(prompts, sampling_params)
         final_ans_jsons = []
         for output, ans_json in zip(outputs, ans_jsons):
             text = output.outputs[0].text
-            ans_json["text"] = text
+            ans_json["result"] = text
             final_ans_jsons.append(ans_json)
         return final_ans_jsons
 
     def inference(
         self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
 
-
-        chunk_size = len(test_question) // num_gpus
-        ans_handles = []
-        for i in range(0, len(test_question), chunk_size):
-            ans_handles.append(
-                self._batch_generate.remote(
-                    test_question[i : i + chunk_size],
-                    test_category,
-                    self.model_name,
-                    self.temperature,
-                    self.max_tokens,
-                    self.top_p,
-                    format_prompt_func,
-                    i,
-                )
-            )
-        ans_jsons = []
-        for ans_handle in ans_handles:
-            ans_jsons.extend(ray.get(ans_handle))
+        ans_jsons = self._batch_generate(
+            test_question,
+            test_category,
+            self.model_name,
+            self.temperature,
+            self.max_tokens,
+            self.top_p,
+            format_prompt_func,
+            num_gpus,
+        )
 
         return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}
 

Second changed file (2 additions & 2 deletions)
@@ -95,8 +95,8 @@ def load_file(test_categories):
             test_category = test_category,
             num_gpus = args.num_gpus,
         )
-        for index, res in enumerate(result[0]):
-            result_to_write = {"id": index, "result": res["text"]}
+        for index, res in enumerate(result):
+            result_to_write = {"id": index, "result": res["result"]}
             handler.write(result_to_write, file_to_open)
     else:
         for index, test_case in enumerate(tqdm(test_cases)):
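For reference, a rough sketch of how the refactored handler ends up being driven, based only on the signatures visible in this diff; the model name, test category, and question text are illustrative assumptions:

# Hypothetical usage of the class edited above; the model name, category,
# and question below are made up for illustration.
from model_handler.oss_handler import OSSHandler

handler = OSSHandler(
    model_name="some-org/some-oss-model", temperature=0.7, top_p=1, max_tokens=1000
)
test_questions = [{"question": "What's the weather like in Berkeley?"}]
results, token_stats = handler.inference(
    test_questions,  # full question list; no per-GPU chunking anymore
    "simple",        # test_category (assumed value)
    2,               # num_gpus, forwarded to vLLM's tensor_parallel_size
)
print(results[0]["result"])  # generations are now stored under "result"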
