[BFCL] Patch Generation Script for Locally Hosted OSS model #537

Merged: 13 commits, Jul 24, 2024
berkeley-function-call-leaderboard/README.md: 2 additions & 1 deletion
@@ -209,6 +209,7 @@ Some companies have proposed some optimization strategies in their models' handl
## Changelog

* [July 21, 2024] [#538](https://github.com/ShishirPatil/gorilla/pull/538): Fix `language_specific_pre_processing` function to properly handle pre-processing for prompts and function docs in Java and JavaScript test categories. All entries in these categories are affected.
* [July 20, 2024] [#537](https://github.com/ShishirPatil/gorilla/pull/537): Update generation script for locally-hosted OSS model to use single-node multi-GPU inference method (tensor parallel). Ray is not used anymore.
* [July 16, 2024] [#525](https://github.com/ShishirPatil/gorilla/pull/525), [#536](https://github.com/ShishirPatil/gorilla/pull/536): Add new model `ibm-granite/granite-20b-functioncalling` to the leaderboard.
* [July 10, 2024] [#522](https://github.com/ShishirPatil/gorilla/pull/522): Bug fix in the evaluation dataset for Executable Parallel Multiple category. This includes updates to both prompts and function docs. 2 entries are affected.
* [July 8, 2024] [#516](https://github.com/ShishirPatil/gorilla/pull/516): Fix double-casting issue in `model_handler` for Java and JavaScript test categories.
@@ -252,7 +253,7 @@ Some companies have proposed some optimization strategies in their models' handl
* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard.
* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics).
* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)`
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`.
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `google/gemma-7b-it`.
* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation.
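
For reference, the July 20, 2024 entry above corresponds to the pattern sketched below: single-node, multi-GPU inference with vLLM tensor parallelism. This is a minimal sketch only; the model name and prompt are placeholders, not part of the leaderboard code.

```python
# Minimal sketch: single-node, multi-GPU (tensor-parallel) generation with vLLM.
# The model name and prompt are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder OSS model
    dtype="float16",
    trust_remote_code=True,
    tensor_parallel_size=4,  # shard the model across 4 GPUs on one node
)
sampling_params = SamplingParams(temperature=0.7, top_p=1.0, max_tokens=1000)
outputs = llm.generate(
    ["USER: What is the weather in Berkeley today?\nASSISTANT: "],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```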
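
The string standardization mentioned in the April 3, 2024 entry can be pictured as follows. This is an illustrative sketch only, not the leaderboard's exact implementation; it just removes whitespace and the listed punctuation subset before comparison.

```python
# Illustrative sketch only: remove whitespace and the punctuation subset ,./-_*^
# before comparing strings, as described in the April 3, 2024 entry.
import re

def standardize_string(value: str) -> str:
    # Drop whitespace and ,./-_*^ so cosmetic differences do not fail the match.
    return re.sub(r"[\s,./\-_*^]", "", value)

assert standardize_string("Mt. Everest") == standardize_string("Mt Everest")
```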


berkeley-function-call-leaderboard/model_handler/glm_handler.py: 18 additions & 79 deletions
@@ -21,8 +21,10 @@
class GLMHandler(OSSHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.tensor_parallel_size = 8
self.max_model_len=4096
self.stop_token_ids = [151329, 151336, 151338]


def apply_chat_template(self, prompt, function, test_category):
oai_tool = convert_to_tool(
function, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category, True
@@ -32,94 +34,31 @@ def apply_chat_template(self, prompt, function, test_category):
conversation, tokenize=False, add_generation_prompt=True
)

def _batch_generate(
self,
question_jsons,
test_category,
model_path,
temperature,
max_tokens,
top_p,
index,
llm,
):
from vllm import SamplingParams

prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
prompts.append(line)
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": line,
}
)

print("start generating: ", len(prompts))
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons


def inference(self, test_question, test_category, num_gpus):
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name, trust_remote_code=True
)

chat_template_ques_jsons = []
for line in test_question:
prompt = augment_prompt_by_languge(line["question"], test_category)
function = language_specific_pre_processing(
line["function"], test_category
)
chat_template_ques_jsons.append(
self.apply_chat_template(prompt, function, test_category)
)

chunk_size = len(test_question) // num_gpus
from vllm import LLM

llm = LLM(
model=self.model_name,
dtype="float16",
trust_remote_code=True,
tensor_parallel_size=self.tensor_parallel_size,
max_model_len=4096,
test_question = self.process_input(
test_question, test_category, self.apply_chat_template
)

ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=self.stop_token_ids,
max_model_len=self.max_model_len,
num_gpus=num_gpus,
)
ans_jsons = []
for i in range(0, len(test_question), chunk_size):
output = self._batch_generate(
chat_template_ques_jsons[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
i,
llm,
)
ans_jsons.extend(output)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}


def decode_ast(self, result, language="Python"):
args = result.split("\n")
if len(args) == 1:
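
To show the pattern this handler now follows, here is a hedged sketch of a subclass that reuses the shared `OSSHandler` pipeline introduced by this PR. The class name and stop-token IDs are illustrative; the real `GLMHandler` additionally applies its own chat template via `process_input`.

```python
# Hedged sketch of a handler subclass built on the updated OSSHandler.
# The class name and stop-token IDs are illustrative, not the exact GLMHandler code.
from model_handler.oss_handler import OSSHandler


class MyLocalModelHandler(OSSHandler):
    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
        super().__init__(model_name, temperature, top_p, max_tokens)
        self.stop_token_ids = [151329, 151336, 151338]  # model-specific stop tokens
        self.max_model_len = 4096

    def inference(self, test_question, test_category, num_gpus):
        # Delegate prompt formatting and batched vLLM generation to OSSHandler,
        # forwarding the model-specific limits through the new parameters.
        return super().inference(
            test_question,
            test_category,
            num_gpus,
            stop_token_ids=self.stop_token_ids,
            max_model_len=self.max_model_len,
        )
```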
@@ -60,7 +60,6 @@
"gemini-1.5-pro-preview-0409": GeminiHandler,
"gemini-1.5-pro-preview-0514": GeminiHandler,
"gemini-1.5-flash-preview-0514": GeminiHandler,
"gemma": OSSHandler,
"google/gemma-7b-it": GemmaHandler,
"glaiveai/glaive-function-calling-v1": GlaiveHandler,
"deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
berkeley-function-call-leaderboard/model_handler/oss_handler.py: 54 additions & 59 deletions
@@ -1,9 +1,7 @@
import json
import os

import ray
import shortuuid
import torch
from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING
from model_handler.handler import BaseHandler
from model_handler.model_style import ModelStyle
@@ -13,14 +11,11 @@
language_specific_pre_processing,
)


class OSSHandler(BaseHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.model_style = ModelStyle.OSSMODEL
self._init_model()

def _init_model(self):
ray.init(ignore_reinit_error=True, num_cpus=8)

def _format_prompt(prompt, function, test_category):
SYSTEM_PROMPT = """
@@ -34,78 +29,78 @@ def _format_prompt(prompt, function, test_category):
functions += "\n" + str(function)
return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: "

@ray.remote(num_gpus=1)
@torch.inference_mode()
@staticmethod
def _batch_generate(
question_jsons,
test_category,
test_question,
model_path,
temperature,
max_tokens,
top_p,
format_prompt_func,
index,
stop_token_ids=None,
max_model_len=None,
num_gpus=1,
):
from vllm import LLM, SamplingParams

print("start generating, test question length: ", len(test_question))

sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
llm = LLM(
model=model_path,
dtype="float16",
trust_remote_code=True,
disable_custom_all_reduce=True,
max_model_len=max_model_len,
tensor_parallel_size=num_gpus,
)
outputs = llm.generate(test_question, sampling_params)

final_ans_jsons = []
for output in outputs:
text = output.outputs[0].text
final_ans_jsons.append(text)
return final_ans_jsons

@staticmethod
def process_input(test_question, test_category, format_prompt_func):
prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
ques_json = line
for ques_json in test_question:
prompt = augment_prompt_by_languge(ques_json["question"], test_category)
functions = language_specific_pre_processing(
ques_json["function"], test_category
)
prompts.append(format_prompt_func(prompt, functions, test_category))
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": ques_json["question"],
}
)

print("start generating: ", len(prompts))
sampling_params = SamplingParams(
temperature=temperature, max_tokens=max_tokens, top_p=top_p
)
llm = LLM(model=model_path, dtype="float16", trust_remote_code=True)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons
return prompts

def inference(
self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
self,
test_question,
test_category,
num_gpus,
format_prompt_func=_format_prompt,
stop_token_ids=None,
max_model_len=None,
):
test_question = self.process_input(
test_question, test_category, format_prompt_func
)


chunk_size = len(test_question) // num_gpus
ans_handles = []
for i in range(0, len(test_question), chunk_size):
ans_handles.append(
self._batch_generate.remote(
test_question[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
format_prompt_func,
i,
)
)
ans_jsons = []
for ans_handle in ans_handles:
ans_jsons.extend(ray.get(ans_handle))
ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=stop_token_ids,
max_model_len=max_model_len,
num_gpus=num_gpus,
)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}

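
For reference, a hedged sketch of calling the refactored `_batch_generate` directly with the keyword arguments visible in this diff. The prompt string and model path are placeholders; `num_gpus` now maps to vLLM's `tensor_parallel_size` instead of a Ray worker count.

```python
# Hedged sketch: exercising the refactored static _batch_generate outside the
# handler classes. The prompt string and model path are placeholders.
from model_handler.oss_handler import OSSHandler

prompts = [
    "SYSTEM: You are a helpful assistant with access to functions.\n"
    "USER: What's the weather in Berkeley?\nASSISTANT: ",
]

answers = OSSHandler._batch_generate(
    test_question=prompts,          # already-formatted prompt strings
    model_path="glaiveai/glaive-function-calling-v1",  # placeholder model path
    temperature=0.7,
    max_tokens=1000,
    top_p=1,
    stop_token_ids=None,            # or a model-specific list of token IDs
    max_model_len=4096,
    num_gpus=2,                     # one vLLM engine sharded across 2 GPUs
)
print(answers[0])
```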
@@ -90,13 +90,13 @@ def load_file(test_categories):
num_existing_result += 1

if handler.model_style == ModelStyle.OSSMODEL:
result = handler.inference(
result, metadata = handler.inference(
test_question = test_cases[num_existing_result:],
test_category = test_category,
num_gpus = args.num_gpus,
)
for index, res in enumerate(result[0]):
result_to_write = {"id": index, "result": res["text"]}
for index, res in enumerate(result):
result_to_write = {"id": index, "result": res}
handler.write(result_to_write, file_to_open)
else:
for index, test_case in enumerate(tqdm(test_cases)):
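
Putting the pieces together, here is a hedged sketch of driving an OSS handler end to end after this change: `inference()` now returns an `(answers, metadata)` pair and each answer is a plain string. The model name, test entry, category, and output file name are all placeholders, not the generation script's exact code.

```python
# Hedged sketch of the OSS-model path end to end after this PR: inference()
# returns (answers, metadata) and each answer is a plain string.
from model_handler.oss_handler import OSSHandler

handler = OSSHandler("glaiveai/glaive-function-calling-v1")  # placeholder model
test_cases = [  # toy entry in the BFCL question format
    {
        "question": "What's the weather in Berkeley?",
        "function": [{
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {"type": "dict", "properties": {"city": {"type": "string"}}},
        }],
    }
]

answers, metadata = handler.inference(
    test_question=test_cases,
    test_category="simple",  # placeholder test category
    num_gpus=2,              # passed through to vLLM as tensor_parallel_size
)

for index, answer in enumerate(answers):
    handler.write({"id": index, "result": answer}, "placeholder_result.json")
```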