Branch off ShishirPatil#537.
Squashed commit of the following:

commit e65a108
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 21:50:26 2024 -0700

    update README

commit 8034aed
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 17:44:50 2024 -0700

    refactor glm_handler to simplify logic and apply fix

commit 83912f0
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 17:31:33 2024 -0700

    polish process_input section

commit 7d08daf
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 15:46:06 2024 -0700

    simplify _batch_generate logic; separate out process_input section

commit c5ac395
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 15:27:42 2024 -0700

    remove outdated gemma model name

commit b59af2c
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 14:32:23 2024 -0700

    revert, as vllm still requires ray

commit 7a275d7
Author: Huanzhi Mao <[email protected]>
Date:   Sat Jul 20 14:27:44 2024 -0700

    remove ray from requirements.txt

commit 0d1c478
Merge: 32c1ad4 7b230df
Author: Huanzhi (Hans) Mao <[email protected]>
Date:   Sat Jul 20 00:01:25 2024 -0700

    Merge branch 'main' into main

commit 32c1ad4
Author: Huanzhi Mao <[email protected]>
Date:   Fri Jul 19 23:36:42 2024 -0700

    remove ray; use vllm tensor_parallel_size

commit 5ff790e
Author: Huanzhi Mao <[email protected]>
Date:   Fri Jul 19 21:21:08 2024 -0700

    remove torch inference_mode
HuanzhiMao committed Jul 21, 2024
1 parent 7b230df commit f57da63
Showing 5 changed files with 76 additions and 142 deletions.
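Taken together, the squashed commits replace the Ray-based per-GPU chunking in the OSS handlers with a single vLLM engine that shards the model across GPUs via tensor parallelism. A minimal sketch of that pattern, assuming the vLLM Python API and using a placeholder model name and prompt, looks like this:

```python
# Minimal sketch of single-node multi-GPU inference with vLLM tensor parallelism,
# mirroring the pattern introduced in oss_handler.py below.
# The model name, prompt, and GPU count are placeholders.
from vllm import LLM, SamplingParams

prompts = ["SYSTEM: You are a helpful assistant.\nUSER: What is 2 + 2?\nASSISTANT: "]

sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=1000,
    top_p=1,
    stop_token_ids=None,  # handlers may pass model-specific stop ids (e.g. GLM)
)

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    dtype="float16",
    trust_remote_code=True,
    tensor_parallel_size=2,  # shard the weights across 2 GPUs on one node
    max_model_len=4096,
)

outputs = llm.generate(prompts, sampling_params)
answers = [output.outputs[0].text for output in outputs]
print(answers[0])
```

Instead of splitting the question list into `num_gpus` chunks and dispatching each chunk to a `@ray.remote` worker, the whole prompt list is handed to one engine and vLLM handles the intra-node parallelism itself.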
3 changes: 2 additions & 1 deletion berkeley-function-call-leaderboard/README.md
@@ -208,6 +208,7 @@ Some companies have proposed some optimization strategies in their models' handl

## Changelog

* [July 20, 2024] [#537](https://github.com/ShishirPatil/gorilla/pull/537): Update the generation script for locally-hosted OSS models to use single-node, multi-GPU inference (tensor parallel); Ray is no longer used.
* [July 16, 2024] [#525](https://github.com/ShishirPatil/gorilla/pull/525), [#536](https://github.com/ShishirPatil/gorilla/pull/536): Add new model `ibm-granite/granite-20b-functioncalling` to the leaderboard.
* [July 10, 2024] [#522](https://github.com/ShishirPatil/gorilla/pull/522): Bug fix in the evaluation dataset for Executable Parallel Multiple category. This includes updates to both prompts and function docs. 2 entries are affected.
* [July 8, 2024] [#516](https://github.com/ShishirPatil/gorilla/pull/516): Fix double-casting issue in `model_handler` for Java and JavaScript test categories.
@@ -251,7 +252,7 @@ Some companies have proposed some optimization strategies in their models' handl
* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard.
* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics).
* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)`
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`.
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `google/gemma-7b-it`.
* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation.


97 changes: 18 additions & 79 deletions berkeley-function-call-leaderboard/model_handler/glm_handler.py
@@ -21,8 +21,10 @@
class GLMHandler(OSSHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.tensor_parallel_size = 8
self.max_model_len=4096
self.stop_token_ids = [151329, 151336, 151338]


def apply_chat_template(self, prompt, function, test_category):
oai_tool = convert_to_tool(
function, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category, True
@@ -32,94 +34,31 @@ def apply_chat_template(self, prompt, function, test_category):
conversation, tokenize=False, add_generation_prompt=True
)

def _batch_generate(
self,
question_jsons,
test_category,
model_path,
temperature,
max_tokens,
top_p,
index,
llm,
):
from vllm import SamplingParams

prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
prompts.append(line)
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": line,
}
)

print("start generating: ", len(prompts))
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons


def inference(self, test_question, test_category, num_gpus):
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name, trust_remote_code=True
)

chat_template_ques_jsons = []
for line in test_question:
prompt = augment_prompt_by_languge(line["question"], test_category)
function = language_specific_pre_processing(
line["function"], test_category, False
)
chat_template_ques_jsons.append(
self.apply_chat_template(prompt, function, test_category)
)

chunk_size = len(test_question) // num_gpus
from vllm import LLM

llm = LLM(
model=self.model_name,
dtype="float16",
trust_remote_code=True,
tensor_parallel_size=self.tensor_parallel_size,
max_model_len=4096,
test_question = self.process_input(
test_question, test_category, self.apply_chat_template
)

ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=self.stop_token_ids,
max_model_len=self.max_model_len,
num_gpus=num_gpus,
)
ans_jsons = []
for i in range(0, len(test_question), chunk_size):
output = self._batch_generate(
chat_template_ques_jsons[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
i,
llm,
)
ans_jsons.extend(output)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}


def decode_ast(self, result, language="Python"):
args = result.split("\n")
if len(args) == 1:
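The refactored GLMHandler now leans on the Hugging Face tokenizer's chat template to build its prompts. A hedged sketch of that prompt-building step is below; the model id is an assumed placeholder, and folding the converted tool schema into a system message is an assumption, since the handler's actual conversation layout sits in the collapsed portion of `apply_chat_template` above.

```python
# Hedged sketch: render a tool-augmented conversation into a prompt string with
# an HF chat template, as the refactored GLMHandler does before calling vLLM.
# The model id and the system-message placement of the tool JSON are assumptions.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)

tool = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}

conversation = [
    {"role": "system", "content": "You may call these tools:\n" + json.dumps([tool])},
    {"role": "user", "content": "What's the weather in Berkeley?"},
]

prompt = tokenizer.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
print(prompt)  # a single string handed to _batch_generate, stopped on GLM's stop_token_ids
```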
@@ -60,7 +60,6 @@
"gemini-1.5-pro-preview-0409": GeminiHandler,
"gemini-1.5-pro-preview-0514": GeminiHandler,
"gemini-1.5-flash-preview-0514": GeminiHandler,
"gemma": OSSHandler,
"google/gemma-7b-it": GemmaHandler,
"glaiveai/glaive-function-calling-v1": GlaiveHandler,
"deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
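For context, the mapping above is consumed by exact model-name lookup, which is why the stale `gemma` alias is dropped in favor of the concrete `google/gemma-7b-it` entry. A toy illustration (not the repo's actual dispatch code) of that lookup pattern:

```python
# Toy illustration of name-to-handler dispatch; the real map and handler classes
# live in the repo, and the __init__ signature mirrors the one shown in this diff.
class OSSHandler:
    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
        self.model_name = model_name
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens

class GemmaHandler(OSSHandler):
    pass

handler_map = {"google/gemma-7b-it": GemmaHandler}

def get_handler(model_name: str) -> OSSHandler:
    return handler_map[model_name](model_name)  # KeyError for removed aliases like "gemma"

print(type(get_handler("google/gemma-7b-it")).__name__)  # GemmaHandler
```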
113 changes: 54 additions & 59 deletions berkeley-function-call-leaderboard/model_handler/oss_handler.py
@@ -1,9 +1,7 @@
import json
import os

import ray
import shortuuid
import torch
from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING
from model_handler.handler import BaseHandler
from model_handler.model_style import ModelStyle
@@ -13,14 +11,11 @@
language_specific_pre_processing,
)


class OSSHandler(BaseHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.model_style = ModelStyle.OSSMODEL
self._init_model()

def _init_model(self):
ray.init(ignore_reinit_error=True, num_cpus=8)

def _format_prompt(prompt, function, test_category):
SYSTEM_PROMPT = """
@@ -34,78 +29,78 @@ def _format_prompt(prompt, function, test_category):
functions += "\n" + str(function)
return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: "

@ray.remote(num_gpus=1)
@torch.inference_mode()
@staticmethod
def _batch_generate(
question_jsons,
test_category,
test_question,
model_path,
temperature,
max_tokens,
top_p,
format_prompt_func,
index,
stop_token_ids=None,
max_model_len=None,
num_gpus=8,
):
from vllm import LLM, SamplingParams

print("start generating, test question length: ", len(test_question))

sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
llm = LLM(
model=model_path,
dtype="float16",
trust_remote_code=True,
disable_custom_all_reduce=True,
max_model_len=max_model_len,
tensor_parallel_size=num_gpus,
)
outputs = llm.generate(test_question, sampling_params)

final_ans_jsons = []
for output in outputs:
text = output.outputs[0].text
final_ans_jsons.append(text)
return final_ans_jsons

@staticmethod
def process_input(test_question, test_category, format_prompt_func):
prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
ques_json = line
for ques_json in test_question:
prompt = augment_prompt_by_languge(ques_json["question"], test_category)
functions = language_specific_pre_processing(
ques_json["function"], test_category, False
)
prompts.append(format_prompt_func(prompt, functions, test_category))
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": ques_json["question"],
}
)

print("start generating: ", len(prompts))
sampling_params = SamplingParams(
temperature=temperature, max_tokens=max_tokens, top_p=top_p
)
llm = LLM(model=model_path, dtype="float16", trust_remote_code=True)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons
return prompts

def inference(
self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
self,
test_question,
test_category,
num_gpus,
format_prompt_func=_format_prompt,
stop_token_ids=None,
max_model_len=None,
):
test_question = self.process_input(
test_question, test_category, format_prompt_func
)


chunk_size = len(test_question) // num_gpus
ans_handles = []
for i in range(0, len(test_question), chunk_size):
ans_handles.append(
self._batch_generate.remote(
test_question[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
format_prompt_func,
i,
)
)
ans_jsons = []
for ans_handle in ans_handles:
ans_jsons.extend(ray.get(ans_handle))
ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=stop_token_ids,
max_model_len=max_model_len,
num_gpus=num_gpus,
)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}

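With the new `inference` signature above, subclasses only override `_format_prompt` (or a chat-template equivalent) and pass model-specific `stop_token_ids`/`max_model_len` through. A hedged usage sketch, with placeholder model name, test category, and question payload, might look like this:

```python
# Hedged usage sketch of the refactored OSSHandler.inference; run from
# berkeley-function-call-leaderboard/ so the model_handler package resolves.
# The model name, category string, and question/function payloads are placeholders.
from model_handler.oss_handler import OSSHandler

handler = OSSHandler(
    "meta-llama/Meta-Llama-3-8B-Instruct", temperature=0.7, top_p=1, max_tokens=1000
)

test_question = [
    {
        "question": "What is the weather in Berkeley today?",
        "function": [
            {
                "name": "get_weather",
                "description": "Get weather for a city.",
                "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
            }
        ],
    }
]

answers, metadata = handler.inference(
    test_question=test_question,
    test_category="simple",   # placeholder category name
    num_gpus=2,               # forwarded to vLLM as tensor_parallel_size
    stop_token_ids=None,      # e.g. GLMHandler passes its own stop ids
    max_model_len=4096,
)
print(answers[0], metadata)
```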
@@ -95,8 +95,8 @@ def load_file(test_categories):
test_category = test_category,
num_gpus = args.num_gpus,
)
for index, res in enumerate(result[0]):
result_to_write = {"id": index, "result": res["text"]}
for index, res in enumerate(result):
result_to_write = {"id": index, "result": res}
handler.write(result_to_write, file_to_open)
else:
for index, test_case in enumerate(tqdm(test_cases)):
