From 2a7e7d425d25402ee79c13875c7b70cf071314ce Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:30:46 -0700
Subject: [PATCH 1/5] update

---
 README.md                                      | 6 +++---
 examples/runtime/async_io_api.py               | 1 +
 examples/runtime/openai_batch_chat.py          | 9 ++++-----
 examples/runtime/openai_batch_complete.py      | 7 ++-----
 examples/runtime/reward_model.py               | 2 --
 python/sglang/lang/backend/runtime_endpoint.py | 5 +++--
 python/sglang/srt/server.py                    | 2 +-
 7 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index a3e7beee6d9..fc767f88d4f 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](docs/en/sampling_params.md).
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
 
@@ -202,7 +202,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -241,9 +241,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
diff --git a/examples/runtime/async_io_api.py b/examples/runtime/async_io_api.py
index d12a3a4d9df..23d3d0b90bf 100644
--- a/examples/runtime/async_io_api.py
+++ b/examples/runtime/async_io_api.py
@@ -1,5 +1,6 @@
 """
 Usage:
+
 python3 async_io.py
 """
 
diff --git a/examples/runtime/openai_batch_chat.py b/examples/runtime/openai_batch_chat.py
index 8640d092570..1081f0a69bc 100644
--- a/examples/runtime/openai_batch_chat.py
+++ b/examples/runtime/openai_batch_chat.py
@@ -1,7 +1,9 @@
 """
 Usage:
+
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 python openai_batch_chat.py
+
 Note: Before running this script, you should create the input.jsonl file with the following content:
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world! List 3 NBA players and tell a story"}],"max_tokens": 300}}
 
@@ -13,12 +15,10 @@
 import time
 
 import openai
-from openai import OpenAI
 
 
 class OpenAIBatchProcessor:
-    def __init__(self, api_key):
-        # client = OpenAI(api_key=api_key)
+    def __init__(self):
         client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
 
         self.client = client
@@ -81,8 +81,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 
 
 # Initialize the OpenAIBatchProcessor
-api_key = os.environ.get("OPENAI_API_KEY")
-processor = OpenAIBatchProcessor(api_key)
+processor = OpenAIBatchProcessor()
 
 # Process the batch job
 input_file_path = "input.jsonl"
diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index af694b54a92..b5fc04c8b2d 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -14,12 +14,10 @@
 import time
 
 import openai
-from openai import OpenAI
 
 
 class OpenAIBatchProcessor:
-    def __init__(self, api_key):
-        # client = OpenAI(api_key=api_key)
+    def __init__(self):
         client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
 
         self.client = client
@@ -82,8 +80,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 
 
 # Initialize the OpenAIBatchProcessor
-api_key = os.environ.get("OPENAI_API_KEY")
-processor = OpenAIBatchProcessor(api_key)
+processor = OpenAIBatchProcessor()
 
 # Process the batch job
 input_file_path = "input_complete.jsonl"
diff --git a/examples/runtime/reward_model.py b/examples/runtime/reward_model.py
index 3b63c8dd3f0..a18417df7f3 100644
--- a/examples/runtime/reward_model.py
+++ b/examples/runtime/reward_model.py
@@ -1,8 +1,6 @@
 # launch server
 # python -m sglang.launch_server --model LxzGordon/URM-LLaMa-3.1-8B --is-embedding
 
-import json
-
 import requests
 
 url = "http://127.0.0.1:30000"
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
index dc202ff1eeb..f43ae240aaf 100644
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -235,6 +235,7 @@ def select(
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         obj = self._generate_http_request(s, data)
         prompt_len = obj["meta_info"]["prompt_tokens"]
+        logprob_start_len = max(prompt_len - 2, 0)  # For token healing
 
         # Compute logprob
         data = {
@@ -245,7 +246,7 @@ def select(
             },
             "return_logprob": True,
             "return_text_in_logprobs": True,
-            "logprob_start_len": prompt_len - 2,  # For token healing
+            "logprob_start_len": logprob_start_len,
         }
         obj = self._generate_http_request(s, data)
 
@@ -258,8 +259,8 @@ def select(
         # Remove extra token if no token healing occurred
         for i in range(len(input_token_logprobs)):
             healed_token_str = input_token_logprobs[i][0][-1]
-            healed_token_logprob = input_token_logprobs[i][0][0]
             if s.text_.endswith(healed_token_str):
+                healed_token_logprob = input_token_logprobs[i][0][0]
                 normalized_prompt_logprobs[i] = (
                     normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
                     - healed_token_logprob
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 495319f3e3b..3d3a0d4bc50 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -615,7 +615,7 @@ async def async_generate(
                         if chunk == "data: [DONE]\n\n":
                             break
                         data = json.loads(chunk[5:].strip("\n"))
-                        if hasattr(data, "text"):
+                        if "text" in data:
                             cur = data["text"][pos:]
                             if cur:
                                 yield cur

From 6e29b00d6927585664d981065c4dd16539eefe39 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:34:07 -0700
Subject: [PATCH 2/5] fix

---
 examples/runtime/openai_batch_complete.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index b5fc04c8b2d..20173fd875b 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -10,7 +10,6 @@
 """
 
 import json
-import os
 import time
 
 import openai

From 0ff1c5d372f204576cecedbd72fdc56fc76bfdf2 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:34:59 -0700
Subject: [PATCH 3/5] fix

---
 examples/runtime/openai_batch_complete.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index 20173fd875b..2f5be3d306a 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -82,7 +82,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 processor = OpenAIBatchProcessor()
 
 # Process the batch job
-input_file_path = "input_complete.jsonl"
+input_file_path = "input.jsonl"
 endpoint = "/v1/completions"
 completion_window = "24h"
 

From 60a8d24562c57215c77ef7495e4ec950e4eb8ff5 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:38:44 -0700
Subject: [PATCH 4/5] Improve docs

---
 README.md          |  2 +-
 docs/en/backend.md | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index fc767f88d4f..f094fb0017b 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
+
 Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -265,7 +266,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM
 
-
 **Embedding Models**
 
 - e5-mistral
diff --git a/docs/en/backend.md b/docs/en/backend.md
index a488a8c9d0b..be565640083 100644
--- a/docs/en/backend.md
+++ b/docs/en/backend.md
@@ -19,7 +19,8 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
+
+Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
 
@@ -58,7 +59,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -99,6 +100,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -115,6 +117,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- BaiChuan2
+- MiniCPM / MiniCPM 3
+- XVERSE / XVERSE MoE
+- SmolLM
 
 **Embedding Models**
 

From 60a232223f47d3c86b154d3ab30111637f214aec Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:41:43 -0700
Subject: [PATCH 5/5] document torchao usage

---
 README.md          | 1 +
 docs/en/backend.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index f094fb0017b..157d159d074 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
   ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
diff --git a/docs/en/backend.md b/docs/en/backend.md
index be565640083..983a04784f1 100644
--- a/docs/en/backend.md
+++ b/docs/en/backend.md
@@ -80,6 +80,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
   ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
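Note on the `async_generate` change in PATCH 1/5: `json.loads` returns a plain `dict`, so the old `hasattr(data, "text")` check could never see the `"text"` key and the branch that yields new text never ran; the membership test fixes that. A minimal sketch of the difference, using an illustrative payload rather than the server's exact chunk schema (only the `"text"` field and the `data: ...\n\n` framing come from the patch; the other fields are assumptions):

```python
import json

# A streamed chunk from the /generate endpoint looks like "data: {...}\n\n".
# The payload below is illustrative; fields other than "text" are assumed.
chunk = 'data: {"text": "Once upon a time", "meta_info": {"prompt_tokens": 4}}\n\n'

data = json.loads(chunk[5:].strip("\n"))  # same parsing as in async_generate

print(hasattr(data, "text"))  # False: dict keys are not attributes, so the old check never fired
print("text" in data)         # True: the membership test matches the parsed payload
print(data["text"])           # "Once upon a time"
```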