From 2a7e7d425d25402ee79c13875c7b70cf071314ce Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:30:46 -0700
Subject: [PATCH 1/5] update

---
 README.md                                      | 6 +++---
 examples/runtime/async_io_api.py               | 1 +
 examples/runtime/openai_batch_chat.py          | 9 ++++-----
 examples/runtime/openai_batch_complete.py      | 7 ++-----
 examples/runtime/reward_model.py               | 2 --
 python/sglang/lang/backend/runtime_endpoint.py | 5 +++--
 python/sglang/srt/server.py                    | 2 +-
 7 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index a3e7beee6d9..fc767f88d4f 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](docs/en/sampling_params.md).
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
 
@@ -202,7 +202,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -241,9 +241,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
diff --git a/examples/runtime/async_io_api.py b/examples/runtime/async_io_api.py
index d12a3a4d9df..23d3d0b90bf 100644
--- a/examples/runtime/async_io_api.py
+++ b/examples/runtime/async_io_api.py
@@ -1,5 +1,6 @@
 """
 Usage:
+
 python3 async_io.py
 """
 
diff --git a/examples/runtime/openai_batch_chat.py b/examples/runtime/openai_batch_chat.py
index 8640d092570..1081f0a69bc 100644
--- a/examples/runtime/openai_batch_chat.py
+++ b/examples/runtime/openai_batch_chat.py
@@ -1,7 +1,9 @@
 """
 Usage:
+
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 python openai_batch_chat.py
+
 Note: Before running this script, you should create the input.jsonl file with the following content:
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world! List 3 NBA players and tell a story"}],"max_tokens": 300}}
 
@@ -13,12 +15,10 @@
 import time
 
 import openai
-from openai import OpenAI
 
 
 class OpenAIBatchProcessor:
-    def __init__(self, api_key):
-        # client = OpenAI(api_key=api_key)
+    def __init__(self):
         client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
 
         self.client = client
@@ -81,8 +81,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 
 
 # Initialize the OpenAIBatchProcessor
-api_key = os.environ.get("OPENAI_API_KEY")
-processor = OpenAIBatchProcessor(api_key)
+processor = OpenAIBatchProcessor()
 
 # Process the batch job
 input_file_path = "input.jsonl"
diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index af694b54a92..b5fc04c8b2d 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -14,12 +14,10 @@
 import time
 
 import openai
-from openai import OpenAI
 
 
 class OpenAIBatchProcessor:
-    def __init__(self, api_key):
-        # client = OpenAI(api_key=api_key)
+    def __init__(self):
         client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
 
         self.client = client
@@ -82,8 +80,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 
 
 # Initialize the OpenAIBatchProcessor
-api_key = os.environ.get("OPENAI_API_KEY")
-processor = OpenAIBatchProcessor(api_key)
+processor = OpenAIBatchProcessor()
 
 # Process the batch job
 input_file_path = "input_complete.jsonl"
diff --git a/examples/runtime/reward_model.py b/examples/runtime/reward_model.py
index 3b63c8dd3f0..a18417df7f3 100644
--- a/examples/runtime/reward_model.py
+++ b/examples/runtime/reward_model.py
@@ -1,8 +1,6 @@
 # launch server
 # python -m sglang.launch_server --model LxzGordon/URM-LLaMa-3.1-8B --is-embedding
 
-import json
-
 import requests
 
 url = "http://127.0.0.1:30000"
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
index dc202ff1eeb..f43ae240aaf 100644
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -235,6 +235,7 @@ def select(
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         obj = self._generate_http_request(s, data)
         prompt_len = obj["meta_info"]["prompt_tokens"]
+        logprob_start_len = max(prompt_len - 2, 0)  # For token healing
 
         # Compute logprob
         data = {
@@ -245,7 +246,7 @@ def select(
             },
             "return_logprob": True,
             "return_text_in_logprobs": True,
-            "logprob_start_len": prompt_len - 2,  # For token healing
+            "logprob_start_len": logprob_start_len,
         }
         obj = self._generate_http_request(s, data)
 
@@ -258,8 +259,8 @@ def select(
         # Remove extra token if no token healing occurred
         for i in range(len(input_token_logprobs)):
             healed_token_str = input_token_logprobs[i][0][-1]
-            healed_token_logprob = input_token_logprobs[i][0][0]
             if s.text_.endswith(healed_token_str):
+                healed_token_logprob = input_token_logprobs[i][0][0]
                 normalized_prompt_logprobs[i] = (
                     normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
                     - healed_token_logprob
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 495319f3e3b..3d3a0d4bc50 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -615,7 +615,7 @@ async def async_generate(
                         if chunk == "data: [DONE]\n\n":
                             break
                         data = json.loads(chunk[5:].strip("\n"))
-                        if hasattr(data, "text"):
+                        if "text" in data:
                             cur = data["text"][pos:]
                             if cur:
                                 yield cur

From 6e29b00d6927585664d981065c4dd16539eefe39 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:34:07 -0700
Subject: [PATCH 2/5] fix

---
 examples/runtime/openai_batch_complete.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index b5fc04c8b2d..20173fd875b 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -10,7 +10,6 @@
 """
 
 import json
-import os
 import time
 
 import openai

From 0ff1c5d372f204576cecedbd72fdc56fc76bfdf2 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:34:59 -0700
Subject: [PATCH 3/5] fix

---
 examples/runtime/openai_batch_complete.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/runtime/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py
index 20173fd875b..2f5be3d306a 100644
--- a/examples/runtime/openai_batch_complete.py
+++ b/examples/runtime/openai_batch_complete.py
@@ -82,7 +82,7 @@ def process_batch(self, input_file_path, endpoint, completion_window):
 processor = OpenAIBatchProcessor()
 
 # Process the batch job
-input_file_path = "input_complete.jsonl"
+input_file_path = "input.jsonl"
 endpoint = "/v1/completions"
 completion_window = "24h"
 

From 60a8d24562c57215c77ef7495e4ec950e4eb8ff5 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:38:44 -0700
Subject: [PATCH 4/5] Improve docs

---
 README.md          |  2 +-
 docs/en/backend.md | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index fc767f88d4f..f094fb0017b 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
+
 Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -265,7 +266,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM
 
-
 **Embedding Models**
 
 - e5-mistral
diff --git a/docs/en/backend.md b/docs/en/backend.md
index a488a8c9d0b..be565640083 100644
--- a/docs/en/backend.md
+++ b/docs/en/backend.md
@@ -19,7 +19,8 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
+
+Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
 
@@ -58,7 +59,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -99,6 +100,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -115,6 +117,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- BaiChuan2
+- MiniCPM / MiniCPM 3
+- XVERSE / XVERSE MoE
+- SmolLM
 
 **Embedding Models**
 

From 60a232223f47d3c86b154d3ab30111637f214aec Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 28 Sep 2024 14:41:43 -0700
Subject: [PATCH 5/5] document torchao usage

---
 README.md          | 1 +
 docs/en/backend.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index f094fb0017b..157d159d074 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
   ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
diff --git a/docs/en/backend.md b/docs/en/backend.md
index be565640083..983a04784f1 100644
--- a/docs/en/backend.md
+++ b/docs/en/backend.md
@@ -80,6 +80,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
   ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
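Note on the `async_generate` change in PATCH 1/5: `json.loads` returns a plain `dict`, so the old `hasattr(data, "text")` check could never see the `"text"` key and the branch that yields new text never ran; the membership test fixes that. A minimal sketch of the difference, using an illustrative payload rather than the server's exact chunk schema (only the `"text"` field and the `data: ...\n\n` framing come from the patch; the other fields are assumptions):

```python
import json

# A streamed chunk from the /generate endpoint looks like "data: {...}\n\n".
# The payload below is illustrative; fields other than "text" are assumed.
chunk = 'data: {"text": "Once upon a time", "meta_info": {"prompt_tokens": 4}}\n\n'

data = json.loads(chunk[5:].strip("\n"))  # same parsing as in async_generate

print(hasattr(data, "text"))  # False: dict keys are not attributes, so the old check never fired
print("text" in data)         # True: the membership test matches the parsed payload
print(data["text"])           # "Once upon a time"
```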