Sgl project main #10

Merged 99 commits on Oct 23, 2024
Commits
01fdb2f
Fix test_vision_openai_server on CI (#1620)
ByronHsu Oct 10, 2024
e11ab79
[Performance, hardware] MoE tuning update to AMD MI300x GPUs (#1619)
HaiShaw Oct 11, 2024
c9e6658
Update README.md (#1625)
kushal34712 Oct 11, 2024
b040ed7
Update README.md (#1629)
merrymercy Oct 11, 2024
5476cca
Update README.md
merrymercy Oct 11, 2024
8275049
Add device support (#1607)
liangan1 Oct 11, 2024
58093b8
Nit about the decorator of `PortArgs.init_new` (#1611)
glen-amd Oct 11, 2024
b503881
[Bug] Fix the Image Input of Batch Generation (#1579)
OBJECT907 Oct 11, 2024
bbd72bf
Add the ability to enable and disable the Profiler via HTTP API. (#1626)
Abatom Oct 11, 2024
aba9eae
Fix the correctness test in bench_latency.py when tp > 1 and test_gen…
merrymercy Oct 11, 2024
f13d86f
Add image_token in conversation.py (#1632)
merrymercy Oct 11, 2024
81c3327
Added a "Back To Top" Button (#1633)
JanumalaAkhilendra Oct 11, 2024
5d09ca5
Fix constrained decoding (#1634)
merrymercy Oct 11, 2024
23cc66f
Add back data parallelism (#1635)
merrymercy Oct 11, 2024
00c7e63
Release v0.3.3.post1 (#1636)
merrymercy Oct 11, 2024
862cd26
[engine] support async and streaming (#1614)
ByronHsu Oct 11, 2024
dafb6a5
[Fix] Fix the style of test_large_max_new_tokens.py (#1638)
merrymercy Oct 11, 2024
1d9deea
fix missing ignore_eos in v1/chat/completions (#1642)
learninmou Oct 12, 2024
e37cdab
Fix ignore_eos (#1645)
merrymercy Oct 12, 2024
5d638c9
[Feature, Hardware] Enable SGLang on XPU GPUs via PyTorch (#1480)
liangan1 Oct 12, 2024
69aa937
Fix unit tests and type annotations (#1648)
merrymercy Oct 12, 2024
9da5a60
Add an option to disable penalizer (#1651)
merrymercy Oct 13, 2024
31fad29
Add get_tokenizer function for Engine class (#1653)
pjyi2147 Oct 13, 2024
9610fcd
Fix the batch_is_full check for jump-forward decoding (#1654)
merrymercy Oct 13, 2024
7ee6c25
Simplify the event loop and expose `--num-continuous-decode-steps` as…
merrymercy Oct 13, 2024
c3f2fc5
[doc] Add engine section in backend.md (#1656)
ByronHsu Oct 13, 2024
4876117
[Fix] fix eos trim inconsistency (#1650)
Ying1123 Oct 13, 2024
da1ffed
Add output_ids into ScheduleBatch (#1659)
merrymercy Oct 14, 2024
2725f8d
[Minor] Rename no_eos_trim to no_stop_trim (#1661)
Ying1123 Oct 14, 2024
869f1c0
Add a test case to test retract (#1662)
merrymercy Oct 14, 2024
0c1e879
Move filter_batch out of stream_output (#1663)
merrymercy Oct 14, 2024
061e546
Support double sparsity (#1459)
andy-yang-1 Oct 14, 2024
6790240
Fix unit test order to balance the tasks in CI (#1665)
merrymercy Oct 14, 2024
24f3e15
[Minor] Improve style (#1666)
merrymercy Oct 14, 2024
02bc957
Simplify chunked prefill (#1667)
merrymercy Oct 14, 2024
56503d9
[1/N] Remove `CacheConfig` import in all model files (#1658)
ByronHsu Oct 14, 2024
cd0be74
[doc] improve engine doc and add to readme (#1670)
ByronHsu Oct 15, 2024
4a292f6
[Minor] Add some utility functions (#1671)
merrymercy Oct 15, 2024
175afed
Improve benchmark scripts (#1672)
merrymercy Oct 15, 2024
f1088e0
Fix memory leak during abort (#1674)
merrymercy Oct 15, 2024
b6b4094
Fix filter_batch function call (#1681)
hnyls2002 Oct 16, 2024
a5114b6
Add OLMo model (#1676)
janimo Oct 16, 2024
9116b28
Add a new event loop (#1677)
merrymercy Oct 16, 2024
d10b933
Fix srt dependency (#1685)
ispobock Oct 16, 2024
e4b367b
[Event] Add online meetup meeting link (#1686)
Ying1123 Oct 16, 2024
dbec2f1
Launch a thread to overlap CPU and GPU (#1687)
merrymercy Oct 16, 2024
ecb8bad
Returning a per request metric for number of cached_tokens read (#1599)
havetc Oct 16, 2024
b0facb3
add orjson for jsonresponse (#1688)
michaelfeil Oct 17, 2024
d19cc0b
Update README.md (#1689)
merrymercy Oct 17, 2024
2782132
Add date to logging messages (#1623) (#1679)
zeng-zc Oct 17, 2024
02f7f3e
Update the transformers version in CI (#1690)
merrymercy Oct 17, 2024
5ab20cc
Use SGLang imports for linear layer (#1696)
janimo Oct 17, 2024
b170930
feat: radix tree code optimize (#1697)
wxsms Oct 17, 2024
e5db40d
ORJson. Faster Json serialization (#1694)
michaelfeil Oct 17, 2024
30ee363
Fix the failed unit tests (#1699)
merrymercy Oct 17, 2024
7feba41
Fix failed ci tests on long prompts; Better error messages for embedd…
merrymercy Oct 17, 2024
dd3809f
Fix engine unit test (#1701)
merrymercy Oct 17, 2024
d17d19e
Fix mixed batch for multi modal models (#1702)
merrymercy Oct 17, 2024
a95d558
Add matched_stop token or str to distinguish between eos or stop str …
g-drozdov Oct 17, 2024
9e0dac1
Fix regex and logprob conflicts when chunked prefilling (#1703)
hnyls2002 Oct 18, 2024
6d0fa73
Simplify flashinfer utilities (#1704)
merrymercy Oct 18, 2024
392f286
Add dtype for more operations (#1705)
merrymercy Oct 18, 2024
bc12d40
Add grouped free operations (#1706)
merrymercy Oct 18, 2024
2bcfba1
Skip unnecessary penalizer (#1707)
merrymercy Oct 19, 2024
f0f8a76
Simplify the nan detection and greedy check in sampler (#1709)
merrymercy Oct 19, 2024
3db43d1
Fix `is_all_ready` for overlap copy (#1710)
merrymercy Oct 19, 2024
769bf11
Fix the race condition in overlap mode (#1712)
merrymercy Oct 19, 2024
736f040
Update README.md (#1713)
merrymercy Oct 19, 2024
087257e
Release v0.3.4 (#1714)
merrymercy Oct 19, 2024
b6cd903
Update readme and workflow (#1716)
merrymercy Oct 19, 2024
12cad0f
Simplify the interface of tp_worker (#1718)
merrymercy Oct 20, 2024
8bee20f
Update vllm to 0.6.3 (#1711) (#1720)
zhyncs Oct 20, 2024
cbbc82b
Support qwen2 vl model (#1721)
zhyncs Oct 20, 2024
5c4ce65
Update README.md (#1722)
Ying1123 Oct 20, 2024
9594627
Update README.md
Ying1123 Oct 20, 2024
59cbf47
Unify the memory pool api and tp worker API (#1724)
merrymercy Oct 20, 2024
593b19f
Temporarily skip this test_mixed_batch for QWen2VL (#1725)
merrymercy Oct 20, 2024
b48edff
Split the overlapped version of TpModelWorkerClient into a separate f…
merrymercy Oct 20, 2024
554fbf9
[Bugfix] qwen2vl forward_extend (#1727)
yizhang2077 Oct 20, 2024
e12358d
Simplify the usage of device (#1734)
merrymercy Oct 21, 2024
b121bc0
Simplify batch result resolution (#1735)
merrymercy Oct 21, 2024
45d5af2
Add GLM-4 TextGeneration Model support for SGLang (#1736)
sixsixcoder Oct 21, 2024
cf470fe
Make token mapping non-blocking in the overlapped mode (#1740)
merrymercy Oct 21, 2024
09603c6
Maintain seq_lens_sum to make more FlashInfer operations non-blocking…
merrymercy Oct 21, 2024
efb099c
Fix prefill oom (#1743)
hnyls2002 Oct 21, 2024
7ce3606
Faster overlap mode scheduler (#1738)
merrymercy Oct 21, 2024
e68b9e7
misc: add CODEOWNERS (#1737)
zhyncs Oct 21, 2024
0061128
Fix sliding window attention and gemma-2 unit tests in CI (#1746)
merrymercy Oct 21, 2024
94cde10
Llama3.2 vision model support (#1551)
hnyls2002 Oct 21, 2024
5e1558f
Update `max_req_len` and `max_req_input_len` (#1748)
hnyls2002 Oct 21, 2024
1f26e8b
Release v0.3.4.post1 (#1749)
merrymercy Oct 22, 2024
17536e7
Fix edge case for truncated (#1747)
ByronHsu Oct 23, 2024
ad4125d
Fuse more ops & Simplify token mapping (#1758)
merrymercy Oct 23, 2024
2fce449
[API] add get memory pool size (#1760)
Ying1123 Oct 23, 2024
fbcbb26
Fix perf regression for set_kv_buffer (#1765)
merrymercy Oct 23, 2024
9af7b88
[Fix] Fix abort in dp (#1767)
merrymercy Oct 23, 2024
80a9054
Fix stop condition for <|eom_id|> (#1766)
merrymercy Oct 23, 2024
b7d0559
Update docs (#1768)
merrymercy Oct 23, 2024
9441d92
update
81549361 Oct 23, 2024
13 changes: 13 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,13 @@
/python/sglang/lang @merrymercy @Ying1123 @hnyls2002 @ByronHsu
/python/sglang/srt @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock @ByronHsu
/python/sglang/srt/constrained @hnyls2002
/python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock
/python/sglang/srt/lora @Ying1123
/python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002
/python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002
/python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock
/python/sglang/srt/models @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock @ByronHsu
/python/sglang/srt/openai_api @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock @ByronHsu
/python/sglang/srt/sampling @merrymercy @hnyls2002
/test/lang @merrymercy @Ying1123 @hnyls2002 @ByronHsu
/test/srt @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock @ByronHsu
5 changes: 3 additions & 2 deletions .github/workflows/pr-test-amd.yml
@@ -14,7 +14,7 @@ on:
workflow_dispatch:

concurrency:
group: pr-test-${{ github.ref }}
group: pr-test-amd-${{ github.ref }}
cancel-in-progress: true

jobs:
@@ -28,7 +28,8 @@
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -e "python[all]" --no-deps
pip install -e "python[runtime_common, test]"
pip install -e "python" --no-deps

git clone https://github.com/merrymercy/human-eval.git
cd human-eval
39 changes: 19 additions & 20 deletions .github/workflows/pr-test.yml
@@ -29,7 +29,7 @@ jobs:
run: |
pip install --upgrade pip
pip install -e "python[dev]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Run test
@@ -49,14 +49,14 @@
run: |
pip install --upgrade pip
pip install -e "python[dev]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite minimal --range-begin 0 --range-end 7
python3 run_suite.py --suite minimal --range-begin 0 --range-end 5

unit-test-backend-part-2:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -69,14 +69,14 @@
run: |
pip install --upgrade pip
pip install -e "python[dev]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Run test
timeout-minutes: 20
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite minimal --range-begin 7 --range-end 14
python3 run_suite.py --suite minimal --range-begin 5 --range-end 17

unit-test-backend-part-3:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -89,14 +89,14 @@
run: |
pip install --upgrade pip
pip install -e "python[dev]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Run test
timeout-minutes: 20
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite minimal --range-begin 14
python3 run_suite.py --suite minimal --range-begin 17

performance-test-1-gpu-part-1:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -109,7 +109,7 @@
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Benchmark Single Latency
@@ -147,7 +147,7 @@
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Benchmark Offline Throughput (w/o RadixAttention)
@@ -179,7 +179,7 @@
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Benchmark Offline Throughput (TP=2)
@@ -211,7 +211,7 @@
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

git clone https://github.com/merrymercy/human-eval.git
@@ -235,7 +235,7 @@
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.44
pip install transformers==4.45.2
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

git clone https://github.com/merrymercy/human-eval.git
@@ -255,12 +255,11 @@
python3 test_mla.py
python3 test_mla_fp8.py

# Temporarily disabled
#- name: Evaluate Data Parallelism Accuracy (TP=2)
# timeout-minutes: 10
# run: |
# cd test/srt
# python3 test_data_parallelism.py
- name: Evaluate Data Parallelism Accuracy (DP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 test_data_parallelism.py

finish:
needs: [
74 changes: 55 additions & 19 deletions README.md
@@ -1,5 +1,5 @@
<div align="center">
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
<div align="center" id="sglangtop">
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

[![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,20 +11,18 @@

--------------------------------------------------------------------------------

| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

## Upcoming Events
- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

## News
- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

<details>
<summary>More</summary>

- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
- [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -58,23 +56,27 @@ You can install SGLang using any of the methods below.
pip install --upgrade pip
pip install "sglang[all]"

# Install FlashInfer CUDA kernels
# Install FlashInfer accelerated kernels
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
```

**Important: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.**

### Method 2: From source
```
# Use the last release branch
git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
cd sglang

pip install --upgrade pip
pip install -e "python[all]"

# Install FlashInfer CUDA kernels
# Install FlashInfer accelerated kernels
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
```

**Important: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.**

### Method 3: Using docker
The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -228,7 +230,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
```
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps the CPU scheduler with GPU computation and can accelerate almost all workloads. This does not currently work with constrained decoding.
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not currently work with FP8.
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -242,6 +245,35 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
```

### Engine Without HTTP Server

We also provide an inference engine **without an HTTP server**. For example:

```python
import sglang as sgl

def main():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = {"temperature": 0.8, "top_p": 0.95}
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
print("===============================")
print(f"Prompt: {prompt}\nGenerated text: {output['text']}")

if __name__ == "__main__":
main()
```

This can be used for offline batch inference and building custom servers.
You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).
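
As a concrete illustration of the "building custom servers" use case, the sketch below wraps the engine in a small FastAPI app. This is not part of SGLang or this PR: FastAPI/uvicorn, the `/generate` route, and the request schema are illustrative assumptions; only `sgl.Engine` and `generate()` come from the example above.

```python
# Hypothetical sketch: serve sgl.Engine behind a minimal FastAPI app.
# FastAPI/uvicorn, the route name, and the request schema are assumptions,
# not part of SGLang.
import sglang as sgl
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

class GenerateRequest(BaseModel):
    prompt: str
    temperature: float = 0.8
    top_p: float = 0.95

@app.post("/generate")
def generate(req: GenerateRequest):
    sampling_params = {"temperature": req.temperature, "top_p": req.top_p}
    outputs = llm.generate([req.prompt], sampling_params)
    # Each output dict exposes the generated text under "text", as in the example above.
    return {"text": outputs[0]["text"]}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```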

### Supported Models

**Generative Models**
@@ -271,6 +303,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- MiniCPM / MiniCPM 3
- XVERSE / XVERSE MoE
- SmolLM
- GLM-4

**Embedding Models**

@@ -407,7 +440,6 @@ print(state["answer_1"])
```

#### More Examples

Anthropic and VertexAI (Gemini) models are also supported.
You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -578,14 +610,18 @@ def chat_example(s):
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

## Benchmark And Performance
![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)

Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).

## Roadmap
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

## Citation And Acknowledgment
Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


<p align="center">
<a href="#sglangtop" target="_blank">
<bold>Back To Top </bold>
</a>
</p>
2 changes: 1 addition & 1 deletion benchmark/llava_bench/README.md
@@ -17,7 +17,7 @@ pip3 install "torch>=2.1.2" "transformers>=4.36" pillow
### Benchmark sglang
Launch a server
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```

Run benchmark
2 changes: 1 addition & 1 deletion benchmark/llava_bench/bench_sglang.py
@@ -20,7 +20,7 @@ def image_qa(s, image_file, question):


def main(args):
lines = read_jsonl(args.question_file)[: args.num_questions]
lines = list(read_jsonl(args.question_file))[: args.num_questions]
arguments = [
{
"image_file": os.path.abspath(args.image_folder + "/" + l["image"]),
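
The `list(...)` wrapper in this change matters if `read_jsonl` yields rows lazily: a generator cannot be sliced with `[: n]`. A minimal, SGLang-independent sketch of the failure mode and the fix (the lazy reader here is a stand-in, not SGLang's actual `read_jsonl`):

```python
# Stand-in for a lazy JSONL reader; not SGLang's implementation.
import itertools

def lazy_rows():
    yield from range(100)

try:
    lazy_rows()[:10]                 # TypeError: generators are not subscriptable
except TypeError as err:
    print(err)

first = list(lazy_rows())[:10]                        # the fix used here: materialize, then slice
also_first = list(itertools.islice(lazy_rows(), 10))  # alternative that avoids reading everything
print(first == also_first)                            # True
```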
31 changes: 30 additions & 1 deletion docs/en/backend.md
@@ -79,7 +79,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
```
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not currently work with FP8.
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -93,6 +93,35 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
```

### Engine Without HTTP Server

We also provide an inference engine **without an HTTP server**. For example:

```python
import sglang as sgl

def main():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = {"temperature": 0.8, "top_p": 0.95}
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
print("===============================")
print(f"Prompt: {prompt}\nGenerated text: {output['text']}")

if __name__ == "__main__":
main()
```

This can be used for offline batch inference and building custom servers.
You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).
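
For the offline batch inference use case, a small sketch of driving the same engine from a JSONL file is shown below; the file names and the `{"prompt": ...}` record layout are placeholders, while `sgl.Engine` and `generate()` are taken from the example above.

```python
# Offline batch inference sketch; prompts.jsonl / outputs.jsonl and the record
# layout are placeholders, not part of SGLang.
import json
import sglang as sgl

llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

with open("prompts.jsonl") as fin:
    prompts = [json.loads(line)["prompt"] for line in fin]

outputs = llm.generate(prompts, {"temperature": 0.8, "top_p": 0.95})

with open("outputs.jsonl", "w") as fout:
    for prompt, output in zip(prompts, outputs):
        fout.write(json.dumps({"prompt": prompt, "text": output["text"]}) + "\n")
```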

### Supported Models

**Generative Models**
6 changes: 5 additions & 1 deletion docs/en/benchmark_and_profiling.md
@@ -46,4 +46,8 @@ pip install nvtx
import nvtx
with nvtx.annotate("description", color="color"):
# some critical code
```
```

## Other tips

1. You can benchmark a model using dummy weights by providing only the `config.json` file. This allows quick testing of model variants without training them. To do so, add `--load-format dummy` to the commands above; then you only need a correct `config.json` under the checkpoint folder (a scripted sketch of this workflow is shown below).
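
A possible way to script this tip: download only `config.json` from the Hub and launch the server with `--load-format dummy`. The repo id, local folder, and port below are placeholders, and `huggingface_hub` is an assumed dependency rather than something this PR adds.

```python
# Sketch of dummy-weight benchmarking: fetch only config.json, then launch the
# server with --load-format dummy. Repo id, folder, and port are placeholders.
import subprocess
from huggingface_hub import hf_hub_download

ckpt_dir = "./llama3-8b-config-only"
hf_hub_download(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    filename="config.json",
    local_dir=ckpt_dir,
)

subprocess.run([
    "python3", "-m", "sglang.launch_server",
    "--model-path", ckpt_dir,
    "--load-format", "dummy",
    "--port", "30000",
])
```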
1 change: 0 additions & 1 deletion docs/en/frontend.md
@@ -68,7 +68,6 @@ print(state["answer_1"])
```

#### More Examples

Anthropic and VertexAI (Gemini) models are also supported.
You can find more examples at [examples/quick_start](https://github.com/sgl-project/sglang/tree/main/examples/frontend_language/quick_start).
