diff --git a/README.md b/README.md index 2651d094328..65ac1350f8e 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ docker run --gpus all \ --env "HF_TOKEN=" \ --ipc=host \ lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 + python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ``` ### Method 4: Using docker compose @@ -121,7 +121,7 @@ resources: run: | conda deactivate python3 -m sglang.launch_server \ - --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ --host 0.0.0.0 \ --port 30000 ``` diff --git a/benchmark/benchmark_vllm_060/README.md b/benchmark/benchmark_vllm_060/README.md index 5a1247c5f4b..b480dabf234 100644 --- a/benchmark/benchmark_vllm_060/README.md +++ b/benchmark/benchmark_vllm_060/README.md @@ -58,12 +58,12 @@ We referred to the reproduction method in https://github.com/vllm-project/vllm/i ```bash # Llama 3.1 8B Instruct on 1 x A100 -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache -python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache +python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096 # Llama 3.1 70B Instruct on 4 x H100 -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 -python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 +python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096 # bench serving python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4 @@ -76,12 +76,12 @@ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-pro ```bash # Llama 3.1 8B Instruct on 1 x A100 -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache -python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache +python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096 # Llama 3.1 70B Instruct on 4 x H100 -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88 -python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88 +python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096 # bench serving python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000 diff --git a/benchmark/blog_v0_2/README.md b/benchmark/blog_v0_2/README.md index 57443e5fe21..7448554ee61 100644 --- a/benchmark/blog_v0_2/README.md +++ b/benchmark/blog_v0_2/README.md @@ -27,10 +27,10 @@ export HF_TOKEN=hf_token ```bash # Meta-Llama-3.1-8B-Instruct -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache # Meta-Llama-3.1-70B-Instruct -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 8 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8 # Meta-Llama-3-70B-Instruct-FP8 python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8 diff --git a/docker/compose.yaml b/docker/compose.yaml index 14801462229..c49d5c5bba5 100644 --- a/docker/compose.yaml +++ b/docker/compose.yaml @@ -17,7 +17,7 @@ services: # - SGLANG_USE_MODELSCOPE: true entrypoint: python3 -m sglang.launch_server command: - --model-path meta-llama/Meta-Llama-3.1-8B-Instruct + --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ulimits: diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml index c217f356af2..cbccb142177 100644 --- a/docker/k8s-sglang-service.yaml +++ b/docker/k8s-sglang-service.yaml @@ -32,7 +32,7 @@ spec: ports: - containerPort: 30000 command: ["python3", "-m", "sglang.launch_server"] - args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] + args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] env: - name: HF_TOKEN value: diff --git a/docs/en/benchmark_and_profiling.md b/docs/en/benchmark_and_profiling.md index 3fbd935891c..77fbbfc1b64 100644 --- a/docs/en/benchmark_and_profiling.md +++ b/docs/en/benchmark_and_profiling.md @@ -30,7 +30,7 @@ apt install nsight-systems-cli ```bash # server # set the delay and duration times according to needs -nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache +nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache # client python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048 diff --git a/docs/en/install.md b/docs/en/install.md index c9dc1d70ae4..55eed71ae7e 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -35,7 +35,7 @@ docker run --gpus all \ --env "HF_TOKEN=" \ --ipc=host \ lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 + python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ``` ### Method 4: Using docker compose @@ -75,7 +75,7 @@ resources: run: | conda deactivate python3 -m sglang.launch_server \ - --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ --host 0.0.0.0 \ --port 30000 ``` diff --git a/examples/runtime/openai_chat_with_response_prefill.py b/examples/runtime/openai_chat_with_response_prefill.py index a856019b516..1b1604b3023 100644 --- a/examples/runtime/openai_chat_with_response_prefill.py +++ b/examples/runtime/openai_chat_with_response_prefill.py @@ -1,6 +1,6 @@ """ Usage: -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 +python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000 python openai_chat.py """ @@ -10,7 +10,7 @@ client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY") response = client.chat.completions.create( - model="meta-llama/Meta-Llama-3.1-8B-Instruct", + model="meta-llama/Llama-3.1-8B-Instruct", messages=[ {"role": "system", "content": "You are a helpful AI assistant"}, { diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index fbe45bb2ffe..2c22f8d9017 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -23,13 +23,13 @@ from sglang.utils import get_exception_traceback DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" -DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" +DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index 21078e8aaa8..7be410ccb03 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -44,7 +44,7 @@ class ModelCase: # Popular models that run on CI CI_MODELS = [ - ModelCase("meta-llama/Meta-Llama-3.1-8B-Instruct"), + ModelCase("meta-llama/Llama-3.1-8B-Instruct"), ModelCase("google/gemma-2-2b"), ] diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index d92a9de96bf..5afe9b0b177 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -499,7 +499,7 @@ def test_response_prefill(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) response = client.chat.completions.create( - model="meta-llama/Meta-Llama-3.1-8B-Instruct", + model="meta-llama/Llama-3.1-8B-Instruct", messages=[ {"role": "system", "content": "You are a helpful AI assistant"}, {