Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

55 #4

Merged
merged 5 commits into from
Sep 9, 2024
Merged

55 #4

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/blog_v0_2/405b_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json

# Launch sglang
# python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87
# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87

# offline
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
Expand Down
5 changes: 0 additions & 5 deletions benchmark/gsm8k/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
## Download data
```
bash download_data.sh
```

## Run benchmark

### Benchmark sglang
Expand Down
30 changes: 18 additions & 12 deletions benchmark/gsm8k/bench_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tqdm import tqdm

from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl

INVALID = -9999999

Expand Down Expand Up @@ -41,24 +41,28 @@ def get_answer_value(answer_str):


def main(args):
lines = read_jsonl(args.data_path)
# Select backend
call_generate = get_call_generate(args)

# Read data
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))

# Construct prompts
k = args.num_shot
few_shot_examples = get_few_shot_examples(lines, k)
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)

questions = []
labels = []
for i in range(len(lines[: args.num_questions])):
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)

states = [None] * len(labels)

# Select backend
call_generate = get_call_generate(args)

# Run requests
if args.backend != "lmql":
# Use thread pool
Expand Down Expand Up @@ -113,11 +117,13 @@ async def batched_call(batch_size):
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
print(f"Latency: {latency:.3f}")
print(f"Invalid: {invalid:.3f}")

# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")

# Write results
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)

with open(args.result_file, "a") as fout:
Expand All @@ -138,7 +144,7 @@ async def batched_call(batch_size):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=5)
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
Expand Down
39 changes: 26 additions & 13 deletions benchmark/gsm8k/bench_sglang.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@

import numpy as np

from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl

INVALID = -9999999

Expand Down Expand Up @@ -41,15 +42,22 @@ def get_answer_value(answer_str):


def main(args):
lines = read_jsonl(args.data_path)
# Select backend
set_default_backend(select_sglang_backend(args))

# Read data
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))

# Construct prompts
k = args.num_shot
few_shot_examples = get_few_shot_examples(lines, k)
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)

questions = []
labels = []
for i in range(len(lines[: args.num_questions])):
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
Expand All @@ -72,15 +80,11 @@ def few_shot_gsm8k(s, question):
########## SGL Program End ##########
#####################################

# Select backend
backend = select_sglang_backend(args)

# Run requests
tic = time.time()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
Expand All @@ -96,11 +100,20 @@ def few_shot_gsm8k(s, question):
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
print(f"Latency: {latency:.3f}")
print(f"Invalid: {invalid:.3f}")

# Compute speed
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency

# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")

# Write results
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)

with open(args.result_file, "a") as fout:
Expand All @@ -121,7 +134,7 @@ def few_shot_gsm8k(s, question):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=5)
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
Expand Down
2 changes: 0 additions & 2 deletions benchmark/gsm8k/download_data.sh

This file was deleted.

5 changes: 0 additions & 5 deletions benchmark/hellaswag/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
## Download data
```
wget https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl
```

## Run benchmark

### Benchmark sglang
Expand Down
23 changes: 13 additions & 10 deletions benchmark/hellaswag/bench_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from tqdm import tqdm

from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
from sglang.utils import read_jsonl
from sglang.utils import download_and_cache_file, read_jsonl


def get_one_example(lines, i, include_answer):
Expand All @@ -26,25 +26,29 @@ def get_few_shot_examples(lines, k):


def main(args):
lines = read_jsonl(args.data_path)
# Select backend
call_select = get_call_select(args)

# Read data
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))

# Construct prompts
k = args.num_shot
few_shot_examples = get_few_shot_examples(lines, k)
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)

questions = []
choices = []
labels = []
for i in range(len(lines[: args.num_questions])):
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])

preds = [None] * len(labels)

# Select backend
call_select = get_call_select(args)

# Run requests
if args.backend != "lmql":
# Use thread pool
Expand All @@ -65,7 +69,6 @@ def get_one_answer(i):
total=len(questions),
)
)

else:
# Use asyncio
async def batched_call(batch_size):
Expand Down Expand Up @@ -108,7 +111,7 @@ async def batched_call(batch_size):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=20)
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
Expand Down
24 changes: 14 additions & 10 deletions benchmark/hellaswag/bench_sglang.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import numpy as np

from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl
from sglang.utils import download_and_cache_file, read_jsonl


def get_one_example(lines, i, include_answer):
Expand All @@ -26,16 +27,23 @@ def get_few_shot_examples(lines, k):


def main(args):
lines = read_jsonl(args.data_path)
# Select backend
set_default_backend(select_sglang_backend(args))

# Read data
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))

# Construct prompts
k = args.num_shot
few_shot_examples = get_few_shot_examples(lines, k)
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)

questions = []
choices = []
labels = []
for i in range(len(lines[: args.num_questions])):
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
Expand All @@ -56,15 +64,11 @@ def few_shot_hellaswag(s, question, choices):
########## SGL Program End ##########
#####################################

# Select backend
backend = select_sglang_backend(args)

# Run requests
tic = time.time()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
Expand Down Expand Up @@ -95,7 +99,7 @@ def few_shot_hellaswag(s, question, choices):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shot", type=int, default=20)
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import argparse
import csv
import json
import os
import time

Expand Down Expand Up @@ -223,7 +224,7 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
tokenizer_path=tokenizer_path,
port=cur_port,
additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4],
model_override_args=model_override_args,
json_model_override_args=json.dumps(model_override_args),
tp_size=1,
)
sgl.set_default_backend(runtime)
Expand Down
3 changes: 3 additions & 0 deletions examples/runtime/llava_onevision/http_llava_onevision_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,14 @@ def multi_image_stream_request_test(client):
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
},
"modalities": "multi-images",
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
},
"modalities": "multi-images",
},
{
"type": "text",
Expand Down Expand Up @@ -218,6 +220,7 @@ def prepare_video_messages(video_path):
frame_format = {
"type": "image_url",
"image_url": {"url": "data:image/jpeg;base64,{}"},
"modalities": "video",
}

for base64_frame in base64_frames:
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
[project.optional-dependencies]
srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
"packaging", "pillow", "psutil", "pydantic", "python-multipart",
"torch", "uvicorn", "uvloop", "zmq",
"torch", "torchao", "uvicorn", "uvloop", "zmq",
"vllm==0.5.5", "outlines>=0.0.44"]
openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
Expand Down
1 change: 1 addition & 0 deletions python/sglang/bench_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ def main(server_args, bench_args):


if __name__ == "__main__":
# TODO(kevin85421): Make the parser setup unit testable.
parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
BenchArgs.add_cli_args(parser)
Expand Down
Loading
Loading