# latency test enhancement - part 1 #101
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: E2E Test

# Run for pushes to main and pull requests against main that touch the
# sglang package or the tests; also allow manual runs (workflow_dispatch).
on:
  push:
    branches: [main]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [main]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: e2e-test-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Installs sglang into the runner's virtualenv, launches a server, and
  # benchmarks serving throughput against it.
  e2e-test:
    runs-on: self-hosted
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          # Activate the virtualenv under $HOME and put its bin directory on
          # PATH for the following steps.
          source "$HOME/venv/bin/activate"
          echo "$HOME/venv/bin" >> "$GITHUB_PATH"

          pip cache purge
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
          pip install --upgrade transformers

      - name: Benchmark Serving Throughput
        run: |
          # Tear the server down whenever this script exits. The step runs
          # under `bash -e`, so without the trap a failing benchmark or the
          # start-up timeout below would leave the server running on the
          # self-hosted machine and holding port 8413.
          stop_server() {
            echo "Stopping server..."
            # `|| true`: finding nothing left to kill is not an error.
            pkill -9 -f 'sglang.*Meta-Llama-3.1-8B-Instruct' || true
          }
          trap stop_server EXIT

          python3 -m sglang.launch_server \
            --model meta-llama/Meta-Llama-3.1-8B-Instruct \
            --port 8413 \
            --disable-radix-cache &

          echo "Waiting for server to start..."
          for i in {1..120}; do
            # -f: treat HTTP error responses as failure; the response body is
            # discarded so it does not clutter the job log.
            if curl -sf -o /dev/null http://127.0.0.1:8413/health; then
              echo "Server is up!"
              break
            fi
            if [ "$i" -eq 120 ]; then
              echo "Server failed to start within 120 seconds"
              exit 1
            fi
            sleep 1
          done

          # The benchmark is run from $HOME rather than the checkout directory.
          cd "$HOME" && python3 -m sglang.bench_serving \
            --backend sglang \
            --port 8413 \
            --dataset-name random \
            --num-prompts 3000 \
            --random-input 256 \
            --random-output 512