-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstart_tgi_servers.sh
22 lines (19 loc) · 971 Bytes
/
start_tgi_servers.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/bin/bash
# Shared settings for launching text-generation-inference (TGI) containers.

# Share cache directory: host path that is bind-mounted into the container
# as /data by the docker invocation further down in this script.
volume=/fastdata/hfcache/transformers/

# HF token, read from the current user's config directory.
# A missing or unreadable token file is not fatal: the token is left empty
# (the same outcome the previous `cat`-based read had) and a warning is
# printed to stderr so the operator knows why authentication may fail later.
token_file="${HOME}/.config/huggingface/token"
if [[ -r "${token_file}" ]]; then
  # $(<file) reads the file without spawning cat; like $(cat file), it
  # strips trailing newlines.
  token=$(<"${token_file}")
else
  printf 'warning: cannot read HF token from %s; continuing with an empty token\n' \
    "${token_file}" >&2
  token=
fi
# StarCoder: 8192, GPUs 0,1
#port=8192
#model=bigcode/starcoder
#docker run -d -e HUGGING_FACE_HUB_TOKEN=$token --gpus '"device=0,1"' --shm-size 1g \
# -p ${port}:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest \
# --model-id $model --trust-remote-code --dtype bfloat16 --sharded true --num-shard 2 \
# --max-total-tokens 8192 --max-input-length 8000 --max-batch-prefill-tokens 8000
# Code Llama: port 8192, GPUs 0,3
# NOTE(review): this header used to read "8193, GPUs 2,3", which did not match
# the values actually used below. The runtime values are kept unchanged here;
# confirm which port / GPU pair is really intended.
port=8192
model=codellama/CodeLlama-13b-hf
# The inner double quotes are part of the value handed to docker, exactly as
# in the original '"device=0,3"' argument.
gpus='"device=0,3"'
image=ghcr.io/huggingface/text-generation-inference:latest

# Start the TGI container in the foreground (no -d), publishing container
# port 80 on ${port} and mounting the shared cache at /data.
# The token is supplied through docker's environment pass-through
# (`-e NAME` without a value) so that it does not appear on the docker
# command line, where it would be visible in the process list.
HUGGING_FACE_HUB_TOKEN="${token}" docker run \
  -e HUGGING_FACE_HUB_TOKEN \
  --gpus "${gpus}" --shm-size 1g \
  -p "${port}:80" -v "${volume}:/data" "${image}" \
  --model-id "${model}" --trust-remote-code --dtype bfloat16 --sharded true --num-shard 2 \
  --max-total-tokens 8192 --max-input-length 8000 --max-batch-prefill-tokens 8000