A temporary repository for large-scale model inference with ColossalAI.
$ git clone https://github.com/hpcaitech/ColossalAI-Inference.git
$ cd ColossalAI-Inference
$ python setup.py install   # or: python setup.py develop for an editable install
# To package the distributed inference engine as a service, we rely on the Triton Python backend.
$ docker run --gpus all --name=triton_server -v /<host path>/workspace:/opt/tritonserver/host --shm-size=1g --ulimit memlock=-1 -p 10010:8000 -p 10011:8001 -p 10012:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:21.10-py3
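# Ports 8000/8001/8002 are Triton's default HTTP, gRPC, and metrics endpoints, remapped here to 10010-10012 on the host.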
$ git clone https://github.com/triton-inference-server/python_backend -b r<xx.yy>   # the branch must match the container version, e.g. r21.10 for tritonserver:21.10-py3
# Move the energon example (from this repo's examples directory) into the backend's model repository.
$ mv /examples/energon /opt/tritonserver/python_backend/models
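For orientation, a Triton Python-backend model directory holds a config.pbtxt plus a 1/model.py implementing the TritonPythonModel interface. The sketch below shows only that interface; the tensor names INPUT0/OUTPUT0 and the echo logic are placeholders, not the energon example's actual code.

# models/energon/1/model.py -- minimal sketch of the TritonPythonModel interface
import triton_python_backend_utils as pb_utils

class TritonPythonModel:
    def initialize(self, args):
        # Called once when the model is loaded; real code would set up the inference engine here.
        pass

    def execute(self, requests):
        responses = []
        for request in requests:
            # Placeholder: echo the input tensor back; the energon example runs GPT inference instead.
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy())
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses

    def finalize(self):
        # Called once when the model is unloaded.
        pass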
$ bash run_gpt.sh
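Once run_gpt.sh has the server up, it can be queried over HTTP on the remapped port. Below is a minimal client sketch using the tritonclient package; the model name "energon" and the tensor names INPUT0/OUTPUT0 are assumptions, so check the example's config.pbtxt for the real ones.

# pip install tritonclient[http]
import numpy as np
import tritonclient.http as httpclient

# Host port 10010 maps to Triton's HTTP endpoint (container port 8000).
client = httpclient.InferenceServerClient(url="localhost:10010")

# Assumed model and tensor names; adjust to match the energon example's config.pbtxt.
prompt = np.array([b"hello world"], dtype=np.object_)
inp = httpclient.InferInput("INPUT0", list(prompt.shape), "BYTES")
inp.set_data_from_numpy(prompt)

result = client.infer(model_name="energon", inputs=[inp])
print(result.as_numpy("OUTPUT0"))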