Convert to Triton Punica kernels (#658)

predibase · Nov 5, 2024 · 902a68c · 902a68c
1 parent b3944ad
commit 902a68c
Show file tree

Hide file tree

Showing 67 changed files with 4,198 additions and 954 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Dockerfile b/Dockerfile
@@ -216,7 +216,7 @@ COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-31
 RUN pip install einops --no-cache-dir
 
 # Install flashinfer
-RUN pip install --no-cache-dir flashinfer==0.1.5+cu124torch2.4 -i https://flashinfer.ai/whl/cu124/torch2.4
+RUN pip install --no-cache-dir flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu124/torch2.4
 
 # Install server
 COPY proto proto

diff --git a/clients/python/lorax/client.py b/clients/python/lorax/client.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import requests
 from requests.adapters import HTTPAdapter, Retry
 
@@ -20,7 +21,22 @@
 from lorax.errors import parse_error
 import os 
 
-LORAX_DEBUG_MODE = os.getenv("LORAD_DEBUG_MODE", None) is not None
+LORAX_DEBUG_MODE = os.getenv("LORAX_DEBUG_MODE", None) is not None
+if LORAX_DEBUG_MODE:
+    # https://stackoverflow.com/a/16630836/1869739
+    # These two lines enable debugging at httplib level (requests->urllib3->http.client)
+    # You will see the REQUEST, including HEADERS and DATA, and RESPONSE with HEADERS but without DATA.
+    # The only thing missing will be the response.body which is not logged.
+    import http.client as http_client
+    http_client.HTTPConnection.debuglevel = 1
+
+    # You must initialize logging, otherwise you'll not see debug output.
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.DEBUG)
+    requests_log = logging.getLogger("requests.packages.urllib3")
+    requests_log.setLevel(logging.DEBUG)
+    requests_log.propagate = True
+
 
 class Client:
     """Client to make calls to a LoRAX instance

diff --git a/docs/guides/contributing/development_env.md b/docs/guides/contributing/development_env.md
@@ -47,12 +47,12 @@ We'll be working out of three different terminals during development, each servi
 Install development dependencies:
 
 ```shell
-DEBIAN_FRONTEND=noninteractive apt install pkg-config rsync tmux rust-gdb git -y
+DEBIAN_FRONTEND=noninteractive apt install pkg-config rsync tmux rust-gdb git -y && \
 PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
     unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
-    rm -f $PROTOC_ZIP
+    rm -f $PROTOC_ZIP && \
 hash -r
 ```
 
@@ -71,8 +71,7 @@ tmux new -s server
 From within the `tmux` session, move into the LoRAX `server` directory within the repo (assumed to be in `/data/lorax`) and install dependencies:
 
 ```shell
-cd /data/lorax/server
-pip install -e .
+cd /data/lorax/server && pip install -e .
 make gen-server
 ```
 
@@ -95,9 +94,9 @@ tmux new -s router
 Now move into the `router` directory within the repo and install dependencies:
 
 ```shell
-cd /data/lorax/router
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-export PATH=$PATH:$HOME/.cargo/bin
+cd /data/lorax/router && \
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
+export PATH=$PATH:$HOME/.cargo/bin && \
 touch ../proto/generate.proto
 ```
 

diff --git a/docs/guides/contributing/index.md b/docs/guides/contributing/index.md
@@ -23,3 +23,22 @@ make export-requirements
 ```
 
 Never modify `requirements.txt` directly, as it may introduce dependency conflicts.
+
+## Profiling
+
+LoRAX supports the [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) for measuring its performance.
+
+You can enable profiling when launching LoRAX by setting the `LORAX_PROFILER_DIR` environment variable to the directory
+where the TensorBoard traces should be written.
+
+Once initialized, LoRAX will begin recording traces for requests sent to the server. Because traces can get very large,
+we record only the first 10 prefill requests (plus any decode requests between them), then stop recording and write
+out the results. A summary will be printed to stdout when this occurs.
+
+Once the traces have been written to the profiler directory, you can visualize them in TensorBoard using the
+[PyTorch Profiler TensorBoard Plugin](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+
+```bash
+pip install torch_tb_profiler
+tensorboard --logdir=$LORAX_PROFILER_DIR
+```
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
@@ -11,6 +11,7 @@ clap = { version = "4.1.4", features = ["derive", "env"] }
 ctrlc = { version = "3.2.5", features = ["termination"] }
 nix = "0.26.2"
 openssl = "0.10.66"
+hf-hub = { version = "0.3.0", features = ["tokio"] }
 h2 = "0.3.26"
 rustix = "0.37.25"
 serde = { version = "1.0.152", features = ["derive"]  }