diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 000000000..d942da88f --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,61 @@ +# **Troubleshooting Guide** + +This guide provides steps and information to troubleshoot issues related to the model server and debugging tools. It is a work in progress and will eventually be moved to `serving/docs` upon finalization. + +--- + +## **Profiling** + +> Note that profiling is still being worked on and the interfaces are bound to change until finalized. In its current state this is only recommended for personal debugging. + +The container can be started in **DEBUG mode** by setting the environment variable `LMI_DEBUG_NSYS_ENABLED=1`. When enabled, this mode facilitates advanced profiling and debugging capabilities with the following effects: + +### **1. Installation of Debugging Tools** + +In DEBUG mode, the following tool will be installed automatically: + +- **[NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/)** + - Nsight Systems enables system-wide performance analysis. + - The version of Nsight can be controlled using the environment variable: + - `LMI_DEBUG_NSIGHT_VERSION`: Specifies the version of Nsight Systems to install (e.g., `2024.6.1`). + +### **2. Profiling with Nsight Systems** + +The model server will automatically start under the `nsys` profiler when `LMI_DEBUG_NSYS_ENABLED` is enabled. The following environment variables can be configured to customize the profiling behavior: + +- **`LMI_DEBUG_NSYS_PROFILE_DELAY`**: + - Specifies the delay in seconds before profiling begins. + - Use this to exclude startup activities and capture only relevant information. + - **Default**: `30` seconds. + +- **`LMI_DEBUG_NSYS_PROFILE_DURATION`**: + - Specifies the duration in seconds for profiling. + - Avoid setting this to values larger than 600 seconds (10 minutes) to prevent generating large and unwieldy reports. + - **Default**: `600` seconds. 
+ +- **`LMI_DEBUG_NSYS_PROFILE_TRACE`**: + - Allows customization of the APIs and operations to trace. + - Examples include `cuda`, `nvtx`, `osrt`, `cudnn`, `cublas`, `mpi`, and `python-gil`. + - Refer to the [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) for more details. + +### **3. Report Generation and Upload** + +- After profiling is complete, the generated `.nsys-rep` report will be automatically uploaded to the specified S3 bucket if the `LMI_DEBUG_S3_ARTIFACT_PATH` environment variable is provided. +- **`LMI_DEBUG_S3_ARTIFACT_PATH`**: + - Specifies the S3 bucket and path for storing the profiling report. + - **Example**: `s3://my-bucket/profiles/`. + +--- + +### **Example Usage** + +To enable profiling and customize its behavior: + +```bash +LMI_DEBUG_NSYS_ENABLED=1 \ +LMI_DEBUG_NSIGHT_VERSION=2024.6.1 \ +LMI_DEBUG_NSYS_PROFILE_DELAY=20 \ +LMI_DEBUG_NSYS_PROFILE_DURATION=300 \ +LMI_DEBUG_NSYS_PROFILE_TRACE="cuda,nvtx,osrt" \ +LMI_DEBUG_S3_ARTIFACT_PATH="s3://my-bucket/debug-reports/" \ +docker run my-container diff --git a/serving/docker/Dockerfile b/serving/docker/Dockerfile index 3cd0347df..23d072b8a 100644 --- a/serving/docker/Dockerfile +++ b/serving/docker/Dockerfile @@ -44,6 +44,8 @@ ENV MODEL_SERVER_HOME=/opt/djl ENV DJL_CACHE_DIR=/tmp/.djl.ai ENV HF_HOME=/tmp/.cache/huggingface ENV TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" RUN useradd -m -d /home/djl djl && \ chown -R djl:djl /opt/djl diff --git a/serving/docker/aarch64.Dockerfile b/serving/docker/aarch64.Dockerfile index 6433e88e2..985c330f5 100644 --- a/serving/docker/aarch64.Dockerfile +++ b/serving/docker/aarch64.Dockerfile @@ -26,6 +26,8 @@ ENV HF_HOME=/tmp/.cache/huggingface ENV TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers ENV DNNL_DEFAULT_FPMATH_MODE=BF16 ENV LRU_CACHE_CAPACITY=1024 +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" ENTRYPOINT 
["/usr/local/bin/dockerd-entrypoint.sh"] CMD ["serve"] diff --git a/serving/docker/dockerd-entrypoint-with-cuda-compat.sh b/serving/docker/dockerd-entrypoint-with-cuda-compat.sh index 14cd18e8d..8d0103656 100644 --- a/serving/docker/dockerd-entrypoint-with-cuda-compat.sh +++ b/serving/docker/dockerd-entrypoint-with-cuda-compat.sh @@ -60,11 +60,18 @@ fi if [[ "$1" = "serve" ]]; then shift 1 - code=77 - while [[ code -eq 77 ]]; do - /usr/bin/djl-serving "$@" - code=$? - done + echo "LMI_DEBUG_NSYS_ENABLED=$LMI_DEBUG_NSYS_ENABLED" + if [[ -n "$LMI_DEBUG_NSYS_ENABLED" ]]; then + set -e + source /opt/djl/scripts/install_debug_tools.sh + /opt/djl/scripts/start_debug_tools.sh "$@" + else + code=77 + while [[ code -eq 77 ]]; do + /usr/bin/djl-serving "$@" + code=$? + done + fi elif [[ "$1" = "partition" ]] || [[ "$1" = "train" ]]; then shift 1 /usr/bin/python3 /opt/djl/partition/partition.py "$@" diff --git a/serving/docker/dockerd-entrypoint.sh b/serving/docker/dockerd-entrypoint.sh index 43bd13fa4..ca4e9784c 100644 --- a/serving/docker/dockerd-entrypoint.sh +++ b/serving/docker/dockerd-entrypoint.sh @@ -9,12 +9,19 @@ fi if [[ "$1" = "serve" ]]; then shift 1 - code=77 - while [[ code -eq 77 ]]; do - /usr/bin/djl-serving "$@" - code=$? - done - exit $code + echo "LMI_DEBUG_NSYS_ENABLED=$LMI_DEBUG_NSYS_ENABLED" + if [[ -n "$LMI_DEBUG_NSYS_ENABLED" ]]; then + set -e + source /opt/djl/scripts/install_debug_tools.sh + /opt/djl/scripts/start_debug_tools.sh "$@" + else + code=77 + while [[ code -eq 77 ]]; do + /usr/bin/djl-serving "$@" + code=$? 
+ done + exit $code + fi elif [[ "$1" = "partition" ]] || [[ "$1" = "train" ]]; then set -e shift 1 diff --git a/serving/docker/lmi.Dockerfile b/serving/docker/lmi.Dockerfile index 0dfd7ae07..6abaa8a41 100644 --- a/serving/docker/lmi.Dockerfile +++ b/serving/docker/lmi.Dockerfile @@ -55,11 +55,14 @@ ENV TORCH_NCCL_ASYNC_ERROR_HANDLING=1 ENV TORCH_NCCL_AVOID_RECORD_STREAMS=1 ENV SERVING_FEATURES=vllm,lmi-dist ENV DEBIAN_FRONTEND=noninteractive +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] CMD ["serve"] COPY scripts scripts/ +RUN chmod -R +x scripts RUN mkdir -p /opt/djl/conf \ && mkdir -p /opt/djl/deps \ && mkdir -p /opt/djl/partition \ @@ -100,7 +103,6 @@ RUN scripts/patch_oss_dlc.sh python \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version ${djl_torch_version} \ - && rm -rf scripts \ && djl-serving -i ai.djl.onnxruntime:onnxruntime-engine:$djl_version \ && djl-serving -i com.microsoft.onnxruntime:onnxruntime_gpu:$djl_onnx_version diff --git a/serving/docker/pytorch-gpu.Dockerfile b/serving/docker/pytorch-gpu.Dockerfile index f80a7513f..6aff25be0 100644 --- a/serving/docker/pytorch-gpu.Dockerfile +++ b/serving/docker/pytorch-gpu.Dockerfile @@ -47,6 +47,8 @@ ENV JAVA_OPTS="-Xmx1g -Xms1g -XX:+ExitOnOutOfMemoryError -Dai.djl.default_engine ENV HF_HOME=/tmp/.cache/huggingface ENV HF_HUB_ENABLE_HF_TRANSFER=1 ENV PYTORCH_KERNEL_CACHE_PATH=/tmp/.cache +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" COPY distribution[s]/ ./ RUN mv *.deb djl-serving_all.deb || true diff --git a/serving/docker/pytorch-inf2.Dockerfile b/serving/docker/pytorch-inf2.Dockerfile index 35e28bd64..a20842c82 100644 --- a/serving/docker/pytorch-inf2.Dockerfile +++ b/serving/docker/pytorch-inf2.Dockerfile @@ -63,6 +63,8 @@ ENV PYTORCH_VERSION=2.1.2 ENV JAVA_OPTS="-Xmx1g -Xms1g -Xss2m -XX:+ExitOnOutOfMemoryError" ENV NEURON_CC_FLAGS="--logfile 
/tmp/compile.log --temp-dir=/tmp" ENV SERVING_FEATURES=vllm,lmi-dist,tnx +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] CMD ["serve"] diff --git a/serving/docker/scripts/install_debug_tools.sh b/serving/docker/scripts/install_debug_tools.sh new file mode 100644 index 000000000..f904c1ae2 --- /dev/null +++ b/serving/docker/scripts/install_debug_tools.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -e + +source /opt/djl/scripts/install_nsys.sh \ No newline at end of file diff --git a/serving/docker/scripts/install_nsys.sh b/serving/docker/scripts/install_nsys.sh new file mode 100644 index 000000000..4e8b7f7d5 --- /dev/null +++ b/serving/docker/scripts/install_nsys.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash + +# Define the base URL for Nsight Systems +BASE_URL="https://developer.download.nvidia.com/devtools/nsight-systems/" + +# Check for LMI_DEBUG_NSIGHT_VERSION +if [ -n "${LMI_DEBUG_NSIGHT_VERSION}" ]; then + # Check if the variable contains only numbers, dots, and hyphens + echo "LMI_DEBUG_NSIGHT_VERSION is set: ${LMI_DEBUG_NSIGHT_VERSION}" +else + # Find the latest version dynamically + echo "Fetching the latest Nsight Systems version..." + LMI_DEBUG_NSIGHT_VERSION=$(wget -qO- "$BASE_URL" | grep -oP 'NsightSystems-linux-public-\K[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' | sort -V | tail -1) + + if [ -z "$LMI_DEBUG_NSIGHT_VERSION" ]; then + echo "Failed to fetch the latest version. Exiting." 
+ exit 1 + fi + + echo "Latest Nsight Systems version found: $LMI_DEBUG_NSIGHT_VERSION" +fi + +# Security Validation +if [[ "${LMI_DEBUG_NSIGHT_VERSION}" =~ ^[0-9.-]+$ ]]; then + echo "LMI_DEBUG_NSIGHT_VERSION is valid: ${LMI_DEBUG_NSIGHT_VERSION}" +else + echo "LMI_DEBUG_NSIGHT_VERSION is invalid: ${LMI_DEBUG_NSIGHT_VERSION}" + exit 1 +fi + +# Construct the download URL +DOWNLOAD_URL="${BASE_URL}NsightSystems-linux-public-${LMI_DEBUG_NSIGHT_VERSION}.run" + +# Define the installation directory (default is /opt/nvidia/nsight-systems) +INSTALL_DIR="/opt/nvidia/nsight-systems" + +# Update and install prerequisites +echo "Updating system and installing prerequisites..." +apt-get update +apt-get install -y wget build-essential aria2 expect + +# Download Nsight Systems installer +echo "Downloading Nsight Systems ${LMI_DEBUG_NSIGHT_VERSION}..." +aria2c -x 16 "$DOWNLOAD_URL" -o nsight-systems-installer.run + +# Verify the download +if [ ! -f "nsight-systems-installer.run" ]; then + echo "Download failed. Exiting." + exit 1 +fi + +# Make the installer executable +echo "Making the installer executable..." +chmod +x nsight-systems-installer.run + +# Run the installer +echo "Running the Nsight Systems installer..." +# The installer is not respecting the CLI arguments +expect <> ~/.bashrc +source ~/.bashrc + +# Verify installation +echo "Verifying Nsight Systems installation..." +if command -v nsys &>/dev/null; then + echo "Nsight Systems installed successfully!" + nsys --version +else + echo "Nsight Systems installation failed." + exit 1 +fi + +# Clean up +echo "Cleaning up installer..." +rm -f nsight-systems-installer.run + +echo "Installation complete. You can now use Nsight Systems with the 'nsys' command." 
diff --git a/serving/docker/scripts/install_s5cmd.sh b/serving/docker/scripts/install_s5cmd.sh index ab6daace5..c0ad4e64c 100755 --- a/serving/docker/scripts/install_s5cmd.sh +++ b/serving/docker/scripts/install_s5cmd.sh @@ -10,6 +10,11 @@ else curl https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz -L -o s5cmd.tar.gz fi -mkdir -p /opt/djl/bin -tar -xvf s5cmd.tar.gz -C /opt/djl/bin +INSTALL_DIR="/opt/djl/bin" + +mkdir -p "${INSTALL_DIR}" +tar -xvf s5cmd.tar.gz -C "${INSTALL_DIR}" rm -rf s5cmd.tar.gz + +export PATH="${INSTALL_DIR}:${PATH}" +echo "export PATH=${INSTALL_DIR}:\$PATH" >> ~/.bashrc diff --git a/serving/docker/scripts/start_debug_tools.sh b/serving/docker/scripts/start_debug_tools.sh new file mode 100644 index 000000000..5d05200fa --- /dev/null +++ b/serving/docker/scripts/start_debug_tools.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -e + +# Function to validate numeric variables +validate_numeric_variable() { + local var_name="$1" + local var_value="$2" + + if [[ "${var_value}" =~ ^[0-9]+$ ]]; then + echo "${var_name} is valid: ${var_value}" + else + echo "${var_name} is invalid: ${var_value}" + exit 1 + fi +} + +# Delay for start of profile capture to avoid profiling unintended setup steps +LMI_DEBUG_NSYS_PROFILE_DELAY=${LMI_DEBUG_NSYS_PROFILE_DELAY:-30} +# Security Validation +validate_numeric_variable "LMI_DEBUG_NSYS_PROFILE_DELAY" "${LMI_DEBUG_NSYS_PROFILE_DELAY}" + +# Duration for profile capture to avoid diluting the profile. +LMI_DEBUG_NSYS_PROFILE_DURATION=${LMI_DEBUG_NSYS_PROFILE_DURATION:-600} +# Security Validation +validate_numeric_variable "LMI_DEBUG_NSYS_PROFILE_DURATION" "${LMI_DEBUG_NSYS_PROFILE_DURATION}" + +# Duration for profile capture to avoid diluting the profile. 
+LMI_DEBUG_NSYS_PROFILE_TRACE=${LMI_DEBUG_NSYS_PROFILE_TRACE:-"cuda,nvtx,osrt,cudnn,cublas,mpi,python-gil"} +# Security Validation +if [[ "$LMI_DEBUG_NSYS_PROFILE_TRACE" =~ ^[a-z0-9,-]+$ ]]; then + echo "LMI_DEBUG_NSYS_PROFILE_TRACE is valid: ${LMI_DEBUG_NSYS_PROFILE_TRACE}" +else + echo "LMI_DEBUG_NSYS_PROFILE_TRACE is invalid: ${LMI_DEBUG_NSYS_PROFILE_TRACE}" + echo "Only lowercase letters, numbers, commas, and hyphens are allowed." + exit 1 +fi + +if [ -n "${LMI_DEBUG_S3_ARTIFACT_PATH}" ]; then + # Validate the S3 path format + if [[ ! "$LMI_DEBUG_S3_ARTIFACT_PATH" =~ ^s3://[a-z0-9.\-]+(/([a-zA-Z0-9.\-_]+)*)?/$ ]]; then + echo "Error: LMI_DEBUG_S3_ARTIFACT_PATH must be of the format s3://bucket/key/" + exit 1 + fi +fi + +nsys profile \ + --kill=sigkill \ + --wait=primary \ + --show-output true \ + --osrt-threshold 10000 \ + --delay "${LMI_DEBUG_NSYS_PROFILE_DELAY}" \ + --duration "${LMI_DEBUG_NSYS_PROFILE_DURATION}" \ + --python-backtrace=cuda \ + --trace "${LMI_DEBUG_NSYS_PROFILE_TRACE}" \ + --cudabacktrace all:10000 \ + --output "$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 8).nsys-rep" \ + -- djl-serving "$@" || true # Nsys exits with non-zero code when the application is terminated due to a timeout which is expected + +if [ -n "${LMI_DEBUG_S3_ARTIFACT_PATH}" ]; then + s5cmd cp /opt/djl/*.nsys-rep "$LMI_DEBUG_S3_ARTIFACT_PATH" +fi \ No newline at end of file diff --git a/serving/docker/tensorrt-llm.Dockerfile b/serving/docker/tensorrt-llm.Dockerfile index b35066958..3116b769a 100644 --- a/serving/docker/tensorrt-llm.Dockerfile +++ b/serving/docker/tensorrt-llm.Dockerfile @@ -52,6 +52,8 @@ ENV PYTORCH_KERNEL_CACHE_PATH=/tmp/.cache ENV BITSANDBYTES_NOWELCOME=1 ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:/usr/local/lib/python${python_version}/dist-packages/tensorrt_libs:/usr/local/lib/python${python_version}/dist-packages/tensorrt_llm/libs/:${LD_LIBRARY_PATH} ENV SERVING_FEATURES=trtllm +# Making s5cmd discoverable +ENV PATH="/opt/djl/bin:${PATH}" ENTRYPOINT 
["/usr/local/bin/dockerd-entrypoint.sh"] CMD ["serve"]