deepjavalibrary · Lokiiiiii · Jan 30, 2025 · Jan 17, 2025 · Jan 17, 2025 · Jan 30, 2025
@@ -0,0 +1,61 @@
+# **Troubleshooting Guide**
+
+This guide provides steps and information to troubleshoot issues related to the model server and debugging tools. It is a work in progress and will eventually be moved to `serving/docs` upon finalization.
+
+---
+
+## **Profiling**
+
+> Note that profiling is still under development and the interfaces are likely to change until finalized. In its current state, this is recommended only for personal debugging.
+
+The container can be started in **DEBUG mode** by setting the environment variable `LMI_DEBUG_NSYS_ENABLED=1` (the entrypoint treats any non-empty value as enabled, so leave the variable unset to disable it). When enabled, this mode facilitates advanced profiling and debugging capabilities with the following effects:
+
+### **1. Installation of Debugging Tools**
+
+In DEBUG mode, the following tool will be installed automatically:
+
+- **[NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/)**
+  - Nsight Systems enables system-wide performance analysis.
+  - The version of Nsight can be controlled using the environment variable:
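+    - `LMI_DEBUG_NSIGHT_VERSION`: Specifies the version of Nsight Systems to install (e.g., `2024.6.1`). The value may contain only digits, dots, and hyphens. If it is not set, the install script looks up the latest version in NVIDIA's download listing.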
+
+### **2. Profiling with Nsight Systems**
+
+The model server will automatically start under the `nsys` profiler when `LMI_DEBUG_NSYS_ENABLED` is enabled. The following environment variables can be configured to customize the profiling behavior:
+
+- **`LMI_DEBUG_NSYS_PROFILE_DELAY`**:  
+  - Specifies the delay in seconds before profiling begins.  
+  - Use this to exclude startup activities and capture only relevant information.  
+  - **Default**: `30` seconds.
+
+- **`LMI_DEBUG_NSYS_PROFILE_DURATION`**:  
+  - Specifies the duration in seconds for profiling.  
+  - Avoid setting this to values larger than 600 seconds (10 minutes) to prevent generating large and unwieldy reports.  
+  - **Default**: `600` seconds.
+
+- **`LMI_DEBUG_NSYS_PROFILE_TRACE`**:  
+  - Allows customization of the APIs and operations to trace.  
+  - Examples include `cuda`, `nvtx`, `osrt`, `cudnn`, `cublas`, `mpi`, and `python-gil`.  
+  - Refer to the [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) for more details.
+  - **Default**: `cuda,nvtx,osrt,cudnn,cublas,mpi,python-gil`.
+
+### **3. Report Generation and Upload**
+
+- After profiling is complete, the generated `.nsys-rep` report will be automatically uploaded to the specified S3 bucket if the `LMI_DEBUG_S3_ARTIFACT_PATH` environment variable is provided.  
+- **`LMI_DEBUG_S3_ARTIFACT_PATH`**:  
+  - Specifies the S3 bucket and path for storing the profiling report. The value must end with a trailing `/`.  
+  - **Example**: `s3://my-bucket/profiles/`.
+
+---
+
+### **Example Usage**
+
+To enable profiling and customize its behavior:
+
+```bash
+# Pass each setting into the container with -e. Variables that are only
+# prefixed to the docker command are set for the host-side docker CLI process
+# and are not visible inside the container.
+docker run \
+  -e LMI_DEBUG_NSYS_ENABLED=1 \
+  -e LMI_DEBUG_NSIGHT_VERSION=2024.6.1 \
+  -e LMI_DEBUG_NSYS_PROFILE_DELAY=20 \
+  -e LMI_DEBUG_NSYS_PROFILE_DURATION=300 \
+  -e LMI_DEBUG_NSYS_PROFILE_TRACE="cuda,nvtx,osrt" \
+  -e LMI_DEBUG_S3_ARTIFACT_PATH="s3://my-bucket/debug-reports/" \
+  my-container
@@ -44,6 +44,8 @@ ENV MODEL_SERVER_HOME=/opt/djl
 ENV DJL_CACHE_DIR=/tmp/.djl.ai
 ENV HF_HOME=/tmp/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 RUN useradd -m -d /home/djl djl && \
     chown -R djl:djl /opt/djl

@@ -26,6 +26,8 @@ ENV HF_HOME=/tmp/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers
 ENV DNNL_DEFAULT_FPMATH_MODE=BF16
 ENV LRU_CACHE_CAPACITY=1024
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
 CMD ["serve"]

@@ -60,11 +60,18 @@ fi
 
 if [[ "$1" = "serve" ]]; then
   shift 1
-  code=77
-  while [[ code -eq 77 ]]; do
-    /usr/bin/djl-serving "$@"
-    code=$?
-  done
+  echo "LMI_DEBUG_NSYS_ENABLED=$LMI_DEBUG_NSYS_ENABLED"
+  if [[ -n "$LMI_DEBUG_NSYS_ENABLED" ]]; then
+    set -e
+    source /opt/djl/scripts/install_debug_tools.sh
+    /opt/djl/scripts/start_debug_tools.sh "$@"
+  else
+    code=77
+    while [[ code -eq 77 ]]; do
+      /usr/bin/djl-serving "$@"
+      code=$?
+    done
+  fi
 elif [[ "$1" = "partition" ]] || [[ "$1" = "train" ]]; then
   shift 1
   /usr/bin/python3 /opt/djl/partition/partition.py "$@"

@@ -9,12 +9,19 @@ fi
 
 if [[ "$1" = "serve" ]]; then
   shift 1
-  code=77
-  while [[ code -eq 77 ]]; do
-    /usr/bin/djl-serving "$@"
-    code=$?
-  done
-  exit $code
+  echo "LMI_DEBUG_NSYS_ENABLED=$LMI_DEBUG_NSYS_ENABLED"
+  if [[ -n "$LMI_DEBUG_NSYS_ENABLED" ]]; then
+    set -e
+    source /opt/djl/scripts/install_debug_tools.sh
+    /opt/djl/scripts/start_debug_tools.sh "$@"
+  else
+    code=77
+    while [[ code -eq 77 ]]; do
+      /usr/bin/djl-serving "$@"
+      code=$?
+    done
+    exit $code
+  fi
 elif [[ "$1" = "partition" ]] || [[ "$1" = "train" ]]; then
   set -e
   shift 1

@@ -55,11 +55,14 @@ ENV TORCH_NCCL_ASYNC_ERROR_HANDLING=1
 ENV TORCH_NCCL_AVOID_RECORD_STREAMS=1
 ENV SERVING_FEATURES=vllm,lmi-dist
 ENV DEBIAN_FRONTEND=noninteractive
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
 CMD ["serve"]
 
 COPY scripts scripts/
+RUN chmod -R +x scripts
 RUN mkdir -p /opt/djl/conf \
     && mkdir -p /opt/djl/deps \
     && mkdir -p /opt/djl/partition \
@@ -100,7 +103,6 @@ RUN scripts/patch_oss_dlc.sh python \
     && apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
 RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version ${djl_torch_version} \
-    && rm -rf scripts \
     && djl-serving -i ai.djl.onnxruntime:onnxruntime-engine:$djl_version \
     && djl-serving -i com.microsoft.onnxruntime:onnxruntime_gpu:$djl_onnx_version
 

@@ -47,6 +47,8 @@ ENV JAVA_OPTS="-Xmx1g -Xms1g -XX:+ExitOnOutOfMemoryError -Dai.djl.default_engine
 ENV HF_HOME=/tmp/.cache/huggingface
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 ENV PYTORCH_KERNEL_CACHE_PATH=/tmp/.cache
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 COPY distribution[s]/ ./
 RUN mv *.deb djl-serving_all.deb || true

@@ -63,6 +63,8 @@ ENV PYTORCH_VERSION=2.1.2
 ENV JAVA_OPTS="-Xmx1g -Xms1g -Xss2m -XX:+ExitOnOutOfMemoryError"
 ENV NEURON_CC_FLAGS="--logfile /tmp/compile.log --temp-dir=/tmp"
 ENV SERVING_FEATURES=vllm,lmi-dist,tnx
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
 CMD ["serve"]

@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Installs the debugging tools needed for debug mode by delegating to the
+# Nsight Systems install script.
+set -e
+
+# Sourced rather than executed, so anything that script exports and any `exit`
+# it performs take effect in this shell.
+source /opt/djl/scripts/install_nsys.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+# Define the base URL for Nsight Systems
+BASE_URL="https://developer.download.nvidia.com/devtools/nsight-systems/"
+
+# Define the installation directory (default is /opt/nvidia/nsight-systems)
+INSTALL_DIR="/opt/nvidia/nsight-systems"
+
+# Install prerequisites first: wget is needed below to look up the latest
+# version, and aria2c and expect are used by the download and install steps.
+echo "Updating system and installing prerequisites..."
+apt-get update
+apt-get install -y wget build-essential aria2 expect
+
+# Use the caller-supplied version when present; otherwise discover the newest
+# one from the download listing.
+if [ -n "${LMI_DEBUG_NSIGHT_VERSION}" ]; then
+    echo "LMI_DEBUG_NSIGHT_VERSION is set: ${LMI_DEBUG_NSIGHT_VERSION}"
+else
+    # Find the latest version dynamically: scrape installer file names from the
+    # listing, version-sort them, and keep the highest.
+    echo "Fetching the latest Nsight Systems version..."
+    LMI_DEBUG_NSIGHT_VERSION=$(wget -qO- "$BASE_URL" | grep -oP 'NsightSystems-linux-public-\K[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' | sort -V | tail -1)
+
+    if [ -z "$LMI_DEBUG_NSIGHT_VERSION" ]; then
+        echo "Failed to fetch the latest version. Exiting."
+        exit 1
+    fi
+
+    echo "Latest Nsight Systems version found: $LMI_DEBUG_NSIGHT_VERSION"
+fi
+
+# Security Validation: the version is interpolated into the download URL, so
+# only digits, dots, and hyphens are accepted.
+if [[ "${LMI_DEBUG_NSIGHT_VERSION}" =~ ^[0-9.-]+$ ]]; then
+  echo "LMI_DEBUG_NSIGHT_VERSION is valid: ${LMI_DEBUG_NSIGHT_VERSION}"
+else
+  echo "LMI_DEBUG_NSIGHT_VERSION is invalid: ${LMI_DEBUG_NSIGHT_VERSION}"
+  exit 1
+fi
+
+# Construct the download URL
+DOWNLOAD_URL="${BASE_URL}NsightSystems-linux-public-${LMI_DEBUG_NSIGHT_VERSION}.run"
+
+# Download Nsight Systems installer into the current working directory
+echo "Downloading Nsight Systems ${LMI_DEBUG_NSIGHT_VERSION}..."
+aria2c -x 16 "$DOWNLOAD_URL" -o nsight-systems-installer.run
+
+# Verify the download
+if [ ! -f "nsight-systems-installer.run" ]; then
+    echo "Download failed. Exiting."
+    exit 1
+fi
+
+# Make the installer executable
+echo "Making the installer executable..."
+chmod +x nsight-systems-installer.run
+
+# Run the installer
+echo "Running the Nsight Systems installer..."
+# The installer is not respecting the CLI arguments, so the answers are typed
+# blindly at one-second intervals: ENTER, then ACCEPT, then the install
+# directory. The here-document delimiter is unquoted, so ${INSTALL_DIR} is
+# expanded by the shell before expect reads the script.
+expect <<EOF
+spawn ./nsight-systems-installer.run --quiet --accept --target ${INSTALL_DIR}
+# Send ENTER and ACCEPT without waiting for specific prompts
+send "\r"
+sleep 1
+send "ACCEPT\r"
+sleep 1
+send "${INSTALL_DIR}\r"
+expect eof
+EOF
+
+# Add Nsight Systems to PATH for this shell and, via ~/.bashrc, for later shells
+echo "Adding Nsight Systems to PATH..."
+export PATH="${INSTALL_DIR}/pkg/bin:${PATH}"
+echo "export PATH=${INSTALL_DIR}/pkg/bin:\$PATH" >> ~/.bashrc
+# NOTE(review): PATH is already exported two lines above, so re-reading
+# ~/.bashrc here looks redundant — confirm before removing.
+source ~/.bashrc
+
+# Verify installation by checking that nsys now resolves on PATH
+echo "Verifying Nsight Systems installation..."
+if command -v nsys &>/dev/null; then
+    echo "Nsight Systems installed successfully!"
+    nsys --version
+else
+    echo "Nsight Systems installation failed."
+    exit 1
+fi
+
+# Clean up
+echo "Cleaning up installer..."
+rm -f nsight-systems-installer.run
+
+echo "Installation complete. You can now use Nsight Systems with the 'nsys' command."
@@ -10,6 +10,11 @@ else
   curl https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz -L -o s5cmd.tar.gz
 fi
 
-mkdir -p /opt/djl/bin
-tar -xvf s5cmd.tar.gz -C /opt/djl/bin
+INSTALL_DIR="/opt/djl/bin"
+
+mkdir -p "${INSTALL_DIR}"
+tar -xvf s5cmd.tar.gz -C "${INSTALL_DIR}"
 rm -rf s5cmd.tar.gz
+
+export PATH="${INSTALL_DIR}:${PATH}"
+echo "export PATH=${INSTALL_DIR}:\$PATH" >> ~/.bashrc
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -e
+
+#######################################
+# Accept a value only if it consists entirely of decimal digits.
+# Arguments: $1 - variable name, used in the printed message
+#            $2 - value to check
+# Outputs:   prints "<name> is valid: <value>" or "<name> is invalid: <value>"
+# Exits the shell with status 1 when the value is not all digits.
+#######################################
+validate_numeric_variable() {
+  local name="$1" value="$2"
+
+  # Guard clause: reject anything that is empty or contains a non-digit.
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "${name} is invalid: ${value}"
+    exit 1
+  fi
+
+  echo "${name} is valid: ${value}"
+}
+
+# Delay for start of profile capture to avoid profiling unintended setup steps.
+# Defaults to 30 when unset or empty; passed to `nsys profile --delay` below.
+LMI_DEBUG_NSYS_PROFILE_DELAY=${LMI_DEBUG_NSYS_PROFILE_DELAY:-30}
+# Security Validation
+validate_numeric_variable "LMI_DEBUG_NSYS_PROFILE_DELAY" "${LMI_DEBUG_NSYS_PROFILE_DELAY}"
+
+# Duration for profile capture to avoid diluting the profile.
+# Defaults to 600 when unset or empty; passed to `nsys profile --duration` below.
+LMI_DEBUG_NSYS_PROFILE_DURATION=${LMI_DEBUG_NSYS_PROFILE_DURATION:-600}
+# Security Validation
+validate_numeric_variable "LMI_DEBUG_NSYS_PROFILE_DURATION" "${LMI_DEBUG_NSYS_PROFILE_DURATION}"
+
+# Comma-separated list of APIs and operations to trace during the capture;
+# passed to `nsys profile --trace` below.
+LMI_DEBUG_NSYS_PROFILE_TRACE=${LMI_DEBUG_NSYS_PROFILE_TRACE:-"cuda,nvtx,osrt,cudnn,cublas,mpi,python-gil"}
+# Security Validation
+if [[ "$LMI_DEBUG_NSYS_PROFILE_TRACE" =~ ^[a-z0-9,-]+$ ]]; then
+  echo "LMI_DEBUG_NSYS_PROFILE_TRACE is valid: ${LMI_DEBUG_NSYS_PROFILE_TRACE}"
+else
+  echo "LMI_DEBUG_NSYS_PROFILE_TRACE is invalid: ${LMI_DEBUG_NSYS_PROFILE_TRACE}"
+  echo "Only lowercase letters, numbers, commas, and hyphens are allowed."
+  exit 1
+fi
+
+#######################################
+# Check that a value looks like an S3 prefix of the form s3://bucket/key/.
+# The bucket may contain lowercase letters, digits, dots, and hyphens; it may
+# be followed by any number of key segments made of letters, digits, dots,
+# underscores, and hyphens; the value must end with "/".
+# Arguments: $1 - candidate S3 path
+# Returns:   0 if the path matches, 1 otherwise
+#######################################
+is_valid_s3_artifact_path() {
+  # The pattern is kept in a variable so the shell does not reinterpret it.
+  # Each "-" sits last in its bracket expression so that it is a literal
+  # hyphen rather than a range operator.
+  local -r s3_path_regex='^s3://[a-z0-9.-]+(/[a-zA-Z0-9._-]+)*/$'
+  [[ "$1" =~ ${s3_path_regex} ]]
+}
+
+if [ -n "${LMI_DEBUG_S3_ARTIFACT_PATH}" ]; then
+  # Validate the S3 path format
+  if ! is_valid_s3_artifact_path "${LMI_DEBUG_S3_ARTIFACT_PATH}"; then
+    echo "Error: LMI_DEBUG_S3_ARTIFACT_PATH must be of the format s3://bucket/key/"
+    exit 1
+  fi
+fi
+
+# Pick the report name once so the upload below sends exactly the file that
+# nsys writes, instead of depending on the working directory being /opt/djl.
+# The random 8-character name avoids colliding with an existing report.
+report_file="$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 8).nsys-rep"
+
+# Launch the model server under the profiler, forwarding all script arguments.
+nsys profile \
+  --kill=sigkill \
+  --wait=primary \
+  --show-output true \
+  --osrt-threshold 10000 \
+  --delay "${LMI_DEBUG_NSYS_PROFILE_DELAY}" \
+  --duration "${LMI_DEBUG_NSYS_PROFILE_DURATION}" \
+  --python-backtrace=cuda \
+  --trace "${LMI_DEBUG_NSYS_PROFILE_TRACE}" \
+  --cudabacktrace all:10000 \
+  --output "${report_file}" \
+  -- djl-serving "$@" || true   # Nsys exits with non-zero code when the application is terminated due to a timeout which is expected
+
+# Upload the report only when a destination was configured.
+if [ -n "${LMI_DEBUG_S3_ARTIFACT_PATH}" ]; then
+  s5cmd cp "${report_file}" "$LMI_DEBUG_S3_ARTIFACT_PATH"
+fi
@@ -52,6 +52,8 @@ ENV PYTORCH_KERNEL_CACHE_PATH=/tmp/.cache
 ENV BITSANDBYTES_NOWELCOME=1
 ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:/usr/local/lib/python${python_version}/dist-packages/tensorrt_libs:/usr/local/lib/python${python_version}/dist-packages/tensorrt_llm/libs/:${LD_LIBRARY_PATH}
 ENV SERVING_FEATURES=trtllm
+# Making s5cmd discoverable
+ENV PATH="/opt/djl/bin:${PATH}"
 
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
 CMD ["serve"]