From ed023bcdedd93407a8dd10eda94400270e054818 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 10:53:33 +0100 Subject: [PATCH 1/4] Solved an issue where lighteval vllm would hang indefinitely in multi node settings --- src/lighteval/models/vllm/vllm_model.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 3398f7218..b8ddcf1e2 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -93,6 +93,7 @@ class VLLMModelConfig: ) pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. generation_parameters: GenerationParameters = None # sampling parameters to use for generation + enforce_eager: bool = False # whether or not to disable cuda graphs with vllm subfolder: Optional[str] = None @@ -136,13 +137,19 @@ def tokenizer(self): return self._tokenizer def cleanup(self): - destroy_model_parallel() + if ray is not None: + ray.get(ray.remote(destroy_model_parallel).remote()) + else: + destroy_model_parallel() if self.model is not None: del self.model.llm_engine.model_executor.driver_worker self.model = None gc.collect() ray.shutdown() - destroy_distributed_environment() + if ray is not None: + ray.get(ray.remote(destroy_distributed_environment).remote()) + else: + destroy_distributed_environment() torch.cuda.empty_cache() @property @@ -182,6 +189,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) -> "max_model_len": self._max_length, "swap_space": 4, "seed": 1234, + "enforce_eager": config.enforce_eager, } if int(config.data_parallel_size) > 1: self.model_args["worker_use_ray"] = True From c66362831f985c9f9ccbd8accbbae49409f82741 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 13:08:22 +0100 Subject: [PATCH 2/4] Allows using custom OpenAI endpoint (for instance with vLLM) --- src/lighteval/models/endpoints/openai_model.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py index 37b8ca347..c3ad8f98e 100644 --- a/src/lighteval/models/endpoints/openai_model.py +++ b/src/lighteval/models/endpoints/openai_model.py @@ -84,7 +84,7 @@ class OpenAIClient(LightevalModel): def __init__(self, config: OpenAIModelConfig, env_config) -> None: api_key = os.environ["OPENAI_API_KEY"] - self.client = OpenAI(api_key=api_key) + self.client = OpenAI(api_key=api_key, base_url=os.getenv("OPENAI_BASE_URL")) self.generation_parameters = config.generation_parameters self.sampling_params = self.generation_parameters.to_vllm_openai_dict() @@ -99,7 +99,19 @@ def __init__(self, config: OpenAIModelConfig, env_config) -> None: self.API_RETRY_MULTIPLIER = 2 self.CONCURENT_CALLS = 100 self.model = config.model - self._tokenizer = tiktoken.encoding_for_model(self.model) + try: + self._tokenizer = tiktoken.encoding_for_model(self.model) + except KeyError: + if "TOKENIZER_PATH" in os.environ: + from transformers import AutoTokenizer + + self._tokenizer = AutoTokenizer.from_pretrained(os.getenv("TOKENIZER_PATH")) + elif os.path.exists(self.model) and os.path.isdir(self.model): + from transformers import AutoTokenizer + + self._tokenizer = AutoTokenizer.from_pretrained(self.model) + else: + raise self.pairwise_tokenization = False def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_bias): From 
b42ec3f8e5e0fc0acf2642366f4cf8ec3fd05cf0 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 14:30:41 +0100 Subject: [PATCH 3/4] Added documentation for lighteval vllm and an example for the lost souls --- docs/source/use-vllm-as-backend.mdx | 49 ++++++++++++++++++++++++ examples/slurm/multi_node_vllm.slurm | 57 ++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 examples/slurm/multi_node_vllm.slurm diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 787848c36..42e421241 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -47,3 +47,52 @@ Available arguments for `vllm` can be found in the `VLLMModelConfig`: > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the > model as well as reduce the `gpu_memory_utilisation` parameter. + + +## Multi-node vLLM + +It is entirely possible to use vLLM in a multi-node setting. For this, we will use Ray. +In these examples, we will assume that we are on a Slurm cluster where nodes do not have internet access. +Those scripts are heavily inspired by [https://github.com/NERSC/slurm-ray-cluster/](https://github.com/NERSC/slurm-ray-cluster/). + +First you need to start the Ray cluster. It has one master, and you need to find an available +port on this node + +```bash +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} # Name of master node +MASTER_IP=$(hostname --ip-address) # IP address of master node + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +# Start the master +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +# Start all other nodes :) +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi +``` + +Then, once the Ray cluster is running, you can launch vLLM through lighteval. + +```bash +set_VLLM_HOST_IP + +MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=16384,tensor_parallel_size=" +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE +``` + +The full script is available here: [https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) diff --git a/examples/slurm/multi_node_vllm.slurm b/examples/slurm/multi_node_vllm.slurm new file mode 100644 index 000000000..ddc193939 --- /dev/null +++ b/examples/slurm/multi_node_vllm.slurm @@ -0,0 +1,57 @@ +#! 
/bin/bash + +#SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct +#SBATCH --account=brb@h100 +#SBATCH --output=evaluation.log +#SBATCH --error=evaluation.log +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:4 +#SBATCH --hint=nomultithread +#SBATCH --constraint=h100 +#SBATCH --time=02:00:00 +#SBATCH --exclusive +#SBATCH --parsable + +set -e +set -x + +module purge +module load miniforge/24.9.0 +conda activate $WORKDIR/lighteval-h100 + +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} +MASTER_IP=$(hostname --ip-address) + +export HF_HOME=$WORKDIR/HF_HOME +export MODEL_DIRECTORY=$WORKDIR/HuggingFace_Models/meta-llama/Llama-3.2-1B-Instruct +export TASK_FILE=$WORKDIR/community_tasks/french_eval.py +export HF_HUB_OFFLINE=1 +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi + +set_VLLM_HOST_IP + +MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=8192,tensor_parallel_size=8" +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE From a5ad8b5f868f723f0e832fbd6e38a1baf4f0db54 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 14:31:30 +0100 Subject: [PATCH 4/4] Added documentation for lighteval endpoint with vllm serve backend and an example for the lost souls --- docs/source/use-vllm-as-backend.mdx | 62 +++++++++++++++ examples/slurm/multi_node_vllm_serve.slurm | 90 ++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 examples/slurm/multi_node_vllm_serve.slurm diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 42e421241..92af929ad 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -96,3 +96,65 @@ srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm ``` The full script is available here: [https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) + +## With vLLM Serve + +It is also possible to use the vLLM serve command to achieve a similar result. +It has the following benefits: can be queried by multiple jobs, can be launched only once when needing multiple evaluation, +has lower peak memory on rank 0. + +We also need to start the Ray cluster, the exact same way as before. However, now, +before calling lighteval, we need to start our vllm server. 
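+Before looking at the server command, it helps to know how lighteval will reach it: with the `OpenAIClient` change from the second patch, the `lighteval endpoint openai` backend builds its client from the `OPENAI_API_KEY` and `OPENAI_BASE_URL` environment variables, and because `tiktoken` does not know a locally served model name, the tokenizer is loaded from `TOKENIZER_PATH` (or from the model directory) instead. Below is a minimal sketch of that resolution logic, illustrative rather than lighteval's actual code, assuming the `openai` Python package is installed.
+
+```python
+import os
+
+from openai import OpenAI
+
+# When OPENAI_BASE_URL is unset, base_url is None and the client targets the
+# official OpenAI API; when it is set, every request goes to the local vLLM
+# server instead.
+client = OpenAI(
+    api_key=os.environ["OPENAI_API_KEY"],
+    base_url=os.getenv("OPENAI_BASE_URL"),  # e.g. the http://localhost:$SERVER_PORT/v1 value exported below
+)
+
+print(client.models.list())  # quick sanity check that the endpoint is reachable
+```
+
+With that in mind, we can start the server.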
+ +```bash +MODEL_NAME="Llama-3.2-1B-Instruct" +SERVER_PORT=$(find_available_port) +export OPENAI_API_KEY="I-love-vLLM" +export OPENAI_BASE_URL="http://localhost:$SERVER_PORT/v1" + +vllm serve $MODEL_DIRECTORY \ + --served-model-name $MODEL_NAME \ + --api-key $OPENAI_API_KEY \ + --enforce-eager \ + --port $SERVER_PORT \ + --tensor-parallel-size \ + --dtype bfloat16 \ + --max-model-len 16384 \ + --gpu-memory-utilization 0.8 \ + --disable-custom-all-reduce \ + 1>vllm.stdout 2>vllm.stderr & +``` + +In my case, I want the evaluation to be done within the same job, hence why vllm serve +is put in the background. Therefore, we need to wait until it is up & running before +launching lighteval + +```bash +ATTEMPT=0 +DELAY=5 +MAX_ATTEMPTS=60 # Might need to be increased in case of very large model +until curl -s -o /dev/null -w "%{http_code}" $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | grep -E "^2[0-9]{2}$"; do + ATTEMPT=$((ATTEMPT + 1)) + echo "$ATTEMPT attempts" + if [ "$ATTEMPT" -ge "$MAX_ATTEMPTS" ]; then + echo "Failed: the server did not respond any of the $MAX_ATTEMPTS requests." + exit 1 + fi + echo "vllm serve is not ready yet" + sleep $DELAY +done +``` + +Finally, the above script only finishes when vllm serve is ready, so we can launch +the evaluation. + +```bash +export TOKENIZER_PATH="$MODEL_DIRECTORY" + +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE +``` + +The full script is available here: +[https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm) diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm new file mode 100644 index 000000000..2b4a12eb7 --- /dev/null +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -0,0 +1,90 @@ +#! 
/bin/bash + +#SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct +#SBATCH --account=brb@h100 +#SBATCH --output=evaluation.log +#SBATCH --error=evaluation.log +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:4 +#SBATCH --hint=nomultithread +#SBATCH --constraint=h100 +#SBATCH --time=02:00:00 +#SBATCH --exclusive +#SBATCH --parsable + +set -e +set -x + +module purge +module load miniforge/24.9.0 +conda activate $WORKDIR/lighteval-h100 + +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} +MASTER_IP=$(hostname --ip-address) + +export HF_HOME=$WORKDIR/HF_HOME +export MODEL_DIRECTORY=$WORKDIR/HuggingFace_Models/meta-llama/Llama-3.2-1B-Instruct +export TASK_FILE=$WORKDIR/community_tasks/french_eval.py +export HF_HUB_OFFLINE=1 +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi + +set_VLLM_HOST_IP + +MODEL_NAME="Llama-3.2-1B-Instruct" +SERVER_PORT=$(find_available_port) +export OPENAI_API_KEY="I-love-vllm-serve" +export OPENAI_BASE_URL="http://localhost:$SERVER_PORT/v1" + +vllm serve $MODEL_DIRECTORY \ + --served-model-name $MODEL_NAME \ + --api-key $OPENAI_API_KEY \ + --enforce-eager \ + --port $SERVER_PORT \ + --tensor-parallel-size 8 \ + --dtype bfloat16 \ + --max-model-len 16384 \ + --gpu-memory-utilization 0.8 \ + --disable-custom-all-reduce \ + 1>vllm.stdout 2>vllm.stderr & + + +ATTEMPT=0 +DELAY=5 +MAX_ATTEMPTS=60 +until curl -s -o /dev/null -w "%{http_code}" $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | grep -E "^2[0-9]{2}$"; do + ATTEMPT=$((ATTEMPT + 1)) + echo "$ATTEMPT attempts" + if [ "$ATTEMPT" -ge "$MAX_ATTEMPTS" ]; then + echo "Failed: the server did not respond any of the $MAX_ATTEMPTS requests." + exit 1 + fi + echo "vllm serve is not ready yet" + sleep $DELAY +done + +export TOKENIZER_PATH="$MODEL_DIRECTORY" + +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE
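+
+# Note (illustrative addition, not part of the original script): vllm serve
+# exposes a standard OpenAI-compatible API, so the server started above can be
+# queried by other evaluation runs for as long as it stays up. Another run only
+# needs the same exports, with the address of the node running vllm serve in
+# place of localhost, for instance:
+#
+#   export OPENAI_API_KEY="I-love-vllm-serve"
+#   export OPENAI_BASE_URL="http://<address-of-the-serving-node>:$SERVER_PORT/v1"
+#   export TOKENIZER_PATH="$MODEL_DIRECTORY"
+#   lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE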