
Commit

feat
l1997i committed Apr 25, 2024
1 parent aa20dbc commit 9b61736
Showing 15 changed files with 350 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -131,3 +131,5 @@ dmypy.json

# Pyre type checker
.pyre/

.pytorch
21 changes: 21 additions & 0 deletions install_hpc_helper_v0_1_1.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Define the URL of the zip file
ZIP_URL="https://github.com/l1997i/slurm_hpc_helper/releases/download/v0.1.1/hpc_helper_v0_1_1.zip"
ZIP_FILE="hpc_helper_v0_1_1.zip"
DIR_NAME="hpc_helper_v0_1_1"

# Download the zip file
curl -L "$ZIP_URL" -o "$ZIP_FILE"

# Unzip the file into the current directory
unzip "$ZIP_FILE"

# Navigate into the directory
cd "$DIR_NAME" || exit 1

# Make the install script executable
chmod +x install.sh

# Run the install script
./install.sh
Binary file added release/bin/code
Binary file added release/bin/server
7 changes: 7 additions & 0 deletions release/config.json
@@ -0,0 +1,7 @@
{
"defaults":{
"home_folder": "/home2/mznv82",
"name":"train"
},
"port":15001
}
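For illustration only, a minimal sketch of how these settings could be consumed (the actual loader inside the hpc_helper binary is not shown in this commit; the file name and keys are taken from the JSON above, everything else is an assumption):

import json

# Hypothetical reader for release/config.json (the real loader is not part of this diff)
with open("config.json") as f:
    cfg = json.load(f)

home_folder = cfg["defaults"]["home_folder"]  # rewritten by install.sh to the user's home
job_name = cfg["defaults"]["name"]            # default job name, "train"
port = cfg["port"]                            # port the helper listens on, 15001
print(home_folder, job_name, port)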
Binary file added release/hpc_helper
38 changes: 38 additions & 0 deletions release/install.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# Ask for user input
read -p "Enter your password: " user_password
read -p "Enter your email (e.g., [email protected]): " user_email
read -p "Enter your home directory (e.g., /home2/mznv82): " user_home



# Replace the email in the specified files
sed -i "s/[email protected]/$user_email/g" src/templates/bash/code_tunnel.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/sshd.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/final_stage.sh

# Replace the home directory in the config.json file
sed -i "s|/home2/mznv82|$user_home|g" config.json

# Set up the local bin directory and copy executables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
mkdir -p ~/.local/bin
cp -r bin/* ~/.local/bin
chmod +x ~/.local/bin/code
chmod +x ~/.local/bin/server
echo "export PATH=\"$DIR:\$PATH\"" >> ~/.bashrc

# Install stage 2 PyTorch environment
mkdir -p .pytorch
cd .pytorch || exit
python3 -m venv .
source bin/activate
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
pip install Werkzeug~=2.0.0

# Return to the release directory (we are still inside .pytorch), then reset the password
cd "$DIR"
python reset_password.py "$user_password"

# Notify the user of completion
echo "Installation and configuration complete."
9 changes: 9 additions & 0 deletions release/reset_password.py
@@ -0,0 +1,9 @@
import sys, os
from werkzeug.security import generate_password_hash

pw = sys.argv[1]
salt = '123456uwu654321'
hash = generate_password_hash(salt+pw)

with open(f'{os.getcwd()}/password.txt', 'w') as f:
    f.write(hash)
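The check that consumes password.txt is not part of this commit; as a hedged sketch, verification would presumably mirror the hashing above (same hard-coded salt, werkzeug's check_password_hash):

from werkzeug.security import check_password_hash

SALT = '123456uwu654321'  # must match the salt used in reset_password.py

def verify_password(candidate, hash_file='password.txt'):
    # Hypothetical helper: True if candidate matches the stored salted hash
    with open(hash_file) as f:
        stored_hash = f.read().strip()
    return check_password_hash(stored_hash, SALT + candidate)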
17 changes: 17 additions & 0 deletions release/src/templates/bash/code_tunnel.sh
@@ -0,0 +1,17 @@
################### Mail and code tunnel <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
mkdir -p ~/.jobs/${date}_${SLURM_JOB_ID}
mkdir -p .logs/runner/${SLURM_JOB_ID}
nohup code tunnel --accept-server-license-terms --name "${host}-${SLURM_JOB_ID}" --cli-data-dir ~/.jobs/${date}_${SLURM_JOB_ID} > .logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}.
Online dev: https://vscode.dev/tunnel/${host}-${SLURM_JOB_ID}$(pwd) Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
To get the code, please run the command remotely:
> more $(pwd)/.logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log | grep -o 'code [A-Z0-9-]*' | tail -n 1
Then, log into https://github.com/login/device to grant the server access.
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
11 changes: 11 additions & 0 deletions release/src/templates/bash/final_stage.sh
@@ -0,0 +1,11 @@
################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
source .pytorch/bin/activate
module load cuda/11.3
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $DIR/../py
pwd
#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
python3 pytorch_stage2.py &
final_pid=$!
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
wait ${final_pid}
14 changes: 14 additions & 0 deletions release/src/templates/bash/sshd.sh
@@ -0,0 +1,14 @@
################### Mail and sshd <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
nohup /usr/sbin/sshd -D -p ${PORT} -f /dev/null -h ~/.ssh/id_rsa > /dev/null 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}, Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
[SSH] To get access to the node, please run the command locally:
> ssh -J $(whoami)@ncc1.clients.dur.ac.uk $(whoami)@${host} -p ${PORT} -i ~/.ssh/id_rsa
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
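The PORT one-liner above asks the kernel for a free ephemeral port by binding to port 0. Expanded for readability (equivalent logic, illustration only):

import socket

# Bind to port 0 so the OS assigns any free port, read it back, then release the socket.
# The port is only reserved while the socket is open, so another process could in
# principle claim it before sshd binds it.
s = socket.socket()
s.bind(("", 0))
port = s.getsockname()[1]
s.close()
print(port)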
69 changes: 69 additions & 0 deletions release/src/templates/py/pytorch_stage2.py
@@ -0,0 +1,69 @@
from torchvision import models
import torch
import numpy
import sys
import time

# Ensures that torch uses GPU if available for all operations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly to GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data and allocate it directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with actual computation that needs to run.
            # Remove the loop or add a condition to exit the loop when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)

82 changes: 82 additions & 0 deletions release/src/templates/py/run_script.py
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import subprocess
import sys
import os
import json

try:
    from setproctitle import setproctitle
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "setproctitle"])
    from setproctitle import setproctitle  # Retry the import after installation

slurm_job_id = os.environ.get('SLURM_JOB_ID')


def run_script(script_path, title, output_file=None):
    stage_id = title.split('_')[-1]
    setproctitle(title)
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(["bash", script_path], stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(["bash", script_path])

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    process.wait()
    return process.pid


def run_script_with_out(script_path, title, output_file):
    stage_id = title.split('_')[-1]
    # Note: This changes the title of the Python script itself, not the subprocess
    setproctitle(title)

    command = ['bash', '-c', f'exec -a "{title}" bash {script_path}']
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(command, stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(command)

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    return process.pid



def pid2json(pid, stage_id):
    current_path = os.getcwd()
    hpc_gui_path = os.environ.get('HPC_GUI_PATH')
    json_path = os.path.join(current_path, ".logs/job_scripts/{}/job_info.json".format(slurm_job_id))
    job_path = os.path.join(hpc_gui_path, "data/jobs.json")
    write_job_json(pid, stage_id, job_path)
    write_json(pid, stage_id, json_path)


def write_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


def write_job_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[slurm_job_id][key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


if __name__ == "__main__":
    script_path = sys.argv[1]
    title = sys.argv[2]
    if len(sys.argv) == 4:
        output_file = sys.argv[3]
        run_script_with_out(script_path, title, output_file)
    else:
        run_script(script_path, title)
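write_json and write_job_json both open their JSON files for reading first, so they assume the files already exist with valid JSON; how those files are created is not shown in this commit. A minimal bootstrap sketch for the per-job file (path taken from pid2json, everything else an assumption):

import json, os

def init_job_info(slurm_job_id):
    # Hypothetical helper: create an empty job_info.json so write_json() can update it later
    path = os.path.join(os.getcwd(), ".logs/job_scripts/{}/job_info.json".format(slurm_job_id))
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not os.path.exists(path):
        with open(path, 'w') as f:
            json.dump({}, f)
    return path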
15 changes: 8 additions & 7 deletions src/templates/bash/final_stage.sh
@@ -1,12 +1,13 @@
################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
source /etc/profile
source ~/anaconda3/etc/profile.d/conda.sh
module load cuda/11.1
conda activate virconv
cd ~/second-stage/tools
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $DIR/../../../
pwd
rm -rf ~/VirConv/output/models/kitti/VirConv-T-SECOND-STAGE/
python3 train.py --cfg_file cfgs/models/kitti/VirConv-T-SECOND-STAGE.yaml
source .pytorch/bin/activate
module load cuda/11.3
cd $DIR/../py
pwd
#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
python3 pytorch_stage2.py &
final_pid=$!
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
wait ${final_pid}
72 changes: 72 additions & 0 deletions src/templates/py/pytorch_stage2.py
@@ -0,0 +1,72 @@
from torchvision import models
import torch
import numpy
import sys
import time

torch.set_num_threads(4)
torch.set_num_interop_threads(4)

# Ensures that torch uses GPU if available for all operations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly to GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data and allocate it directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with actual computation that needs to run.
            # Remove the loop or add a condition to exit the loop when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)
