
Commit

feat
l1997i committed Apr 25, 2024
1 parent aa20dbc commit 9b61736
Showing 15 changed files with 350 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -131,3 +131,5 @@ dmypy.json

# Pyre type checker
.pyre/

.pytorch
21 changes: 21 additions & 0 deletions install_hpc_helper_v0_1_1.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Define the URL of the zip file
ZIP_URL="https://github.com/l1997i/slurm_hpc_helper/releases/download/v0.1.1/hpc_helper_v0_1_1.zip"
ZIP_FILE="hpc_helper_v0_1_1.zip"
DIR_NAME="hpc_helper_v0_1_1"

# Download the zip file
curl -L "$ZIP_URL" -o "$ZIP_FILE"

# Unzip the file into the current directory
unzip "$ZIP_FILE"

# Navigate into the directory
cd "$DIR_NAME" || exit 1

# Make the install script executable
chmod +x install.sh

# Run the install script
./install.sh
Binary file added release/bin/code
Binary file added release/bin/server
7 changes: 7 additions & 0 deletions release/config.json
@@ -0,0 +1,7 @@
{
"defaults":{
"home_folder": "/home2/mznv82",
"name":"train"
},
"port":15001
}
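For illustration only, a minimal sketch of how these settings could be consumed (the actual loader inside the hpc_helper binary is not shown in this commit; the file name and keys are taken from the JSON above, everything else is an assumption):

import json

# Hypothetical reader for release/config.json (the real loader is not part of this diff)
with open("config.json") as f:
    cfg = json.load(f)

home_folder = cfg["defaults"]["home_folder"]  # rewritten by install.sh to the user's home
job_name = cfg["defaults"]["name"]            # default job name, "train"
port = cfg["port"]                            # port the helper listens on, 15001
print(home_folder, job_name, port)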
Binary file added release/hpc_helper
38 changes: 38 additions & 0 deletions release/install.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# Ask for user input
read -p "Enter your password: " user_password
read -p "Enter your email (e.g., [email protected]): " user_email
read -p "Enter your home directory (e.g., /home2/mznv82): " user_home



# Replace the email in the specified files
sed -i "s/[email protected]/$user_email/g" src/templates/bash/code_tunnel.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/sshd.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/final_stage.sh

# Replace the home directory in the config.json file
sed -i "s|/home2/mznv82|$user_home|g" config.json

# Set up the local bin directory and copy executables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
mkdir -p ~/.local/bin
cp -r bin/* ~/.local/bin
chmod +x ~/.local/bin/code
chmod +x ~/.local/bin/server
echo "export PATH=\"$DIR:\$PATH\"" >> ~/.bashrc

# Install stage 2 PyTorch environment
mkdir -p .pytorch
cd .pytorch || exit
python3 -m venv .
source bin/activate
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
pip install Werkzeug~=2.0.0

# Return to the release directory (we are still inside .pytorch), then reset the password
cd "$DIR"
python reset_password.py "$user_password"

# Notify the user of completion
echo "Installation and configuration complete."
9 changes: 9 additions & 0 deletions release/reset_password.py
@@ -0,0 +1,9 @@
import sys, os
from werkzeug.security import generate_password_hash

pw = sys.argv[1]
salt = '123456uwu654321'
hash = generate_password_hash(salt+pw)

with open(f'{os.getcwd()}/password.txt', 'w') as f:
    f.write(hash)
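The check that consumes password.txt is not part of this commit; as a hedged sketch, verification would presumably mirror the hashing above (same hard-coded salt, werkzeug's check_password_hash):

from werkzeug.security import check_password_hash

SALT = '123456uwu654321'  # must match the salt used in reset_password.py

def verify_password(candidate, hash_file='password.txt'):
    # Hypothetical helper: True if candidate matches the stored salted hash
    with open(hash_file) as f:
        stored_hash = f.read().strip()
    return check_password_hash(stored_hash, SALT + candidate)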
17 changes: 17 additions & 0 deletions release/src/templates/bash/code_tunnel.sh
@@ -0,0 +1,17 @@
################### Mail and code tunnel <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
mkdir -p ~/.jobs/${date}_${SLURM_JOB_ID}
mkdir -p .logs/runner/${SLURM_JOB_ID}
nohup code tunnel --accept-server-license-terms --name "${host}-${SLURM_JOB_ID}" --cli-data-dir ~/.jobs/${date}_${SLURM_JOB_ID} > .logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}.
Online dev: https://vscode.dev/tunnel/${host}-${SLURM_JOB_ID}$(pwd) Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
To get the code, please run the command remotely:
> more $(pwd)/.logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log | grep -o 'code [A-Z0-9-]*' | tail -n 1
Then, log into https://github.com/login/device to grant the server access.
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
11 changes: 11 additions & 0 deletions release/src/templates/bash/final_stage.sh
@@ -0,0 +1,11 @@
################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
source .pytorch/bin/activate
module load cuda/11.3
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $DIR/../py
pwd
#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
python3 pytorch_stage2.py &
final_pid=$!
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
wait ${final_pid}
14 changes: 14 additions & 0 deletions release/src/templates/bash/sshd.sh
@@ -0,0 +1,14 @@
################### Mail and sshd <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
nohup /usr/sbin/sshd -D -p ${PORT} -f /dev/null -h ~/.ssh/id_rsa > /dev/null 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}, Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
[SSH] To get access to the node, please run the command locally:
> ssh -J $(whoami)@ncc1.clients.dur.ac.uk $(whoami)@${host} -p ${PORT} -i ~/.ssh/id_rsa
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
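The PORT one-liner above asks the kernel for a free ephemeral port by binding to port 0. Expanded for readability (equivalent logic, illustration only):

import socket

# Bind to port 0 so the OS assigns any free port, read it back, then release the socket.
# The port is only reserved while the socket is open, so another process could in
# principle claim it before sshd binds it.
s = socket.socket()
s.bind(("", 0))
port = s.getsockname()[1]
s.close()
print(port)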
69 changes: 69 additions & 0 deletions release/src/templates/py/pytorch_stage2.py
@@ -0,0 +1,69 @@
from torchvision import models
import torch
import numpy
import sys
import time

# Ensures that torch uses GPU if available for all operations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly to GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data and allocate it directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with actual computation that needs to run.
            # Remove the loop or add a condition to exit the loop when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)

82 changes: 82 additions & 0 deletions release/src/templates/py/run_script.py
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import subprocess
import sys
import os
import json

try:
    from setproctitle import setproctitle
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "setproctitle"])
    from setproctitle import setproctitle  # Retry the import after installation

slurm_job_id = os.environ.get('SLURM_JOB_ID')


def run_script(script_path, title, output_file=None):
    stage_id = title.split('_')[-1]
    setproctitle(title)
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(["bash", script_path], stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(["bash", script_path])

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    process.wait()
    return process.pid


def run_script_with_out(script_path, title, output_file):
    stage_id = title.split('_')[-1]
    # Note: This changes the title of the Python script itself, not the subprocess
    setproctitle(title)

    command = ['bash', '-c', f'exec -a "{title}" bash {script_path}']
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(command, stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(command)

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    return process.pid



def pid2json(pid, stage_id):
    current_path = os.getcwd()
    hpc_gui_path = os.environ.get('HPC_GUI_PATH')
    json_path = os.path.join(current_path, ".logs/job_scripts/{}/job_info.json".format(slurm_job_id))
    job_path = os.path.join(hpc_gui_path, "data/jobs.json")
    write_job_json(pid, stage_id, job_path)
    write_json(pid, stage_id, json_path)


def write_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


def write_job_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[slurm_job_id][key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


if __name__ == "__main__":
    script_path = sys.argv[1]
    title = sys.argv[2]
    if len(sys.argv) == 4:
        output_file = sys.argv[3]
        run_script_with_out(script_path, title, output_file)
    else:
        run_script(script_path, title)
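write_json and write_job_json both open their JSON files for reading first, so they assume the files already exist with valid JSON; how those files are created is not shown in this commit. A minimal bootstrap sketch for the per-job file (path taken from pid2json, everything else an assumption):

import json, os

def init_job_info(slurm_job_id):
    # Hypothetical helper: create an empty job_info.json so write_json() can update it later
    path = os.path.join(os.getcwd(), ".logs/job_scripts/{}/job_info.json".format(slurm_job_id))
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not os.path.exists(path):
        with open(path, 'w') as f:
            json.dump({}, f)
    return path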
15 changes: 8 additions & 7 deletions src/templates/bash/final_stage.sh
@@ -1,12 +1,13 @@
################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
source /etc/profile
source ~/anaconda3/etc/profile.d/conda.sh
module load cuda/11.1
conda activate virconv
cd ~/second-stage/tools
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $DIR/../../../
pwd
rm -rf ~/VirConv/output/models/kitti/VirConv-T-SECOND-STAGE/
python3 train.py --cfg_file cfgs/models/kitti/VirConv-T-SECOND-STAGE.yaml
source .pytorch/bin/activate
module load cuda/11.3
cd $DIR/../py
pwd
#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
python3 pytorch_stage2.py &
final_pid=$!
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
wait ${final_pid}
72 changes: 72 additions & 0 deletions src/templates/py/pytorch_stage2.py
@@ -0,0 +1,72 @@
from torchvision import models
import torch
import numpy
import sys
import time

torch.set_num_threads(4)
torch.set_num_interop_threads(4)

# Ensures that torch uses GPU if available for all operations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly to GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data and allocate it directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with actual computation that needs to run.
            # Remove the loop or add a condition to exit the loop when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)
