Showing 15 changed files with 350 additions and 7 deletions.
@@ -131,3 +131,5 @@ dmypy.json

# Pyre type checker
.pyre/

.pytorch
@@ -0,0 +1,21 @@
#!/bin/bash

# Define the URL of the zip file
ZIP_URL="https://github.com/l1997i/slurm_hpc_helper/releases/download/v0.1.1/hpc_helper_v0_1_1.zip"
ZIP_FILE="hpc_helper_v0_1_1.zip"
DIR_NAME="hpc_helper_v0_1_1"

# Download the zip file
curl -L "$ZIP_URL" -o "$ZIP_FILE"

# Unzip the file into the current directory
unzip "$ZIP_FILE"

# Navigate into the directory
cd "$DIR_NAME" || exit

# Make the install script executable
chmod +x install.sh

# Run the install script
./install.sh
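For anyone bootstrapping from this script, it may be worth verifying the archive before unzipping; a minimal Python sketch, assuming a trusted SHA-256 digest recorded out of band (EXPECTED_SHA256 below is a placeholder, not a published value):

import hashlib
import sys

# Placeholder digest: substitute the value you recorded for hpc_helper_v0_1_1.zip
EXPECTED_SHA256 = "0000000000000000000000000000000000000000000000000000000000000000"

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so large archives do not need to fit in memory
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

if sha256_of("hpc_helper_v0_1_1.zip") != EXPECTED_SHA256:
    sys.exit("Checksum mismatch: refusing to unzip")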
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
{
    "defaults": {
        "home_folder": "/home2/mznv82",
        "name": "train"
    },
    "port": 15001
}
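For context, this appears to be the config.json that install.sh patches below; a minimal sketch of how such a file might be consumed, assuming a plain json.load on the consuming side (load_config is illustrative, not a function from this repository):

import json

def load_config(path="config.json"):
    # Read the defaults and port written/patched by install.sh
    with open(path) as f:
        return json.load(f)

cfg = load_config()
print(cfg["defaults"]["home_folder"])  # /home2/mznv82 until install.sh rewrites it
print(cfg["port"])                     # 15001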
Binary file not shown.
@@ -0,0 +1,38 @@
#!/bin/bash

# Ask for user input
read -s -p "Enter your password: " user_password; echo  # -s keeps the password off the terminal
read -p "Enter your email (e.g., [email protected]): " user_email
read -p "Enter your home directory (e.g., /home2/mznv82): " user_home

# Replace the email in the specified files
sed -i "s/[email protected]/$user_email/g" src/templates/bash/code_tunnel.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/sshd.sh
sed -i "s/[email protected]/$user_email/g" src/templates/bash/final_stage.sh

# Replace the home directory in the config.json file
sed -i "s|/home2/mznv82|$user_home|g" config.json

# Set up the local bin directory and copy executables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
mkdir -p ~/.local/bin
cp -r bin/* ~/.local/bin
chmod +x ~/.local/bin/code
chmod +x ~/.local/bin/server
echo "export PATH=\"$DIR:\$PATH\"" >> ~/.bashrc

# Install stage 2 PyTorch environment
mkdir -p .pytorch
cd .pytorch || exit
python3 -m venv .
source bin/activate
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
pip install Werkzeug~=2.0.0

# Run the python script to reset the password
python reset_password.py "$user_password"

# Notify the user of completion
echo "Installation and configuration complete."
@@ -0,0 +1,9 @@
import os
import sys
from werkzeug.security import generate_password_hash

pw = sys.argv[1]
salt = '123456uwu654321'
pw_hash = generate_password_hash(salt + pw)  # avoid shadowing the built-in hash()

with open(f'{os.getcwd()}/password.txt', 'w') as f:
    f.write(pw_hash)
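The hash written to password.txt can later be checked with Werkzeug's check_password_hash; a minimal sketch of the verifying side, assuming the same salt and file location as above (verify_password itself is illustrative):

from werkzeug.security import check_password_hash

SALT = '123456uwu654321'  # must match the salt in reset_password.py

def verify_password(candidate, hash_path='password.txt'):
    with open(hash_path) as f:
        stored_hash = f.read().strip()
    # The file was created from generate_password_hash(salt + pw),
    # so the same concatenation is checked here
    return check_password_hash(stored_hash, SALT + candidate)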
@@ -0,0 +1,17 @@
################### Mail and code tunnel <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
mkdir -p ~/.jobs/${date}_${SLURM_JOB_ID}
mkdir -p .logs/runner/${SLURM_JOB_ID}
nohup code tunnel --accept-server-license-terms --name "${host}-${SLURM_JOB_ID}" --cli-data-dir ~/.jobs/${date}_${SLURM_JOB_ID} > .logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}.
Online dev: https://vscode.dev/tunnel/${host}-${SLURM_JOB_ID}$(pwd) Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
To get the code, please run the command remotely:
> more $(pwd)/.logs/runner/${SLURM_JOB_ID}/runner-${SLURM_JOB_ID}.log | grep -o 'code [A-Z0-9-]*' | tail -n 1
Then, log into https://github.com/login/device to grant the server access.
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
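The grep pipeline in the mail body recovers the one-time device code from the tunnel log; an equivalent sketch in Python, in case the log ever needs to be parsed programmatically (the path is built the same way as above):

import re

def latest_device_code(log_path):
    # Mirrors the shell pipeline: grep -o 'code [A-Z0-9-]*' | tail -n 1
    with open(log_path) as f:
        matches = re.findall(r'code [A-Z0-9-]+', f.read())
    return matches[-1] if matches else None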
@@ -0,0 +1,11 @@
################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
source .pytorch/bin/activate
module load cuda/11.3
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd "$DIR/../py"
pwd
#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
python3 pytorch_stage2.py
final_pid=$!  # note: $! holds the PID of the last *background* job; the foreground run above does not set it
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
@@ -0,0 +1,14 @@
################### Mail and sshd <<<<<<<<<<<<<<<<<<<<<<<<<
receipt_addr="[email protected]"
host=$(hostname -s)
date=$(date "+%Y%m%d%H%M%S")
PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
nohup /usr/sbin/sshd -D -p ${PORT} -f /dev/null -h ~/.ssh/id_rsa > /dev/null 2>&1 &
server_pid=$!
content="Slurm job name: ${SLURM_JOB_NAME}, Server PID: ${server_pid}.
Slurm jobs overview: http://ncc.clients.dur.ac.uk/grafana/d/5UwAWAzWk/slurm-jobs?orgId=1&var-user=$(whoami)&var-job=All&from=now-6h&to=now.
[SSH] To get access to the node, please run the command locally:
> ssh -J $(whoami)@ncc1.clients.dur.ac.uk $(whoami)@${host} -p ${PORT} -i ~/.ssh/id_rsa
$(nvidia-smi)"
echo "${content}"
server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} is running" "${content}" > /dev/null 2>&1
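The PORT one-liner above works by binding to port 0, which makes the kernel hand back an unused ephemeral port; the same trick as a reusable function, for reference:

import socket

def free_port():
    # Binding to port 0 asks the OS to choose an unused ephemeral port
    s = socket.socket()
    s.bind(("", 0))
    port = s.getsockname()[1]
    s.close()
    return port

Note there is a small race inherent to the trick: the port is released before sshd rebinds it, so a collision is possible, though unlikely on a lightly loaded node.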
@@ -0,0 +1,69 @@
from torchvision import models
import numpy
import torch
import sys
import time

# Ensure that torch uses the GPU for all operations when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear the memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly onto the GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data, allocated directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with the actual computation that needs to run.
            # Remove the loop or add an exit condition when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)
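One way to resolve the TODO above is to loop until the scheduler asks the job to stop rather than forever; a minimal sketch using a SIGTERM handler (the flag and handler are illustrative, and would replace the while True block inside the script, where alexnet, x, and time are already defined):

import signal

stop_requested = False

def handle_term(signum, frame):
    # Slurm typically sends SIGTERM before killing a job, so exit cleanly on it
    global stop_requested
    stop_requested = True

signal.signal(signal.SIGTERM, handle_term)

while not stop_requested:
    y = alexnet(x)   # same forward pass as in the script above
    time.sleep(1)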
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import subprocess
import sys
import os
import json

try:
    from setproctitle import setproctitle
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "setproctitle"])
    from setproctitle import setproctitle  # Retry the import after installation

slurm_job_id = os.environ.get('SLURM_JOB_ID')


def run_script(script_path, title, output_file=None):
    stage_id = title.split('_')[-1]
    setproctitle(title)
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(["bash", script_path], stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(["bash", script_path])

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    process.wait()
    return process.pid


def run_script_with_out(script_path, title, output_file):
    stage_id = title.split('_')[-1]
    # Note: this changes the title of the Python script itself, not the subprocess
    setproctitle(title)

    # exec -a sets argv[0] of the spawned bash, so the subprocess shows up under the title too
    command = ['bash', '-c', f'exec -a "{title}" bash {script_path}']
    if output_file:
        with open(output_file, 'w') as file:
            process = subprocess.Popen(command, stdout=file, stderr=subprocess.STDOUT)
    else:
        process = subprocess.Popen(command)

    print("[Stage {}] Started process PID: {}".format(stage_id, process.pid))
    return process.pid


def pid2json(pid, stage_id):
    current_path = os.getcwd()
    hpc_gui_path = os.environ.get('HPC_GUI_PATH')
    json_path = os.path.join(current_path, ".logs/job_scripts/{}/job_info.json".format(slurm_job_id))
    job_path = os.path.join(hpc_gui_path, "data/jobs.json")
    write_job_json(pid, stage_id, job_path)
    write_json(pid, stage_id, json_path)


def write_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


def write_job_json(pid, stage_id, json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    key = 'pid_' + stage_id
    data[slurm_job_id][key] = pid
    with open(json_path, 'w') as file:
        json.dump(data, file)


if __name__ == "__main__":
    script_path = sys.argv[1]
    title = sys.argv[2]
    if len(sys.argv) == 4:
        output_file = sys.argv[3]
        run_script_with_out(script_path, title, output_file)
    else:
        run_script(script_path, title)
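The job_info.json that pid2json maintains can be read back to see which PID belongs to which stage; a minimal sketch (the path mirrors the one used in pid2json, and SLURM_JOB_ID must be set in the environment):

import json
import os

job_id = os.environ["SLURM_JOB_ID"]
info_path = os.path.join(os.getcwd(), ".logs/job_scripts/{}/job_info.json".format(job_id))

with open(info_path) as f:
    info = json.load(f)

# write_json() stores entries under 'pid_<stage_id>'
for key, value in info.items():
    if key.startswith("pid_"):
        print(key, value)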
@@ -1,12 +1,13 @@
 ################### Final Stage <<<<<<<<<<<<<<<<<<<<<<<<<
 receipt_addr="[email protected]"
-source /etc/profile
-source ~/anaconda3/etc/profile.d/conda.sh
-module load cuda/11.1
-conda activate virconv
-cd ~/second-stage/tools
+source .pytorch/bin/activate
+module load cuda/11.3
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-cd $DIR/../../../
-pwd
-rm -rf ~/VirConv/output/models/kitti/VirConv-T-SECOND-STAGE/
-python3 train.py --cfg_file cfgs/models/kitti/VirConv-T-SECOND-STAGE.yaml
+cd $DIR/../py
+pwd
+#nohup python3 pytorch_stage2.py > /dev/null 2>&1 &
+python3 pytorch_stage2.py
+final_pid=$!
+server ${receipt_addr} "[NCC: ${host}] job #${SLURM_JOB_ID} +2 stage is running" "PID 2: ${final_pid}" > /dev/null 2>&1
@@ -0,0 +1,72 @@
from torchvision import models
import numpy
import torch
import sys
import time

torch.set_num_threads(4)
torch.set_num_interop_threads(4)

# Ensure that torch uses the GPU for all operations when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gpu_memory():
    torch.cuda.empty_cache()  # Clear the memory cache
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    available_memory = total_memory - reserved_memory
    return reserved_memory, available_memory

def occupy_gpu_memory(size_in_gb):
    num_elements = int(size_in_gb * (1024 ** 3) / 4)  # Each float32 element takes 4 bytes
    dummy_tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
    return dummy_tensor

if __name__ == "__main__":

    print("We are using PyTorch: " + torch.__version__)
    print("Number of GPUs available:", torch.cuda.device_count())
    print()
    if device.type == 'cuda':
        print("The first GPU available is:", torch.cuda.get_device_name(0))
        print()

        print("Testing PyTorch with GPU ....")

        # Load the pretrained AlexNet model directly onto the GPU
        alexnet = models.alexnet(pretrained=True).to(device)
        print("alexnet init...")

        # Create a random tensor to simulate input data, allocated directly on the GPU
        x = torch.randn(1, 3, 227, 227, device=device)
        print("prepare input feature...")

        try:
            # Perform a forward pass with the model
            y = alexnet(x)
            print("compute y...")

            # Occupy 75% of the available GPU memory, leaving a buffer of 1.8 GB
            reserved_memory, available_memory = get_gpu_memory()
            total_memory = reserved_memory + available_memory
            print(f"Total GPU Memory: {total_memory / 1e9} GB")
            print(f"Reserved Memory: {reserved_memory / 1e9} GB")
            print(f"Available Memory: {available_memory / 1e9} GB")

            dummy_tensor = occupy_gpu_memory(available_memory / 1e9 * 0.75 - 1.8)

            # TODO: Replace the infinite loop with the actual computation that needs to run.
            # Remove the loop or add an exit condition when needed.
            while True:
                y = alexnet(x)
                time.sleep(1)  # could be used to simulate processing time
        except Exception as e:
            print("GPU computation *** FAILURE ***.")
            print(str(e))
            print()
    else:
        print("CUDA is not available. No GPU will be used.")

    # Output versions for numpy and Python
    print("We are using numpy:", numpy.__version__)
    print("This is Python:", sys.version)