-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfabfile.py
179 lines (124 loc) · 5.37 KB
/
fabfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import tarfile
from datetime import datetime
import yaml
from fabric import task
from gitignore_parser import parse_gitignore
from pathlib import PurePosixPath, Path
# --- Docker image ---------------------------------------------------------
user = 'dockeruser'
docker_image_name = 'jac241/muscle_segmentation'
docker_build_cmd = 'docker build -t {} .'.format(docker_image_name)

# --- Directories on the remote host (POSIX paths under the user's home) ---
home_dir = PurePosixPath('/home', user)
dataset_dir = home_dir.joinpath('data', 'datasets')
output_dir = home_dir.joinpath('output')
predictions_dir = home_dir.joinpath('predictions')
training_output_dir = home_dir.joinpath('training_output')
# Where the app archive is uploaded, unpacked and built.
app_target_dir = home_dir.joinpath('test')

# --- Container-side mount points used by docker_run_cmd() -----------------
docker_app_dir = PurePosixPath('/opt/app')
docker_dataset_dir = docker_app_dir.joinpath('datasets')
docker_output_dir = docker_app_dir.joinpath('output')
docker_predictions_dir = docker_app_dir.joinpath('predictions')
docker_training_output_dir = docker_app_dir.joinpath('training_output')

# --- Local paths, relative to the directory fab is invoked from -----------
local_root = Path.cwd()
archive_path = local_root.joinpath('tmp', 'repo.tar.gz')
local_training_output_dir = local_root.joinpath('from_remote', 'training_output')
@task
def run_training(connection, run_name, gpuids=0, extra_python_args=""):
    """
    Build the image on the remote host, run training in it, and fetch the output.

    Install necessary packages first with: pip install -r deploy-requirements.txt

    USAGE:
    fab --prompt-for-login-password -H [USER]@[HOST] run-training --run_name=[RUN_NAME]

    Put command line arguments for run_training.py in a .args file at
    config/run/[RUN_NAME].args

    The training output is downloaded into from_remote/training_output, in a
    folder named [RUN_NAME]_(Time of Run).

    gpuids is passed to the container as NVIDIA_VISIBLE_DEVICES, and
    extra_python_args is appended verbatim to the run_training.py command line.
    """
    timestamped_run = get_run_dir_name(run_name)
    build_docker_image_on_remote(connection, timestamped_run)

    training_options = {
        'args_file_path': get_args_file_path(run_name),
        # Container-side directory the training script writes into.
        'run_output_dir': docker_training_output_dir / timestamped_run,
        'gpuids': gpuids,
        'extra_python_args': extra_python_args,
    }
    run_docker_image_for_training(connection, **training_options)

    retrieve_training_output(connection, run_dir_name=timestamped_run)
@task
def run_in_docker(connection, run_name, docker_cmd, gpuids=0):
    """
    Build the image on the remote host and run an arbitrary command in it.

    docker_cmd is the command executed inside the container; gpuids is passed
    to the container as NVIDIA_VISIBLE_DEVICES. The full docker command is
    printed before it is run.
    """
    build_docker_image_on_remote(connection, get_run_dir_name(run_name))

    full_command = docker_run_cmd(docker_cmd, gpuids=gpuids)
    print(full_command)
    connection.run(full_command)
def get_run_dir_name(run_name):
    """Return `run_name` suffixed with the current local time (to the microsecond)."""
    timestamp = format(datetime.now(), '%Y%m%d-%H%M%S%f')
    return '{}_{}'.format(run_name, timestamp)
def build_docker_image_on_remote(c, run_dir_name):
    """
    Ship the local app to the remote host and build the docker image there.

    Each step is announced on stdout before it runs. The steps run strictly in
    order, since each one depends on the previous one having finished.
    """
    print("Building image on remote")

    steps = (
        ("Making app target dir", lambda: make_app_target_dir(c)),
        ("zipping local app", zip_app_locally),
        ("copying app to remote", lambda: copy_app_to_remote(c)),
        ("unzipping app on remote", lambda: unzip_app_on_remote(c)),
        (
            "creating run output directory",
            lambda: create_run_output_dir(c, run_output_dirname=run_dir_name),
        ),
        ("running docker build command", lambda: run_docker_build_command(c)),
    )
    for announcement, perform_step in steps:
        print(announcement)
        perform_step()
def make_app_target_dir(c):
    """Create the remote app target directory (and any missing parents)."""
    mkdir_cmd = f'mkdir -p {app_target_dir}'
    c.run(mkdir_cmd)
def zip_app_locally():
    """
    Pack the current working directory into the gzipped tarball at `archive_path`.

    Entries for which exclude_select_dirs() returns true are left out; for a
    directory that also leaves out everything beneath it.
    """
    def _drop_excluded(tarinfo):
        # TarFile.add() skips a member (and does not recurse into it) when the
        # filter returns None. The `exclude=` keyword used previously was
        # removed in Python 3.7, so `filter=` is the supported hook.
        return None if exclude_select_dirs(tarinfo.name) else tarinfo

    # tarfile.open() does not create missing directories for the output file.
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    with tarfile.open(archive_path, 'w:gz') as archive:
        archive.add('./', filter=_drop_excluded)
def exclude_select_dirs(path):
    """
    Tell whether `path` should be left out of the app archive.

    A path is excluded when it is a hidden directory or when the .gitignore
    matcher accepts its absolute form. An IndexError raised while checking
    (is_hidden_dir raises one for a directory path with an empty final
    component, such as './') means "do not exclude".
    """
    try:
        if is_hidden_dir(path):
            return True
        absolute_path = Path(os.getcwd(), path)
        return is_in_gitignore(absolute_path)
    except IndexError:
        return False
def is_hidden_dir(path):
    """
    Return True when `path` is an existing directory whose name starts with '.'.

    Raises IndexError for a directory path whose final component is empty
    (for example one ending in a path separator).
    """
    if not os.path.isdir(path):
        return False
    final_component = os.path.basename(path)
    # Indexing (rather than startswith) keeps the IndexError on an empty name.
    return final_component[0] == '.'
# Matcher built once, at import time, from the `.gitignore` in the current
# working directory; exclude_select_dirs() calls it with absolute paths.
# NOTE(review): presumably returns a truthy value for paths matched by the
# ignore rules - confirm against the gitignore_parser documentation.
is_in_gitignore = parse_gitignore('.gitignore', base_dir=os.getcwd())
def copy_app_to_remote(c):
    """Upload the local app archive into the remote app target directory."""
    local_archive = str(archive_path)
    remote_destination = str(app_target_dir)
    c.put(local_archive, remote_destination)
def unzip_app_on_remote(c):
    """
    Unpack the uploaded archive inside the remote app target directory.

    The remote archive path is printed first; the archive is deleted after
    extraction.
    """
    archive_on_remote = app_target_dir / archive_path.name
    print(archive_on_remote)

    extract_cmd = f'tar -xzf {archive_on_remote} --directory {app_target_dir}'
    cleanup_cmd = f'rm {archive_on_remote}'
    for remote_cmd in (extract_cmd, cleanup_cmd):
        c.run(remote_cmd)
def create_run_output_dir(c, run_output_dirname):
    """Create this run's directory under the remote training output directory."""
    remote_run_dir = training_output_dir / run_output_dirname
    c.run(f'mkdir -p {remote_run_dir}')
def run_docker_build_command(c):
    """Run the docker build command from inside the remote app target directory."""
    build_in_app_dir = f'cd {app_target_dir} && {docker_build_cmd}'
    c.run(build_in_app_dir)
def get_args_file_path(run_name):
    """Return the relative POSIX path of the .args file for `run_name`."""
    args_filename = f'{run_name}.args'
    return PurePosixPath('config') / 'run' / args_filename
def run_docker_image_for_training(c, args_file_path, run_output_dir, gpuids, extra_python_args):
    """
    Run run_training.py inside the docker image on the remote host.

    The script receives `@args_file_path`, `--output_dir=run_output_dir`, and
    then `extra_python_args` appended verbatim.
    """
    training_invocation = ' '.join([
        'python run_training.py',
        f'@{args_file_path}',
        f'--output_dir={run_output_dir}',
        f'{extra_python_args}',
    ])
    c.run(docker_run_cmd(training_invocation, gpuids=gpuids))
def docker_run_cmd(cmd, gpuids):
    """
    Build the `docker run` command line that executes `cmd` in the app image.

    The container runs as the invoking user's uid:gid with the nvidia runtime,
    is removed on exit, sees the GPUs named by `gpuids`, and has the remote
    dataset, output, predictions and training output directories mounted at
    their container-side locations. The returned string ends with a space.
    """
    volume_mounts = (
        (dataset_dir, docker_dataset_dir),
        (output_dir, docker_output_dir),
        (predictions_dir, docker_predictions_dir),
        (training_output_dir, docker_training_output_dir),
    )
    pieces = [
        'docker run -u $(id -u):$(id -g) --runtime=nvidia --rm',
        f'-e NVIDIA_VISIBLE_DEVICES={gpuids}',
    ]
    pieces.extend(f'-v {host_dir}:{container_dir}' for host_dir, container_dir in volume_mounts)
    pieces.append(f'-t {docker_image_name}')
    pieces.append(f'{cmd}')
    return ' '.join(pieces) + ' '
def retrieve_training_output(c, run_dir_name):
    """
    Download one run's training output from the remote host and unpack it locally.

    The run directory is tarred up on the remote next to itself, downloaded
    into `local_training_output_dir`, extracted there, and the downloaded
    tarball is then deleted. The tarball on the remote is left in place.
    """
    tarball_name = f'{run_dir_name}.tar.gz'

    remote_tar_file_path = training_output_dir / tarball_name
    # -C keeps the archive members relative to the training output directory,
    # so extraction recreates just `<run_dir_name>/...`.
    c.run(f'tar -czvf {remote_tar_file_path} -C {training_output_dir} {run_dir_name}')

    ensure_dir_exists(local_training_output_dir)
    local_tarfile_path = local_training_output_dir / tarball_name
    # Fabric's transfer API is documented in terms of string paths (and newer
    # releases call str methods on `local`), so convert the path objects
    # explicitly, as copy_app_to_remote() does for put().
    c.get(str(remote_tar_file_path), str(local_tarfile_path))

    with tarfile.open(local_tarfile_path, 'r:gz') as archive:
        archive.extractall(path=local_training_output_dir)
    os.remove(local_tarfile_path)
def ensure_dir_exists(path):
    """
    Create the directory `path`, including any missing parents.

    Does nothing when the directory already exists.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # os.mkdir() fails when a parent directory is missing; makedirs creates
    # the whole chain and, with exist_ok=True, tolerates an existing directory.
    os.makedirs(path, exist_ok=True)