resnet20_cifar10_shuffled_moniqua_tuning.py
#!/usr/bin/env python3
import os
from jobmonitor.api import (
    kubernetes_schedule_job,
    kubernetes_schedule_job_queue,
    register_job,
    upload_code_package,
)
from jobmonitor.connections import mongo
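
# Files and patterns left out of the uploaded code package.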
excluded_files = [
    "core",
    "output.tmp",
    ".vscode",
    "node_modules",
    "scripts",
    ".git",
    "*.pyc",
    "._*",
    "__pycache__",
    "*.pdf",
    "*.js",
    "*.yaml",
    ".pylintrc",
    ".gitignore",
    ".AppleDouble",
    ".jobignore",
]
project = "decentralized_powersgd"
experiment = os.path.splitext(os.path.basename(__file__))[0]
script = "train.py"
description = """
This is meant to be equivalent to the setup in the Choco DL paper
""".strip()
base_config = {
    "n_workers": 8,
    "topology": "ring",
    "batch_size": 128,
    "task_architecture": "ResNet20",
    "lr_schedule_milestones": [(150, 0.1), (225, 0.1)],
    "optimizer_diffusion_rate": 1.308,
}
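
# Upload the working directory once; every scheduled job clones this snapshot.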
code_package, files_uploaded = upload_code_package(".", excludes=excluded_files + ["gossip_run.py"])
print("Uploaded {} files.".format(len(files_uploaded)))
def schedule(name, config, skip_existing=False):
    # Skip pre-existing entries
    if (
        skip_existing
        and mongo.job.count_documents({"project": project, "job": name, "experiment": experiment})
        > 0
    ):
        return
    config = {**base_config, **config}
    n_workers = config["n_workers"]
    job_id = register_job(
        user="vogels",
        project=project,
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=10,
        config_overrides=config,
        runtime_environment={"clone": {"code_package": code_package}, "script": script},
        annotations={"description": description},
    )
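    # Print the sbatch command that launches this job on the cluster.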
    print(
        f'sbatch --ntasks {n_workers} --job-name="{name}" --gpus-per-task=1 --cpus-per-task=8 --wrap="srun jobrun {job_id} --mpi"'
    )
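
# Hyperparameter sweep for Moniqua (a single combination per dimension in this run).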
seed = 0
for factor in [4]:
    for theta in [0.25]:
        for diffusion_rate in [5e-3]:
            schedule(
                f"moniqua-lr{factor}-theta{theta}-dr{diffusion_rate}",
                dict(
                    distributed_lr_warmup_factor=factor,
                    optimizer="moniqua",
                    optimizer_diffusion_rate=diffusion_rate,
                    optimizer_theta=theta,
                    seed=seed,
                ),
                skip_existing=True,
            )