start.sh
#!/usr/bin/env bash
# ensure environment variables are loaded
source /root/.bash_profile
# set droplet ID and name from the metadata service
export DROPLET_UID=$(curl -s http://169.254.169.254/metadata/v1/id)
export DROPLET_NAME=$(curl -s http://169.254.169.254/metadata/v1/hostname)
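# NOTE (sketch, not in the original flow): everything below assumes the
# metadata service answered, so bailing out early when DROPLET_UID is empty
# could fail faster than a half-started job.
#[ -n "$DROPLET_UID" ] || { echo "failed to fetch droplet metadata" >&2; exit 1; }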
# create ssh tunnels for connection to rabbitmq, the database, and the api
autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -N -L 5672:localhost:5672 [email protected] &
autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -N -L 5432:localhost:5432 [email protected] &
#autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -N -L 5000:localhost:5000 [email protected] &
# give the ssh tunnels time to come up before anything tries to use them
sleep 10s
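# NOTE (sketch, not in the original flow): instead of a fixed sleep, the
# forwarded ports could be polled with bash's built-in /dev/tcp redirection
# until both tunnels accept connections (assumes bash, which the shebang uses).
#for port in 5672 5432; do
#    until (exec 3<>/dev/tcp/localhost/$port) 2>/dev/null; do sleep 1; done
#done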
# test connection to api, attempt to establish one if it doesn't exist
#curl localhost:5000 || autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -N -L 5000:localhost:5000 [email protected] &
# start the system monitor script
nohup /var/lib/jobs/$JOB_NAME/venv/bin/python3 /var/lib/jobs/$JOB_NAME/system_monitor.py &
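# NOTE (sketch): the nohup'd processes below all share nohup.out; redirecting
# each one to its own log file (the path here is an assumption) would make
# debugging easier, e.g.:
#nohup /var/lib/jobs/$JOB_NAME/venv/bin/python3 /var/lib/jobs/$JOB_NAME/system_monitor.py > /var/log/${JOB_NAME}_system_monitor.log 2>&1 &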
# start pre-reduction publisher
nohup /var/lib/jobs/$JOB_NAME/reducer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/reducer/publisher.py &
prereduction_publisher_process=$!
# start reduction writer
nohup /var/lib/jobs/$JOB_NAME/reducer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/reducer/writer.py &
reduction_writer_process=$!
# determine how many workers to run (one per CPU core)
cpu_count=$(grep -c ^processor /proc/cpuinfo)
#worker_count=$(( cpu_count / 2 ))
worker_count=$cpu_count
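# NOTE (sketch): on systems with GNU coreutils, nproc gives the same count
# with less grep:
#worker_count=$(nproc)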
# start reducers
reducer_processes=()
for i in $(seq 1 $worker_count)
do
    nohup /var/lib/jobs/$JOB_NAME/reducer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/reducer/reducer.py &
    reducer_processes+=($!)
done
# wait until reduction is complete
while true
do
    sleep 1m
    r=$(curl -s --user "$JM_USER:$JM_PASS" "$JOB_MANAGER/jobs/$JOB_ID/state") && [ "$r" == '"reduced"' ] && break
done
# TODO: remove this workaround; the real fix belongs in the reducers, where
# the job state is updated
# give any reducers still draining the queue time to finish
sleep 1m
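# NOTE (sketch of the fix hinted at above, assuming the reducers exit once
# their queue is drained): waiting on the PIDs collected in reducer_processes
# would replace the fixed sleep.
#wait "${reducer_processes[@]}"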
# start pre-vectorization publisher
nohup /var/lib/jobs/$JOB_NAME/vectorizer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/vectorizer/publisher.py &
prevectorization_publisher_process=$!
# start vectorization writer
nohup /var/lib/jobs/$JOB_NAME/vectorizer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/vectorizer/writer.py &
vectorization_writer_process=$!
# start vectorizers
vectorizer_processes=()
for i in $(seq 1 $worker_count)
do
    nohup /var/lib/jobs/$JOB_NAME/vectorizer/venv/bin/python3 /var/lib/jobs/$JOB_NAME/vectorizer/vectorizer.py &
    vectorizer_processes+=($!)
done
# wait until vectorization is complete
while true
do
    sleep 2m
    r=$(curl -s --user "$JM_USER:$JM_PASS" "$JOB_MANAGER/jobs/$JOB_ID/state") && [ "$r" == '"vectorized"' ] && break
done
# TODO: remove this workaround; the real fix belongs in the vectorizers, where
# the job state is updated
# give any vectorizers still draining the queue time to finish
sleep 1m
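# NOTE (sketch, same assumption as above but for the vectorizers): waiting on
# the collected PIDs would replace the fixed sleep.
#wait "${vectorizer_processes[@]}"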
# attempt to cast spells (train tf model)
curl --user $JM_USER:$JM_PASS -X POST $JOB_MANAGER/jobs/$JOB_ID/spell
sleep 10s
# once vectorization is complete, the droplet is no longer needed, so it makes
# a DELETE request on itself.
curl --user $JM_USER:$JM_PASS -X DELETE $JOB_MANAGER/droplets/$DROPLET_UID
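# NOTE (sketch, not in the original flow): if the self-destruct DELETE can fail
# transiently, curl's built-in retry flags could make it more reliable.
#curl --retry 5 --retry-delay 30 --user $JM_USER:$JM_PASS -X DELETE $JOB_MANAGER/droplets/$DROPLET_UID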