Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Move user code background #1461

Merged
merged 3 commits into from
Oct 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 26 additions & 10 deletions src/rest-server/src/templates/dockerContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ function exit_handler()
{
printf "%s %s\n" \
"[DEBUG]" "Docker container exit handler: EXIT signal received in docker container, exiting ..."
kill 0
}

set -x
Expand All @@ -36,11 +35,7 @@ trap exit_handler EXIT


# Create (or refresh the mtime of) this container's docker-side marker file
# under /alive.
touch "/alive/docker_$PAI_CONTAINER_ID"
# Background watchdog: every 20 seconds compare the current time with the
# mtime of the yarn-side file /alive/yarn_$PAI_CONTAINER_ID; once that file is
# more than 60 seconds old, send SIGKILL to the processes selected by
# `pkill --ns 1`.
# NOTE(review): `--ns 1` presumably selects every process sharing PID 1's
# namespaces (i.e. the whole container) -- confirm against the pkill man page.
while /bin/true; do
[ $(( $(date +%s) - $(stat -c %Y /alive/yarn_$PAI_CONTAINER_ID) )) -gt 60 ] \
&& pkill -9 --ns 1
sleep 20
done &



export PAI_WORK_DIR="$(pwd)"
Expand Down Expand Up @@ -183,8 +178,29 @@ fi
# Write env to system-wide environment (/etc/environment is overwritten).
# Note: in the pattern below "^" anchors only the first alternative (PAI);
# the remaining alternatives (PATH, PREFIX, JAVA, HADOOP, NVIDIA, CUDA) match
# anywhere in a "NAME=value" line, so a variable such as LD_LIBRARY_PATH is
# kept too.
# NOTE(review): this also keeps variables whose *value* merely contains one of
# those words -- confirm that is intended.
env | grep -E "^PAI|PATH|PREFIX|JAVA|HADOOP|NVIDIA|CUDA" > /etc/environment

printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
{{{ taskData.command }}} || exit $?
printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"
# Runs the job's command, which the template engine substitutes into the
# placeholder line below when this script is rendered, framed by START/END
# log markers.
#
# This function never returns to its caller: it calls `exit`, so it terminates
# the shell it runs in. The script starts it with `&`, so that shell is a
# background subshell and the status can be collected with `wait`.
#
# Exit status: the command's own status when it fails, 0 otherwise.
function run_user_command()
{
printf "%s %s\n\n" "[INFO]" "USER COMMAND START"
# On failure, exit right away with the failing status; the END marker is
# then not printed.
{{{ taskData.command }}} || exit $?
printf "\n%s %s\n\n" "[INFO]" "USER COMMAND END"
exit 0
}

# Start the user command in the background so that this script stays free to
# watch the yarn-side heartbeat file while the job is running.
run_user_command &
user_command_pid=$!

# Succeeds while /alive/yarn_$PAI_CONTAINER_ID was modified less than
# 30 seconds ago.
# A missing or unreadable file counts as stale: `stat` then prints nothing,
# and feeding that into the arithmetic below would be a syntax error instead
# of a clean "not fresh" answer.
function yarn_heartbeat_is_fresh()
{
  local last_beat
  last_beat=$(stat -c %Y "/alive/yarn_$PAI_CONTAINER_ID") || return 1
  [ $(( $(date +%s) - last_beat )) -lt 30 ]
}

# Poll every 20 seconds until either the heartbeat goes stale or the user
# command is no longer running (`kill -0` only probes for existence; it sends
# no signal).
while yarn_heartbeat_is_fresh && kill -0 "$user_command_pid" 2>/dev/null; do
  sleep 20
done

if kill -0 "$user_command_pid" 2>/dev/null; then
  # The user command is still running, so the loop ended because the
  # heartbeat went stale: report the job as killed and leave with status 0.
  echo "job has been killed, docker container exiting"
  exit 0
else
  # The user command has finished: collect its status and make it the exit
  # status of this script.
  wait "$user_command_pid"
  user_command_exitcode=$?
  echo "job has finished with exit code $user_command_exitcode"
  exit "$user_command_exitcode"
fi

exit 0
18 changes: 5 additions & 13 deletions src/rest-server/src/templates/yarnContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -41,23 +41,14 @@ function exit_handler()
local handler="Yarn container exit handler"
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."

debug_log "$handler" "clean the container code"
rm -fr tmp/pai-root/code 2>/dev/null

debug_log "$handler" "trying to kill docker container $docker_name"
pid=$(docker inspect --format={{{ inspectFormat }}} $docker_name 2>/dev/null)
if [ $pid ]; then
kill -9 $pid &&\
debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
else
debug_log "$handler" "docker container $docker_name has already exited"
fi

debug_log "$handler" "write exit code to file"
debug_log "$handler" "yarn container exit code: $rc"
debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"

debug_log "$handler" "clean the container code"
rm -fr tmp/pai-root/code 2>/dev/null

exit $rc
}

Expand Down Expand Up @@ -265,6 +256,7 @@ docker run --name $docker_name \
--device=/dev/fuse \
--security-opt apparmor:unconfined \
--volume /tmp/pai-root/alive/$APP_ID:/alive \
--volume /tmp/pai-root/alive/$APP_ID/yarn_$CONTAINER_ID:/alive/yarn_$CONTAINER_ID:ro \
--volume /tmp/pai-root/log/$APP_ID/$CONTAINER_ID:/pai/log \
--volume $container_local_dir/$bootstrap_dir:/pai/bootstrap:ro \
--volume $container_local_dir/$code_dir:/pai/code:ro \
Expand Down