Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: retry request spot #3116

Merged
merged 6 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions build-system/scripts/create_ecr_manifest
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ for A in $ARCH_LIST
do
ARCH_IMAGE=$IMAGE_URI-$A
echo "Adding image $ARCH_IMAGE to manifest list."
docker manifest create $IMAGE_URI --amend $ARCH_IMAGE
retry docker manifest create $IMAGE_URI --amend $ARCH_IMAGE
done
IFS=$OLD_IFS
unset OLD_IFS

docker manifest push --purge $IMAGE_URI
retry docker manifest push --purge $IMAGE_URI
1 change: 1 addition & 0 deletions build-system/scripts/remote_run_script
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ shift
SSH_CONFIG_PATH=${SSH_CONFIG_PATH:-$BUILD_SYSTEM_PATH/remote/ssh_config}

# Copy the runner script to spot instance. This is what we actually run.
echo "Copying ./remote_runner to $IP..."
scp -rF $SSH_CONFIG_PATH $BUILD_SYSTEM_PATH/scripts/remote_runner $IP:.

# Run script on remote instance, passing environment variables.
Expand Down
13 changes: 9 additions & 4 deletions build-system/scripts/request_spot
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ INSTANCE_TYPE_SUFFIX=${cpu_map[$CPUS]}

# Check if INSTANCE_TYPE_SUFFIX is set, if not, the CPU count is not recognized.
if [ -z "$INSTANCE_TYPE_SUFFIX" ]; then
echo "Unrecognized CPU count: $CPUS"
>&2 echo "Unrecognized CPU count: $CPUS"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`>&2` at the end of the line seems to read better to me, but YMMV.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having it at the start of the line makes it very clear, I think. One's brain can't fail to see it.

exit 1
fi

Expand Down Expand Up @@ -110,6 +110,11 @@ done

# Wait till ssh port is open.
>&2 echo "Waiting for SSH at $IP..."
while ! nc -z $IP 22; do sleep 1; done;

echo $IP
for I in {1..60}; do
if nc -z $IP 22; then
echo $IP
exit 0
fi
sleep 1
done
exit 1
2 changes: 1 addition & 1 deletion build-system/scripts/retry
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ ATTEMPTS=3
for i in $(seq 1 $ATTEMPTS); do
"$@" && exit || sleep 10
done
echo "$@ failed after $ATTEMPTS attempts"
>&2 echo "$@ failed after $ATTEMPTS attempts"
exit 1
29 changes: 17 additions & 12 deletions build-system/scripts/spot_run_script
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,31 @@ CONTENT_HASH=$1
CPUS=$2
shift 2

# On any sort of exit (error or not), kill spot request so it doesn't count against quota.
# On any sort of exit (error or not).
# Cleanup handler (registered via `trap on_exit EXIT` right after this function).
# Globals read: IP, SSH_CONFIG_PATH, CONTENT_HASH, JOB_NAME.
# Globals written: SIR (not declared local).
# Steps:
#   1. If $IP is non-empty, ask the remote host to power off over ssh.
#   2. If the file sir-$CONTENT_HASH:$JOB_NAME.txt exists in the current
#      directory, cancel the spot instance request whose id it contains.
function on_exit {
# Turn off errexit so a failing command below does not stop the remaining cleanup.
set +e

# Only attempt the shutdown when an address was obtained.
if [ -n "$IP" ]; then
echo "Terminating spot instance..."
# All ssh output is discarded; its exit status is not checked.
ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1
fi

# Kill spot request so it doesn't count against quota.
if [ -f "sir-$CONTENT_HASH:$JOB_NAME.txt" ]; then
# The file's content is used as the spot instance request id.
SIR=$(cat sir-$CONTENT_HASH:$JOB_NAME.txt)
# NOTE(review): the two echo lines below look like the before/after versions of
# a single message from the diff; confirm that only one is meant to remain.
echo "Cancelling spot instance request $SIR (silently)"
echo "Cancelling spot instance request $SIR..."
# Output is suppressed and `|| true` ignores a failed cancellation.
aws ec2 cancel-spot-instance-requests --spot-instance-request-ids $SIR >/dev/null 2>&1 || true
fi
}
trap on_exit EXIT

# Get spot instance.
IP=$(request_spot $CONTENT_HASH:$JOB_NAME $CPUS)
IP=$(retry request_spot $CONTENT_HASH:$JOB_NAME $CPUS)

# Run script remotely on spot instance, capturing success or failure.
set +e
remote_run_script $IP $@
CODE=$?

# Shutdown spot.
echo "Terminating spot instance..."
ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1
if [ -z "$IP" ]; then
echo "Failed to get spot instance."
exit 1
fi

exit $CODE
# Run script remotely on spot instance, capturing success or failure.
remote_run_script $IP $@