From d5f9508faf8234e694bb6adaf2de441df7275d91 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 29 Jul 2020 11:10:04 +0200 Subject: [PATCH 01/84] add architrave example --- examples/architrave/Dockerfile | 60 ++++++ examples/architrave/LICENSE | 201 ++++++++++++++++++ examples/architrave/README.md | 20 ++ examples/architrave/debs.lst | 26 +++ examples/architrave/mpi-run.sh | 140 ++++++++++++ examples/architrave/run_batch.sh | 53 +++++ .../architrave/scar-architrave-batch.yaml | 15 ++ .../architrave/scar-architrave-lambda.yaml | 14 ++ 8 files changed, 529 insertions(+) create mode 100755 examples/architrave/Dockerfile create mode 100755 examples/architrave/LICENSE create mode 100755 examples/architrave/README.md create mode 100755 examples/architrave/debs.lst create mode 100644 examples/architrave/mpi-run.sh create mode 100755 examples/architrave/run_batch.sh create mode 100644 examples/architrave/scar-architrave-batch.yaml create mode 100644 examples/architrave/scar-architrave-lambda.yaml diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile new file mode 100755 index 00000000..ae7e27d7 --- /dev/null +++ b/examples/architrave/Dockerfile @@ -0,0 +1,60 @@ +FROM debian:stretch-slim + +ARG ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave +ARG ADD_PRIVATE_BASE_DIR=architrave +ARG SSHDIR=/root/.ssh +ARG BUILD_PACKAGES=' wget make gcc g++ iproute2 cmake build-essential gfortran curl unzip ' + +ENV ADD_PRIVATE_BASE_DIR=${ADD_PRIVATE_BASE_DIR} +ENV ADD_BASE_DIR_ARCHITRAVE=${ADD_BASE_DIR_ARCHITRAVE} + +ENV EXAMPLE_FILE=/opt/examples/example +ENV TMP_OUTPUT_DIR=/tmp +ENV APP_BIN=/opt/simest +ENV APP_PARAMS="" +ENV MPI_PARAMS='-np 1 --debug-daemons' +ENV JOB_DIR /root/exec/ +ENV SCRATCH_DIR /root/scratch +ENV NOTVISIBLE "in users profile" +ENV VERSION=1.7 +ENV DEBIAN_FRONTEND=noninteractive +## Set to either lambda or batch +ENV EXEC_TYPE=lambda +ENV AWS_ACCESS_KEY='' +ENV AWS_SECRET_ACCESS_KEY='' +ENV AWS_REGION='us-east-1' +ENV AWS_OUTPUT='json' + +# ADD http://launchpadlibrarian.net/265139591/wget_1.13.4-2ubuntu1.4_amd64.deb \ +# http://launchpadlibrarian.net/280581005/libidn11_1.23-2ubuntu0.1_amd64.deb \ +# http://launchpadlibrarian.net/304494179/libssl1.0.0_1.0.1-4ubuntu5.39_amd64.deb \ +# http://launchpadlibrarian.net/312073165/multiarch-support_2.15-0ubuntu10.18_amd64.deb \ +# http://launchpadlibrarian.net/365856917/libc6-dev_2.27-3ubuntu1_amd64.deb \ +# http://launchpadlibrarian.net/365856930/libc-dev-bin_2.27-3ubuntu1_amd64.deb \ +# http://launchpadlibrarian.net/367128628/linux-libc-dev_4.15.0-20.21_amd64.deb \ +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ + +RUN apt-get update \ + && apt-get install -y $BUILD_PACKAGES \ + && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ + && cd /tmp/openmpi-1.4.3/ \ + && ./configure --disable-pty-support \ + && make -j8 \ + && make install \ + && wget -q -P /tmp -i /opt/debs.lst \ + && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ + && apt-get autoremove --purge -y \ + # && apt-get purge $(dpkg-query -Wf '${Package;-40}${Priority}\n' | awk '$2 ~ /optional|extra/ { print $1 }') \ + # && apt-get install -y openssh-server openssh-client bash groff less \ + # dpkg -i /opt/libc6-dev* /opt/libc-dev-bin* /opt/linux-libc-dev* \ + # && dpkg -i --force-all /opt/*.deb \ + && dpkg --force-all -i /tmp/*.deb \ + # && apt-get remove 
--allow-remove-essential --purge -y $BUILD_PACKAGES $(apt-mark showauto) \ + && ulimit -n 1024 \ + && rm -rf /tmp/* /var/lib/apt/lists/* \ + && chmod 755 /opt/mpi-run.sh \ + && chmod 755 /opt/run_batch.sh \ + && chmod 755 ${APP_BIN} + + +CMD /opt/run_batch.sh diff --git a/examples/architrave/LICENSE b/examples/architrave/LICENSE new file mode 100755 index 00000000..261eeb9e --- /dev/null +++ b/examples/architrave/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/examples/architrave/README.md b/examples/architrave/README.md new file mode 100755 index 00000000..41eca9e0 --- /dev/null +++ b/examples/architrave/README.md @@ -0,0 +1,20 @@ +# architrave +Running a commercial app in a Docker container on Lambda and Batch + +You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: + +``` +# Ignore everything +** + +# Allow files and directories +!/architrave/** +!/scar/examples/architrave/** + +# Ignore unnecessary files inside allowed directories +# This should go after the allowed directories +**/scar-architrave-batch.yaml +**/scar-architrave-lambda.yaml +**/README.md +**/LICENSE +``` diff --git a/examples/architrave/debs.lst b/examples/architrave/debs.lst new file mode 100755 index 00000000..44eb9e14 --- /dev/null +++ b/examples/architrave/debs.lst @@ -0,0 +1,26 @@ +http://launchpadlibrarian.net/102057142/gcc-4.6-base_4.6.3-1ubuntu5_amd64.deb +http://launchpadlibrarian.net/79554367/libamd2.2.0_3.4.0-2ubuntu3_amd64.deb +http://launchpadlibrarian.net/79562140/libatlas3gf-base_3.8.4-3build1_amd64.deb +http://launchpadlibrarian.net/93163952/libblacs-mpi1_1.1-31ubuntu1_amd64.deb +http://launchpadlibrarian.net/79432349/libblas3gf_1.2.20110419-2ubuntu1_amd64.deb +http://launchpadlibrarian.net/102057179/libgfortran3_4.6.3-1ubuntu5_amd64.deb +http://launchpadlibrarian.net/86855144/libhdf5-openmpi-1.8.4_1.8.4-patch1-3ubuntu2_amd64.deb +http://launchpadlibrarian.net/57731806/libhypre-2.4.0_2.4.0b-7_amd64.deb +http://launchpadlibrarian.net/83117384/libibverbs1_1.1.5-1ubuntu1_amd64.deb +http://launchpadlibrarian.net/79599965/libmumps-4.9.2_4.9.2.dfsg-7build1_amd64.deb +http://launchpadlibrarian.net/92313905/libnuma1_2.0.8~rc3-1_amd64.deb +http://launchpadlibrarian.net/88908426/libpetsc3.1_3.1.dfsg-11ubuntu1_amd64.deb +http://launchpadlibrarian.net/102057147/libquadmath0_4.6.3-1ubuntu5_amd64.deb +http://launchpadlibrarian.net/88918631/libscalapack-mpi1_1.8.0-7build1_amd64.deb +http://launchpadlibrarian.net/88960741/libscotch-5.1_5.1.12b.dfsg-1_amd64.deb +http://launchpadlibrarian.net/88523913/libspooles2.2_2.2-9_amd64.deb +http://launchpadlibrarian.net/12667499/libsuperlu3_3.0+20070106-3_amd64.deb +http://launchpadlibrarian.net/206369988/libtorque2_2.4.16+dfsg-1+deb7u4build0.12.04.1_amd64.deb +http://launchpadlibrarian.net/79554377/libumfpack5.4.0_3.4.0-2ubuntu3_amd64.deb +http://launchpadlibrarian.net/202507343/libx11-6_1.4.99.1-0ubuntu2.3_amd64.deb +http://launchpadlibrarian.net/202507376/libx11-data_1.4.99.1-0ubuntu2.3_all.deb +http://launchpadlibrarian.net/84788368/libxau6_1.0.6-4_amd64.deb 
+http://launchpadlibrarian.net/140864111/libxcb1_1.8.1-1ubuntu0.2_amd64.deb +http://launchpadlibrarian.net/84805563/libxdmcp6_1.1.0-4_amd64.deb +http://launchpadlibrarian.net/88909481/mpi-default-bin_1.0.1_amd64.deb +http://launchpadlibrarian.net/312073165/multiarch-support_2.15-0ubuntu10.18_amd64.deb diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh new file mode 100644 index 00000000..6dceda43 --- /dev/null +++ b/examples/architrave/mpi-run.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +cd $JOB_DIR + +#PATH="$PATH:/opt/openmpi/bin/" +BASENAME="${0##*/}" +log () { + echo "${BASENAME} - ${1}" +} +HOST_FILE_PATH="/tmp/hostfile" +AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" + +#aws s3 cp $S3_INPUT $SCRATCH_DIR +#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR + +sleep 2 + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat <&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +# Set child by default switch to main if on main node container +NODE_TYPE="child" +if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then + log "Running synchronize as the main node" + NODE_TYPE="main" +fi + + +# wait for all nodes to report +wait_for_nodes () { + log "Running as master node" + + touch $HOST_FILE_PATH + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "master details -> $ip:$availablecores" + echo "$ip slots=$availablecores" >> $HOST_FILE_PATH + + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] + do + log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 1 second" + sleep 1 + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + done + # Make the temporary file executable and run it with any given arguments + log "All nodes successfully joined" + + # remove duplicates if there are any. + awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped + cat $HOST_FILE_PATH-deduped + log "executing main MPIRUN workflow" + + cd $SCRATCH_DIR + mkdir output + mpirun --mca btl_tcp_if_include eth0 --debug-daemons \ + -x PATH -x LD_LIBRARY_PATH + --allow-run-as-root --machinefile ${HOST_FILE_PATH}-deduped \ + ${APP_BIN} ${APP_PARAMS} + sleep 2 + + if [ "${NODE_TYPE}" = 'main' ]; then + env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* + aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_OUTPUT + fi + + log "done! goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" + echo "0" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) + exit 0 +} + + +# Fetch and run a script +report_to_master () { + # get own ip and num cpus + # + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" + until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" + do + echo "Sleeping 5 seconds and trying again" + done + log "done! 
goodbye" + exit 0 +} + + +# Main - dispatch user request to appropriate function +log $NODE_TYPE +case $NODE_TYPE in + main) + wait_for_nodes "${@}" + ;; + + child) + report_to_master "${@}" + ;; + + *) + log $NODE_TYPE + usage "Could not determine node type. Expected (main/child)" + ;; +esac diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh new file mode 100755 index 00000000..0d7956e0 --- /dev/null +++ b/examples/architrave/run_batch.sh @@ -0,0 +1,53 @@ +#!/bin/bash +if [ "${EXEC_TYPE,,}" = 'lambda' ]; then + export OMPI_MCA_plm_rsh_agent=/bin/false + mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} + +elif [ "${EXEC_TYPE,,}" = 'batch' ]; then + + apt-get update + apt-get install -y wget + wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + unzip -q -d /tmp /tmp/awscli-exe-linux-x86_64.zip + /tmp/aws/install + echo "Version of dist: ${VERSION}" + mkdir ~/.aws/ + ## S3 OPTIMIZATION + aws configure set default.s3.max_concurrent_requests 30 + aws configure set default.s3.max_queue_size 10000 + aws configure set default.s3.multipart_threshold 64MB + aws configure set default.s3.multipart_chunksize 16MB + aws configure set default.s3.max_bandwidth 4096MB/s + aws configure set default.s3.addressing_style path + printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials + printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config + #aws s3 cp $S3_INPUT/common $SCRATCH_DIR + chmod +x ${SCRATCH_DIR}/simest + ## Install ssh from S3 + mkdir /tmp/deps_batch + aws cli cp ${S3_INPUT}/batch /tmp/batch + dpkg -i /tmp/batch/deps/*.deb + + # COnfigure ssh + sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + echo "export VISIBLE=now" >> /etc/profile + echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + mkdir -p ${SSHDIR} + touch ${SSHDIR}/sshd_config + ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' + cp ${SSHDIR}/ssh_host_rsa_key.pub ${SSHDIR}/authorized_keys + cp ${SSHDIR}/ssh_host_rsa_key ${SSHDIR}/id_rsa + echo " IdentityFile ${SSHDIR}/id_rsa" >> /etc/ssh/ssh_config + echo "Host *" >> /etc/ssh/ssh_config + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config + chmod -R 600 ${SSHDIR}/* + chown -R ${USER}:${USER} ${SSHDIR}/ + # check if ssh agent is running or not, if not, run + eval `ssh-agent -s` + ssh-add ${SSHDIR}/id_rsa + + /opt/mpi-run.sh +else + echo "ERROR: unknown execution type '${EXEC_TYPE}'" + exit 1 # terminate and indicate error +fi diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/scar-architrave-batch.yaml new file mode 100644 index 00000000..38c996b9 --- /dev/null +++ b/examples/architrave/scar-architrave-batch.yaml @@ -0,0 +1,15 @@ +functions: + aws: + - lambda: + name: scar-architrave + execution_mode: batch + container: + image_file: /tmp/architrave-docker-img.tar.gz + environment: + Variables: + EXEC_TYPE: batch + deployment: + bucket: scar-architrave + output: + - storage_provider: s3 + path: scar-architrave/output diff --git a/examples/architrave/scar-architrave-lambda.yaml b/examples/architrave/scar-architrave-lambda.yaml new file mode 100644 index 00000000..14363a5f --- /dev/null +++ b/examples/architrave/scar-architrave-lambda.yaml @@ -0,0 +1,14 @@ +functions: + aws: + - lambda: + name: scar-architrave + container: + image_file: /tmp/architrave-docker-img.tar.gz + environment: + Variables: + EXEC_TYPE: lambda + deployment: + bucket: 
scar-architrave + output: + - storage_provider: s3 + path: scar-architrave/output From 101daa626481bb1eb736bac51d2ad209a4db8b7c Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 31 Jul 2020 15:24:05 +0200 Subject: [PATCH 02/84] finalized lambda container --- examples/architrave/Dockerfile | 33 ++++++------------- examples/architrave/debs.lst | 1 - examples/architrave/run_batch.sh | 3 +- .../architrave/scar-architrave-lambda.yaml | 2 ++ 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index ae7e27d7..66a79476 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -3,39 +3,32 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave ARG ADD_PRIVATE_BASE_DIR=architrave ARG SSHDIR=/root/.ssh -ARG BUILD_PACKAGES=' wget make gcc g++ iproute2 cmake build-essential gfortran curl unzip ' +ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl unzip ' ENV ADD_PRIVATE_BASE_DIR=${ADD_PRIVATE_BASE_DIR} ENV ADD_BASE_DIR_ARCHITRAVE=${ADD_BASE_DIR_ARCHITRAVE} +ENV VERSION=1.7 +ENV DEBIAN_FRONTEND=noninteractive +## Set to either lambda or batch +ENV EXEC_TYPE=lambda + ENV EXAMPLE_FILE=/opt/examples/example ENV TMP_OUTPUT_DIR=/tmp ENV APP_BIN=/opt/simest ENV APP_PARAMS="" ENV MPI_PARAMS='-np 1 --debug-daemons' -ENV JOB_DIR /root/exec/ -ENV SCRATCH_DIR /root/scratch -ENV NOTVISIBLE "in users profile" -ENV VERSION=1.7 -ENV DEBIAN_FRONTEND=noninteractive -## Set to either lambda or batch -ENV EXEC_TYPE=lambda +ENV JOB_DIR=/root/exec/ +ENV SCRATCH_DIR=/root/scratch ENV AWS_ACCESS_KEY='' ENV AWS_SECRET_ACCESS_KEY='' ENV AWS_REGION='us-east-1' ENV AWS_OUTPUT='json' -# ADD http://launchpadlibrarian.net/265139591/wget_1.13.4-2ubuntu1.4_amd64.deb \ -# http://launchpadlibrarian.net/280581005/libidn11_1.23-2ubuntu0.1_amd64.deb \ -# http://launchpadlibrarian.net/304494179/libssl1.0.0_1.0.1-4ubuntu5.39_amd64.deb \ -# http://launchpadlibrarian.net/312073165/multiarch-support_2.15-0ubuntu10.18_amd64.deb \ -# http://launchpadlibrarian.net/365856917/libc6-dev_2.27-3ubuntu1_amd64.deb \ -# http://launchpadlibrarian.net/365856930/libc-dev-bin_2.27-3ubuntu1_amd64.deb \ -# http://launchpadlibrarian.net/367128628/linux-libc-dev_4.15.0-20.21_amd64.deb \ ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES \ + && apt-get install -y $BUILD_PACKAGES wget \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support \ @@ -44,17 +37,11 @@ RUN apt-get update \ && wget -q -P /tmp -i /opt/debs.lst \ && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ && apt-get autoremove --purge -y \ - # && apt-get purge $(dpkg-query -Wf '${Package;-40}${Priority}\n' | awk '$2 ~ /optional|extra/ { print $1 }') \ - # && apt-get install -y openssh-server openssh-client bash groff less \ - # dpkg -i /opt/libc6-dev* /opt/libc-dev-bin* /opt/linux-libc-dev* \ - # && dpkg -i --force-all /opt/*.deb \ && dpkg --force-all -i /tmp/*.deb \ - # && apt-get remove --allow-remove-essential --purge -y $BUILD_PACKAGES $(apt-mark showauto) \ && ulimit -n 1024 \ && rm -rf /tmp/* /var/lib/apt/lists/* \ && chmod 755 /opt/mpi-run.sh \ && chmod 755 /opt/run_batch.sh \ && chmod 755 ${APP_BIN} - -CMD /opt/run_batch.sh 
+CMD echo ${TST} && /opt/run_batch.sh diff --git a/examples/architrave/debs.lst b/examples/architrave/debs.lst index 44eb9e14..5d97fc7f 100755 --- a/examples/architrave/debs.lst +++ b/examples/architrave/debs.lst @@ -23,4 +23,3 @@ http://launchpadlibrarian.net/84788368/libxau6_1.0.6-4_amd64.deb http://launchpadlibrarian.net/140864111/libxcb1_1.8.1-1ubuntu0.2_amd64.deb http://launchpadlibrarian.net/84805563/libxdmcp6_1.1.0-4_amd64.deb http://launchpadlibrarian.net/88909481/mpi-default-bin_1.0.1_amd64.deb -http://launchpadlibrarian.net/312073165/multiarch-support_2.15-0ubuntu10.18_amd64.deb diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh index 0d7956e0..c5a64548 100755 --- a/examples/architrave/run_batch.sh +++ b/examples/architrave/run_batch.sh @@ -1,12 +1,11 @@ #!/bin/bash + if [ "${EXEC_TYPE,,}" = 'lambda' ]; then export OMPI_MCA_plm_rsh_agent=/bin/false mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} elif [ "${EXEC_TYPE,,}" = 'batch' ]; then - apt-get update - apt-get install -y wget wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip unzip -q -d /tmp /tmp/awscli-exe-linux-x86_64.zip /tmp/aws/install diff --git a/examples/architrave/scar-architrave-lambda.yaml b/examples/architrave/scar-architrave-lambda.yaml index 14363a5f..d5524303 100644 --- a/examples/architrave/scar-architrave-lambda.yaml +++ b/examples/architrave/scar-architrave-lambda.yaml @@ -2,11 +2,13 @@ functions: aws: - lambda: name: scar-architrave + run_script: /tmp/scar_run_init.sh container: image_file: /tmp/architrave-docker-img.tar.gz environment: Variables: EXEC_TYPE: lambda + TST: default_init default_run deployment: bucket: scar-architrave output: From e5e964c1168ce6acd508ee7a28a6aa5a2d0b1a41 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 2 Sep 2020 10:37:10 +0200 Subject: [PATCH 03/84] upd README --- examples/architrave/Dockerfile | 9 ++++---- examples/architrave/README.md | 22 +++++++++++++++++++ examples/architrave/mpi-run.sh | 10 ++++----- examples/architrave/run_batch.sh | 19 +++++++++++----- .../architrave/scar-architrave-batch.yaml | 1 + .../architrave/scar-architrave-lambda.yaml | 1 - examples/architrave/scar_run_init.sh | 2 ++ 7 files changed, 47 insertions(+), 17 deletions(-) mode change 100644 => 100755 examples/architrave/mpi-run.sh mode change 100644 => 100755 examples/architrave/scar-architrave-batch.yaml mode change 100644 => 100755 examples/architrave/scar-architrave-lambda.yaml create mode 100755 examples/architrave/scar_run_init.sh diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 66a79476..0e1d4974 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -2,8 +2,7 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave ARG ADD_PRIVATE_BASE_DIR=architrave -ARG SSHDIR=/root/.ssh -ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl unzip ' +ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' ENV ADD_PRIVATE_BASE_DIR=${ADD_PRIVATE_BASE_DIR} ENV ADD_BASE_DIR_ARCHITRAVE=${ADD_BASE_DIR_ARCHITRAVE} @@ -12,6 +11,7 @@ ENV VERSION=1.7 ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch ENV EXEC_TYPE=lambda +ENV SSHDIR=/root/.ssh ENV EXAMPLE_FILE=/opt/examples/example ENV TMP_OUTPUT_DIR=/tmp @@ -24,11 +24,12 @@ ENV AWS_ACCESS_KEY='' ENV AWS_SECRET_ACCESS_KEY='' ENV AWS_REGION='us-east-1' ENV AWS_OUTPUT='json' +ENV S3_BUCKET="s3://scar-architrave" ADD ${ADD_PRIVATE_BASE_DIR} 
${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES wget \ + && apt-get install -y $BUILD_PACKAGES wget unzip \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support \ @@ -44,4 +45,4 @@ RUN apt-get update \ && chmod 755 /opt/run_batch.sh \ && chmod 755 ${APP_BIN} -CMD echo ${TST} && /opt/run_batch.sh +CMD /opt/run_batch.sh diff --git a/examples/architrave/README.md b/examples/architrave/README.md index 41eca9e0..af703dc9 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -18,3 +18,25 @@ You can ignore everything but the private files and those from ##scar/examples/a **/README.md **/LICENSE ``` + +### Batch additional required packages on S3 + +Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). + +`docker run -it -v /tmp/deps:/tmp/deps debian:stretch-slim` + +In the running container: + +``` +# determine all of the dependencies needed by the packages we want to install: +apt update && apt install -y apt-rdepends && \ +apt-rdepends openssh-server openssh-client iproute2 unzip | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ +apt-get --purge autoremove -y apt-rdepends && \ +# filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) +apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ +grep -F -v -f /tmp/deps_installed.lst /tmp/deps_tmp.lst > /tmp/deps.lst && \ +# download the list of packages, but don't install them +cd /tmp/deps && apt-get download $(cat /tmp/deps.lst) && \ +# Create the list of deps in a file; This file is used to download the required deps from an S3 bucket +ls -1 /tmp/deps > /tmp/deps/deps_batch.lst +``` diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh old mode 100644 new mode 100755 index 6dceda43..1bc49ee1 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -81,16 +81,14 @@ wait_for_nodes () { cd $SCRATCH_DIR mkdir output - mpirun --mca btl_tcp_if_include eth0 --debug-daemons \ - -x PATH -x LD_LIBRARY_PATH - --allow-run-as-root --machinefile ${HOST_FILE_PATH}-deduped \ + mpirun --mca btl_tcp_if_include eth0 --debug-daemons --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_PARAMS} sleep 2 - if [ "${NODE_TYPE}" = 'main' ]; then + #if [ "${NODE_TYPE}" = 'main' ]; then env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* - aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_OUTPUT - fi + aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz + #fi log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" echo "0" > $AWS_BATCH_EXIT_CODE_FILE diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh index c5a64548..d5179de9 100755 --- a/examples/architrave/run_batch.sh +++ b/examples/architrave/run_batch.sh @@ -6,8 +6,14 @@ if [ "${EXEC_TYPE,,}" = 'lambda' ]; then elif [ "${EXEC_TYPE,,}" = 'batch' ]; then + export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file + echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" + echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" + + #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - unzip -q -d /tmp /tmp/awscli-exe-linux-x86_64.zip + unzip -o -q -d /tmp /tmp/awscli-exe-linux-x86_64.zip + chmod +x /tmp/aws/install /tmp/aws/install echo "Version of dist: ${VERSION}" mkdir ~/.aws/ @@ -21,11 +27,12 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config #aws s3 cp $S3_INPUT/common $SCRATCH_DIR - chmod +x ${SCRATCH_DIR}/simest - ## Install ssh from S3 - mkdir /tmp/deps_batch - aws cli cp ${S3_INPUT}/batch /tmp/batch - dpkg -i /tmp/batch/deps/*.deb + ## Install batch only dependencies from S3 + mkdir ${SCRATCH_DIR} + mkdir ${JOB_DIR} + aws s3 cp ${S3_BUCKET}/batch_deps/deps.tar.gz /tmp + tar -zxf /tmp/deps.tar.gz -C /tmp + dpkg -i /tmp/*.deb # COnfigure ssh sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/scar-architrave-batch.yaml old mode 100644 new mode 100755 index 38c996b9..7301cff8 --- a/examples/architrave/scar-architrave-batch.yaml +++ b/examples/architrave/scar-architrave-batch.yaml @@ -8,6 +8,7 @@ functions: environment: Variables: EXEC_TYPE: batch + deployment: bucket: scar-architrave output: diff --git a/examples/architrave/scar-architrave-lambda.yaml b/examples/architrave/scar-architrave-lambda.yaml old mode 100644 new mode 100755 index d5524303..fae9e1d0 --- a/examples/architrave/scar-architrave-lambda.yaml +++ b/examples/architrave/scar-architrave-lambda.yaml @@ -8,7 +8,6 @@ functions: environment: Variables: EXEC_TYPE: lambda - TST: default_init default_run deployment: bucket: scar-architrave output: diff --git a/examples/architrave/scar_run_init.sh b/examples/architrave/scar_run_init.sh new file mode 100755 index 00000000..19a470fb --- /dev/null +++ b/examples/architrave/scar_run_init.sh @@ -0,0 +1,2 @@ +export APP_PARAMS="" +/opt/run_batch.sh From 27c6349a8f660f6c1ba5f26b86b382e1fea6a0bd Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 2 Sep 2020 11:16:22 +0200 Subject: [PATCH 04/84] rm ADD envs chg defaults for base dir and private --- examples/architrave/Dockerfile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 0e1d4974..f4da037f 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,12 +1,9 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave -ARG ADD_PRIVATE_BASE_DIR=architrave +ARG 
ADD_BASE_DIR_ARCHITRAVE=. +ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' -ENV ADD_PRIVATE_BASE_DIR=${ADD_PRIVATE_BASE_DIR} -ENV ADD_BASE_DIR_ARCHITRAVE=${ADD_BASE_DIR_ARCHITRAVE} - ENV VERSION=1.7 ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch From ff29ccb8e11d71f4f4d85200b271a4c9572acdeb Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 2 Sep 2020 11:23:37 +0200 Subject: [PATCH 05/84] chg default base dir to relative to scar base --- examples/architrave/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index f4da037f..85eb5118 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,9 +1,10 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=. +ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' + ENV VERSION=1.7 ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch From 12caee347eac2b907f5f3d08838f688f375424bd Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 2 Sep 2020 11:40:10 +0200 Subject: [PATCH 06/84] rm chmod app bin --- examples/architrave/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 85eb5118..f6a776a8 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -40,7 +40,6 @@ RUN apt-get update \ && ulimit -n 1024 \ && rm -rf /tmp/* /var/lib/apt/lists/* \ && chmod 755 /opt/mpi-run.sh \ - && chmod 755 /opt/run_batch.sh \ - && chmod 755 ${APP_BIN} + && chmod 755 /opt/run_batch.sh CMD /opt/run_batch.sh From 60086ee7731aba5af7b6cb02c9bb1011b5abeaef Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 3 Sep 2020 13:19:16 +0200 Subject: [PATCH 07/84] upd batch private files --- examples/architrave/Dockerfile | 5 ++++- examples/architrave/README.md | 19 ++++++++++++++++++- examples/architrave/run_batch.sh | 11 +++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index f6a776a8..276d9f16 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -23,11 +23,14 @@ ENV AWS_SECRET_ACCESS_KEY='' ENV AWS_REGION='us-east-1' ENV AWS_OUTPUT='json' ENV S3_BUCKET="s3://scar-architrave" +ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz +ENV S3_BATCH_PRIVATE_REL_PATH= +ENV PRIVATE_PASSWD= ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES wget unzip \ + && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support \ diff --git a/examples/architrave/README.md b/examples/architrave/README.md index af703dc9..eac00722 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -1,6 +1,23 @@ # architrave Running a commercial app in a Docker container on Lambda and Batch +## Building the containers + +Due to the differences between Amazon Batch and Lambda and our choice to have one Dockerfile for both, there are some things one has to take into account when running the containers with SCAR. 
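The same container image serves both back-ends; at run time the entrypoint simply branches on the `EXEC_TYPE` variable. The block below is only a condensed sketch of the dispatch performed by `run_batch.sh` — the AWS CLI, S3, package and ssh setup of the batch branch is elided, and `MPI_PARAMS`, `APP_BIN` and `APP_PARAMS` are the ENV defaults from the Dockerfile (or their overrides at deployment time):

```
#!/bin/bash
# Sketch of the EXEC_TYPE dispatch in run_batch.sh (batch setup steps elided).
if [ "${EXEC_TYPE,,}" = 'lambda' ]; then
    # Lambda: no ssh between nodes, so disable the rsh agent and run MPI locally
    export OMPI_MCA_plm_rsh_agent=/bin/false
    mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS}
elif [ "${EXEC_TYPE,,}" = 'batch' ]; then
    # Batch (elided here): install the AWS CLI, fetch the extra .deb packages and
    # the private data from S3, configure ssh, then hand over to the MPI launcher
    /opt/mpi-run.sh
else
    echo "ERROR: unknown execution type '${EXEC_TYPE}'"
    exit 1  # terminate and indicate error
fi
```

Only the `batch` branch needs the extra runtime setup, which is why the Lambda deployment can remain a plain container invocation.
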
+ +### Lambda + +We included all necessary operations in the Dockerfile, therefore leaving the runtime execution populated only with the application execution itself. +The Docker image doesn't have to be public, thus we can build it locally by calling: + +`docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg --build-arg ADD_PRIVATE_BASE_DIR=architrave -f /tmp/scar/examples/architrave/Dockerfile --label architrave -t architrave /tmp` + +This command implies that __/tmp__ is the base directory for the context. +Herein, there's a folder __architrave__ that contains the private bits of the distribution, in our case the binary of the application and examples. +The same base directory contains the cloned scar repository with the architrave example (the public bits). + + + You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: ``` @@ -30,7 +47,7 @@ In the running container: ``` # determine all of the dependencies needed by the packages we want to install: apt update && apt install -y apt-rdepends && \ -apt-rdepends openssh-server openssh-client iproute2 unzip | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ +apt-rdepends openssh-server openssh-client iproute2 | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ apt-get --purge autoremove -y apt-rdepends && \ # filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh index d5179de9..2980816d 100755 --- a/examples/architrave/run_batch.sh +++ b/examples/architrave/run_batch.sh @@ -12,7 +12,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - unzip -o -q -d /tmp /tmp/awscli-exe-linux-x86_64.zip + 7z x -aoa -o/tmp/ /tmp/awscli-exe-linux-x86_64.zip chmod +x /tmp/aws/install /tmp/aws/install echo "Version of dist: ${VERSION}" @@ -30,10 +30,15 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then ## Install batch only dependencies from S3 mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} - aws s3 cp ${S3_BUCKET}/batch_deps/deps.tar.gz /tmp + aws s3 cp ${S3_BUCKET}/${S3_BATCH_DEPS_REL_PATH} /tmp tar -zxf /tmp/deps.tar.gz -C /tmp dpkg -i /tmp/*.deb + ## Add the private data from S3 + #rm -rf /tmp/* + aws s3 cp ${S3_BUCKET}/${S3_BATCH_PRIVATE_REL_PATH} /tmp + 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z + # COnfigure ssh sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd echo "export VISIBLE=now" >> /etc/profile @@ -52,6 +57,8 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then eval `ssh-agent -s` ssh-add ${SSHDIR}/id_rsa + chmod +x ${APP_BIN} + /opt/mpi-run.sh else echo "ERROR: unknown execution type '${EXEC_TYPE}'" From 0a42afb8a8d4c5fe2d1036cff89f0b5d3bac209c Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 4 Sep 2020 13:11:16 +0200 Subject: [PATCH 08/84] new init script upd README for local exec fix problems launching batch --- examples/architrave/Dockerfile | 6 ++-- examples/architrave/README.md | 35 +++++++++++++++---- examples/architrave/run.sh | 14 ++++++++ 
examples/architrave/run_batch.sh | 11 ------ .../architrave/scar-architrave-batch.yaml | 5 ++- 5 files changed, 48 insertions(+), 23 deletions(-) create mode 100644 examples/architrave/run.sh diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 276d9f16..ce96797f 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -27,7 +27,7 @@ ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ @@ -43,6 +43,6 @@ RUN apt-get update \ && ulimit -n 1024 \ && rm -rf /tmp/* /var/lib/apt/lists/* \ && chmod 755 /opt/mpi-run.sh \ - && chmod 755 /opt/run_batch.sh + && chmod 755 /opt/run.sh -CMD /opt/run_batch.sh +CMD /opt/run.sh diff --git a/examples/architrave/README.md b/examples/architrave/README.md index eac00722..0821d948 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -8,15 +8,36 @@ Due to the differences between Amazon Batch and Lambda and our choice to have on ### Lambda We included all necessary operations in the Dockerfile, therefore leaving the runtime execution populated only with the application execution itself. -The Docker image doesn't have to be public, thus we can build it locally by calling: +The Docker image doesn't have to be public, we can build it locally. -`docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg --build-arg ADD_PRIVATE_BASE_DIR=architrave -f /tmp/scar/examples/architrave/Dockerfile --label architrave -t architrave /tmp` +``` +# The base dir is the root context for Docker +# We assume that you cloned this repo in the BASE_DIR +export BASE_DIR=/tmp +# The base dir with the private bits; It must be a child of BASE_DIR +export ADD_PRIVATE_BASE_DIR=architrave -This command implies that __/tmp__ is the base directory for the context. -Herein, there's a folder __architrave__ that contains the private bits of the distribution, in our case the binary of the application and examples. -The same base directory contains the cloned scar repository with the architrave example (the public bits). +docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg ADD_PRIVATE_BASE_DIR="$ADD_PRIVATE_BASE_DIR" -f "$BASE_DIR/scar/examples/architrave/Dockerfile" --label architrave -t architrave "$BASE_DIR" +``` +Take into account that the input files must be located in the __ADD_PRIVATE_BASE_DIR__ directory. +e.g. if you have something like `$BASE_DIR/$ADD_PRIVATE_BASE_DIR/examples/example_input.file`, then you the example input ends up on the following path: `/opt/examples/example_input.file` +If you want to run the container locally before launching it on Amazon Lambda, you can use the following: + +``` +# This is the path inside the container where the binary can be found +# it is the relative path of ADD_PRIVATE_BASE_DIR without the root +# e.g. 
for architrave/path/path2/execute_me (where ADD_PRIVATE_BASE_DIR=architrave) is path/path2/execute_me +export APP_BIN=/opt/ +# The full list of params needed for the app, don't forget the (double) quotes when there are spaces +export APP_PARAMS= + +# Mount the results dir you specify in the APP_PARAMS env variable to +docker run -d -e EXEC_TYPE=lambda -e APP_BIN="$APP_BIN" -e APP_PARAMS="$APP_PARAMS" --name architrave_local -v /tmp/architrave-result:/ architrave:latest +``` + +#### Build context You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: @@ -36,7 +57,9 @@ You can ignore everything but the private files and those from ##scar/examples/a **/LICENSE ``` -### Batch additional required packages on S3 +### Batch + +#### Batch additional required packages on S3 Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh new file mode 100644 index 00000000..a75da0cb --- /dev/null +++ b/examples/architrave/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ "${EXEC_TYPE,,}" = 'lambda' ]; then + export OMPI_MCA_plm_rsh_agent=/bin/false + mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} + +elif [ "${EXEC_TYPE,,}" = 'batch' ]; then + + chmod 755 /opt/run.sh + /opt/run_batch.sh +else + echo "ERROR: unknown execution type '${EXEC_TYPE}'" + exit 1 # terminate and indicate error +fi diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh index 2980816d..2f55d152 100755 --- a/examples/architrave/run_batch.sh +++ b/examples/architrave/run_batch.sh @@ -1,11 +1,4 @@ #!/bin/bash - -if [ "${EXEC_TYPE,,}" = 'lambda' ]; then - export OMPI_MCA_plm_rsh_agent=/bin/false - mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} - -elif [ "${EXEC_TYPE,,}" = 'batch' ]; then - export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" @@ -60,7 +53,3 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then chmod +x ${APP_BIN} /opt/mpi-run.sh -else - echo "ERROR: unknown execution type '${EXEC_TYPE}'" - exit 1 # terminate and indicate error -fi diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/scar-architrave-batch.yaml index 7301cff8..7a3ff410 100755 --- a/examples/architrave/scar-architrave-batch.yaml +++ b/examples/architrave/scar-architrave-batch.yaml @@ -2,15 +2,14 @@ functions: aws: - lambda: name: scar-architrave + init_script: run_batch.sh execution_mode: batch container: - image_file: /tmp/architrave-docker-img.tar.gz + image: asalic/scar-architrave environment: Variables: EXEC_TYPE: batch - deployment: - bucket: scar-architrave output: - storage_provider: s3 path: scar-architrave/output From b009a5a5fd8a20ae94edfe57602a485232a98a83 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Sep 2020 11:52:41 +0200 Subject: [PATCH 09/84] mod to run on aws batch --- examples/architrave/Dockerfile | 2 +- examples/architrave/run.sh | 59 ++++++++++++++++++- examples/architrave/run_batch.sh | 56 +----------------- .../architrave/scar-architrave-batch.yaml | 4 +- 4 files changed, 62 insertions(+), 59 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile 
index ce96797f..82511eab 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -27,7 +27,7 @@ ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/run_batch.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index a75da0cb..a905c929 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -6,8 +6,63 @@ if [ "${EXEC_TYPE,,}" = 'lambda' ]; then elif [ "${EXEC_TYPE,,}" = 'batch' ]; then - chmod 755 /opt/run.sh - /opt/run_batch.sh +# The following comment line will be replaced with the necessary env vars: +#$ENV_VARS$ + + export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file + echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" + echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" + + #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' + wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + 7z x -aoa -o/tmp/ /tmp/awscli-exe-linux-x86_64.zip + chmod +x /tmp/aws/install + /tmp/aws/install + echo "Version of dist: ${VERSION}" + mkdir ~/.aws/ + ## S3 OPTIMIZATION + aws configure set default.s3.max_concurrent_requests 30 + aws configure set default.s3.max_queue_size 10000 + aws configure set default.s3.multipart_threshold 64MB + aws configure set default.s3.multipart_chunksize 16MB + aws configure set default.s3.max_bandwidth 4096MB/s + aws configure set default.s3.addressing_style path + printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials + printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config + #aws s3 cp $S3_INPUT/common $SCRATCH_DIR + ## Install batch only dependencies from S3 + mkdir ${SCRATCH_DIR} + mkdir ${JOB_DIR} + aws s3 cp ${S3_BUCKET}/${S3_BATCH_DEPS_REL_PATH} /tmp + tar -zxf /tmp/deps.tar.gz -C /tmp + dpkg -i /tmp/*.deb + + ## Add the private data from S3 + #rm -rf /tmp/* + aws s3 cp ${S3_BUCKET}/${S3_BATCH_PRIVATE_REL_PATH} /tmp + 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z + + # COnfigure ssh + sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + echo "export VISIBLE=now" >> /etc/profile + echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + mkdir -p ${SSHDIR} + touch ${SSHDIR}/sshd_config + ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' + cp ${SSHDIR}/ssh_host_rsa_key.pub ${SSHDIR}/authorized_keys + cp ${SSHDIR}/ssh_host_rsa_key ${SSHDIR}/id_rsa + echo " IdentityFile ${SSHDIR}/id_rsa" >> /etc/ssh/ssh_config + echo "Host *" >> /etc/ssh/ssh_config + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config + chmod -R 600 ${SSHDIR}/* + chown -R ${USER}:${USER} ${SSHDIR}/ + # check if ssh agent is running or not, if not, run + eval `ssh-agent -s` + ssh-add ${SSHDIR}/id_rsa + + chmod +x ${APP_BIN} + + /opt/mpi-run.sh else echo "ERROR: unknown execution type '${EXEC_TYPE}'" exit 1 # terminate and indicate error diff --git 
a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh index 2f55d152..7d22f90e 100755 --- a/examples/architrave/run_batch.sh +++ b/examples/architrave/run_batch.sh @@ -1,55 +1 @@ -#!/bin/bash - export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file - echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" - echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" - - #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' - wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - 7z x -aoa -o/tmp/ /tmp/awscli-exe-linux-x86_64.zip - chmod +x /tmp/aws/install - /tmp/aws/install - echo "Version of dist: ${VERSION}" - mkdir ~/.aws/ - ## S3 OPTIMIZATION - aws configure set default.s3.max_concurrent_requests 30 - aws configure set default.s3.max_queue_size 10000 - aws configure set default.s3.multipart_threshold 64MB - aws configure set default.s3.multipart_chunksize 16MB - aws configure set default.s3.max_bandwidth 4096MB/s - aws configure set default.s3.addressing_style path - printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials - printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config - #aws s3 cp $S3_INPUT/common $SCRATCH_DIR - ## Install batch only dependencies from S3 - mkdir ${SCRATCH_DIR} - mkdir ${JOB_DIR} - aws s3 cp ${S3_BUCKET}/${S3_BATCH_DEPS_REL_PATH} /tmp - tar -zxf /tmp/deps.tar.gz -C /tmp - dpkg -i /tmp/*.deb - - ## Add the private data from S3 - #rm -rf /tmp/* - aws s3 cp ${S3_BUCKET}/${S3_BATCH_PRIVATE_REL_PATH} /tmp - 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z - - # COnfigure ssh - sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - echo "export VISIBLE=now" >> /etc/profile - echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers - mkdir -p ${SSHDIR} - touch ${SSHDIR}/sshd_config - ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' - cp ${SSHDIR}/ssh_host_rsa_key.pub ${SSHDIR}/authorized_keys - cp ${SSHDIR}/ssh_host_rsa_key ${SSHDIR}/id_rsa - echo " IdentityFile ${SSHDIR}/id_rsa" >> /etc/ssh/ssh_config - echo "Host *" >> /etc/ssh/ssh_config - echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config - chmod -R 600 ${SSHDIR}/* - chown -R ${USER}:${USER} ${SSHDIR}/ - # check if ssh agent is running or not, if not, run - eval `ssh-agent -s` - ssh-add ${SSHDIR}/id_rsa - - chmod +x ${APP_BIN} - - /opt/mpi-run.sh +bash $INPUT_FILE_PATH diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/scar-architrave-batch.yaml index 7a3ff410..2bac2e5f 100755 --- a/examples/architrave/scar-architrave-batch.yaml +++ b/examples/architrave/scar-architrave-batch.yaml @@ -9,7 +9,9 @@ functions: environment: Variables: EXEC_TYPE: batch - + input: + - storage_provider: s3 + path: scar-architrave/input output: - storage_provider: s3 path: scar-architrave/output From b5432bae390d6587303105aafc524b764b41b830 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Sep 2020 14:04:54 +0200 Subject: [PATCH 10/84] mod to run on aws batch --- examples/architrave/Dockerfile | 2 +- examples/architrave/run.sh | 2 +- examples/architrave/scar-architrave-batch.yaml | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 82511eab..dba335ac 100755 --- 
a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -30,7 +30,7 @@ ENV PRIVATE_PASSWD= ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ + && apt-get install -y $BUILD_PACKAGES wget p7zip-full xz-utils \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support \ diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index a905c929..4bf7f892 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -7,7 +7,7 @@ if [ "${EXEC_TYPE,,}" = 'lambda' ]; then elif [ "${EXEC_TYPE,,}" = 'batch' ]; then # The following comment line will be replaced with the necessary env vars: -#$ENV_VARS$ +#=ENV_VARS= export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/scar-architrave-batch.yaml index 2bac2e5f..0daaf9be 100755 --- a/examples/architrave/scar-architrave-batch.yaml +++ b/examples/architrave/scar-architrave-batch.yaml @@ -2,6 +2,7 @@ functions: aws: - lambda: name: scar-architrave + log_level: DEBUG init_script: run_batch.sh execution_mode: batch container: From d92fbdc9032ec6a45de048d4b43dbac1fadefa20 Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 8 Sep 2020 11:48:59 +0200 Subject: [PATCH 11/84] install xz manually --- examples/architrave/Dockerfile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index dba335ac..af55918c 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -30,10 +30,17 @@ ENV PRIVATE_PASSWD= ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES wget p7zip-full xz-utils \ + && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ + && wget -P /tmp https://tukaani.org/xz/xz-5.2.5.tar.gz \ + && tar -zxf /tmp/xz-5.2.5.tar.gz -C /tmp \ + && cd /tmp/xz-5.2.5 \ + && ./configure --enable-shared --disable-doc \ + && make \ + && make install \ + && ldconfig \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ - && ./configure --disable-pty-support \ + && ./configure --disable-pty-support --disable-doc \ && make -j8 \ && make install \ && wget -q -P /tmp -i /opt/debs.lst \ From 2643ea5700de87cf67b8be24c2a89f94d97b23d0 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 10 Sep 2020 13:09:41 +0200 Subject: [PATCH 12/84] upd run to include messages --- examples/architrave/Dockerfile | 9 +-------- examples/architrave/run.sh | 10 ++++++---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index af55918c..b4372154 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -5,7 +5,7 @@ ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' -ENV VERSION=1.7 +ENV VERSION=1.8 ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or 
batch ENV EXEC_TYPE=lambda @@ -31,13 +31,6 @@ ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_AR RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ - && wget -P /tmp https://tukaani.org/xz/xz-5.2.5.tar.gz \ - && tar -zxf /tmp/xz-5.2.5.tar.gz -C /tmp \ - && cd /tmp/xz-5.2.5 \ - && ./configure --enable-shared --disable-doc \ - && make \ - && make install \ - && ldconfig \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support --disable-doc \ diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 4bf7f892..46563ea6 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -14,7 +14,8 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' - wget -q -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + wget -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + echo "Download awscli complete" 7z x -aoa -o/tmp/ /tmp/awscli-exe-linux-x86_64.zip chmod +x /tmp/aws/install /tmp/aws/install @@ -30,19 +31,19 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config #aws s3 cp $S3_INPUT/common $SCRATCH_DIR - ## Install batch only dependencies from S3 + echo "Install batch only dependencies from S3" mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} aws s3 cp ${S3_BUCKET}/${S3_BATCH_DEPS_REL_PATH} /tmp tar -zxf /tmp/deps.tar.gz -C /tmp dpkg -i /tmp/*.deb - ## Add the private data from S3 + echo "Add private data from S3" #rm -rf /tmp/* aws s3 cp ${S3_BUCKET}/${S3_BATCH_PRIVATE_REL_PATH} /tmp 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z - # COnfigure ssh + echo "Configure ssh" sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd echo "export VISIBLE=now" >> /etc/profile echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers @@ -62,6 +63,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then chmod +x ${APP_BIN} + echo "Running app" /opt/mpi-run.sh else echo "ERROR: unknown execution type '${EXEC_TYPE}'" From e8ea46be1424d9e9fc1722acb830d49837a2bc76 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 11:12:14 +0200 Subject: [PATCH 13/84] add mnt S3 batch --- examples/architrave/Dockerfile | 1 + examples/architrave/run.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index b4372154..0a214d5e 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -26,6 +26,7 @@ ENV S3_BUCKET="s3://scar-architrave" ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= +ENV S3_BATCH_MNT=/mnt/batch ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 46563ea6..ec73bbd1 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -12,6 +12,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then 
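The AWS_BATCH_JOB_* variables used throughout these run.sh hunks are injected by AWS Batch into every container of a multi-node parallel job. A small sketch of how a script can branch on them; the NODE_TYPE name is only illustrative.

```
#!/bin/bash
# Sketch: decide the node role inside an AWS Batch multi-node parallel job.
# AWS Batch sets these variables in every container of the job.
echo "Node ${AWS_BATCH_JOB_NODE_INDEX} of ${AWS_BATCH_JOB_NUM_NODES}"
echo "Main node index ${AWS_BATCH_JOB_MAIN_NODE_INDEX}, IP ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}"

if [ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]; then
  NODE_TYPE=main    # collects worker IPs and launches mpirun
else
  NODE_TYPE=child   # reports its IP to the main node and waits for work
fi
echo "Acting as ${NODE_TYPE}"
```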
export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" + ls -al $S3_BATCH_MNT #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' wget -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip From 7d81278b1b68cbc39de3e878fce6de4e814aa17c Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 12:59:56 +0200 Subject: [PATCH 14/84] add build hooks with network host --- examples/architrave/hooks/build | 90 +++++++++++++++++++++++++++++++++ examples/architrave/run.sh | 1 - 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 examples/architrave/hooks/build diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build new file mode 100644 index 00000000..faee07bf --- /dev/null +++ b/examples/architrave/hooks/build @@ -0,0 +1,90 @@ +#!/bin/bash +# hooks/build +# https://docs.docker.com/docker-cloud/builds/advanced/ + +## requires to be an architecture+variant as defined by the manifest +DEFAULT_ARCH="amd64" + +## $IMAGE_NAME var is injected into the build so the tag is correct. +echo "[***] Build hook starting..." + +# $(echo "index.docker.io/user/respository" | cut -d '/' -f 3) = "repository" +APPLICATION=$(echo "${DOCKER_REPO}" | cut -d '/' -f 3) + +echo "[---] DOCKERFILE_PATH: ${DOCKERFILE_PATH}" +echo "[---] DOCKER_REPO: ${DOCKER_REPO}" +echo "[---] IMAGE_NAME: ${IMAGE_NAME}" +echo "[---] APPLICATION: ${APPLICATION}" + +# $(echo "index.docker.io/user/repository" | cut -d '/' -f 2-3) = "user/repository" +# otherwise, you will need to set ENVIRONMENT VARIABLES for your build. +if [ -z $GITHUB_USERREPO ]; then + GITHUB_USERREPO=$(echo "${DOCKER_REPO}" | cut -d '/' -f 2-3) +fi + +# Set description from github +DESCRIPTION=$(curl -s https://api.github.com/repos/${GITHUB_USERREPO} \ + | grep '"description".*' \ + | head -n 1 \ + | cut -d '"' -f 4) + +echo "[---] GITHUB_USERREPO: ${GITHUB_USERREPO}" +echo "[---] DESCRIPTION: ${DESCRIPTION}" + +## Build all variant images. +for FILE in ${DOCKERFILE_PATH}.* +do + TARGET_ARCH=$(echo "${FILE}" | cut -d '.' -f 2) + + ## FUDGE Factor because Docker Hub does not respect "32" in os/arch model + case "$TARGET_ARCH" in + *arm32v5) + BUILD_ARCH="armv5" + ;; + *arm32v6) + BUILD_ARCH="armv6" + ;; + *arm32v7) + BUILD_ARCH="armv7" + ;; + *) + BUILD_ARCH="${TARGET_ARCH}" + ;; + esac + + # Not exactly sure this needs to run EVERY time, but for good measure. + docker run --rm --privileged multiarch/qemu-user-static:register --reset + + docker build \ + --network=host \ + --file "${DOCKERFILE_PATH}.${TARGET_ARCH}" \ + --build-arg APPLICATION=${APPLICATION} \ + --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + --build-arg REVISION=$(git rev-parse --short HEAD) \ + --build-arg PACKAGE="${GITHUB_USERREPO}" \ + --build-arg DESCRIPTION="${DESCRIPTION}" \ + --build-arg VERSION=$(git describe --tags --always) \ + -t ${IMAGE_NAME}_${BUILD_ARCH} \ + . +done + + +## Build the prime image at the end. 
+docker build \ + --file "${DOCKERFILE_PATH}" \ + --build-arg APPLICATION=${APPLICATION} \ + --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + --build-arg REVISION=$(git rev-parse --short HEAD) \ + --build-arg PACKAGE="${GITHUB_USERREPO}" \ + --build-arg DESCRIPTION="${DESCRIPTION}" \ + --build-arg VERSION=$(git describe --tags --always) \ + -t ${IMAGE_NAME}_${DEFAULT_ARCH} \ + . + +## Push the default arch image so manifest-tool can find it +docker push ${IMAGE_NAME}_${DEFAULT_ARCH} + +## Tag the default image so dockerhub can push it. +docker tag ${IMAGE_NAME}_${DEFAULT_ARCH} ${IMAGE_NAME} + +echo "[***] ...build hook complete." diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index ec73bbd1..46563ea6 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -12,7 +12,6 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" - ls -al $S3_BATCH_MNT #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' wget -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip From 2d614cf53b9a6f39b78ce63ad297db70d95c7ab1 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 13:02:20 +0200 Subject: [PATCH 15/84] add build hooks with network host --- examples/architrave/hooks/build | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index faee07bf..f803ba33 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -71,6 +71,7 @@ done ## Build the prime image at the end. docker build \ + --network=host \ --file "${DOCKERFILE_PATH}" \ --build-arg APPLICATION=${APPLICATION} \ --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ From 71fb17ac3d6359dd6d1e1ed06905cceb86933ab4 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 13:26:17 +0200 Subject: [PATCH 16/84] add build hooks with network host --- examples/architrave/hooks/build | 38 +-------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index f803ba33..d16ed492 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -31,42 +31,6 @@ DESCRIPTION=$(curl -s https://api.github.com/repos/${GITHUB_USERREPO} \ echo "[---] GITHUB_USERREPO: ${GITHUB_USERREPO}" echo "[---] DESCRIPTION: ${DESCRIPTION}" -## Build all variant images. -for FILE in ${DOCKERFILE_PATH}.* -do - TARGET_ARCH=$(echo "${FILE}" | cut -d '.' -f 2) - - ## FUDGE Factor because Docker Hub does not respect "32" in os/arch model - case "$TARGET_ARCH" in - *arm32v5) - BUILD_ARCH="armv5" - ;; - *arm32v6) - BUILD_ARCH="armv6" - ;; - *arm32v7) - BUILD_ARCH="armv7" - ;; - *) - BUILD_ARCH="${TARGET_ARCH}" - ;; - esac - - # Not exactly sure this needs to run EVERY time, but for good measure. 
- docker run --rm --privileged multiarch/qemu-user-static:register --reset - - docker build \ - --network=host \ - --file "${DOCKERFILE_PATH}.${TARGET_ARCH}" \ - --build-arg APPLICATION=${APPLICATION} \ - --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - --build-arg REVISION=$(git rev-parse --short HEAD) \ - --build-arg PACKAGE="${GITHUB_USERREPO}" \ - --build-arg DESCRIPTION="${DESCRIPTION}" \ - --build-arg VERSION=$(git describe --tags --always) \ - -t ${IMAGE_NAME}_${BUILD_ARCH} \ - . -done ## Build the prime image at the end. @@ -79,7 +43,7 @@ docker build \ --build-arg PACKAGE="${GITHUB_USERREPO}" \ --build-arg DESCRIPTION="${DESCRIPTION}" \ --build-arg VERSION=$(git describe --tags --always) \ - -t ${IMAGE_NAME}_${DEFAULT_ARCH} \ + -t ${IMAGE_NAME} \ . ## Push the default arch image so manifest-tool can find it From e3b4c0a2fb0df59bd79e37d9d21cf3d81dc5ddbf Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 13:32:26 +0200 Subject: [PATCH 17/84] add build hooks with network host --- examples/architrave/hooks/build | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index d16ed492..ea56c290 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -2,9 +2,6 @@ # hooks/build # https://docs.docker.com/docker-cloud/builds/advanced/ -## requires to be an architecture+variant as defined by the manifest -DEFAULT_ARCH="amd64" - ## $IMAGE_NAME var is injected into the build so the tag is correct. echo "[***] Build hook starting..." @@ -47,9 +44,9 @@ docker build \ . ## Push the default arch image so manifest-tool can find it -docker push ${IMAGE_NAME}_${DEFAULT_ARCH} +docker push ${IMAGE_NAME} ## Tag the default image so dockerhub can push it. -docker tag ${IMAGE_NAME}_${DEFAULT_ARCH} ${IMAGE_NAME} +docker tag ${IMAGE_NAME} ${IMAGE_NAME} echo "[***] ...build hook complete." 
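The hooks/build script in this series depends on variables that Docker Hub's automated-build environment normally injects. A hedged sketch of driving the same hook by hand, with placeholder repository names, could look like this; the final docker push step still needs a prior docker login.

```
#!/bin/bash
# Sketch: exercise the Docker Hub build hook locally (placeholder names).
# DOCKERFILE_PATH, DOCKER_REPO and IMAGE_NAME are normally set by Docker Hub.
export DOCKERFILE_PATH=Dockerfile
export DOCKER_REPO=index.docker.io/youruser/scar-architrave   # placeholder
export IMAGE_NAME=${DOCKER_REPO}:latest
export GITHUB_USERREPO=youruser/scar                          # skips the cut-based guess

cd examples/architrave
bash hooks/build   # builds with --network=host, tags ${IMAGE_NAME}, then pushes it
```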
From 6879441b333ab78ce6787d4a74850239c1f70790 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 11 Sep 2020 13:46:42 +0200 Subject: [PATCH 18/84] add build hooks with network host --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 0a214d5e..d832e174 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -28,7 +28,7 @@ ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= ENV S3_BATCH_MNT=/mnt/batch -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ From 09c0cff1412e673d8f75f9fa5de2f097271287df Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:24:53 +0200 Subject: [PATCH 19/84] no rel path for files to be included in the container --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index d832e174..458ce357 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave +ARG ADD_BASE_DIR_ARCHITRAVE=./ ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' From 7ab0d5d75d16d2e004f114626bff98f09ac5bc7f Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:32:55 +0200 Subject: [PATCH 20/84] no rel path for files to be included in the container --- examples/architrave/Dockerfile | 2 +- examples/architrave/hooks/build | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 458ce357..d832e174 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=./ +ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index ea56c290..a44ab812 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -34,12 +34,6 @@ echo "[---] DESCRIPTION: ${DESCRIPTION}" docker build \ --network=host \ --file "${DOCKERFILE_PATH}" \ - --build-arg APPLICATION=${APPLICATION} \ - --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - --build-arg REVISION=$(git rev-parse --short HEAD) \ - --build-arg PACKAGE="${GITHUB_USERREPO}" \ - --build-arg DESCRIPTION="${DESCRIPTION}" \ - --build-arg VERSION=$(git describe --tags --always) \ -t ${IMAGE_NAME} \ . From a765449d17642c690859fda478853de1e38acf80 Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:37:01 +0200 Subject: [PATCH 21/84] no rel path for files to be included in the container --- examples/architrave/hooks/build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index a44ab812..6e18d0f1 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -33,7 +33,7 @@ echo "[---] DESCRIPTION: ${DESCRIPTION}" ## Build the prime image at the end. 
docker build \ --network=host \ - --file "${DOCKERFILE_PATH}" \ + --file Dockerfile \ -t ${IMAGE_NAME} \ . From fe8ea24172f82be869cb2a87fbbfd551f6d05fde Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:52:18 +0200 Subject: [PATCH 22/84] no rel path for files to be included in the container --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index d832e174..4a6856b4 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave +ARG ADD_BASE_DIR_ARCHITRAVE=. ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' From 5f7369e153b1dda03047c72ca95396abde2e405d Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:52:57 +0200 Subject: [PATCH 23/84] no rel path for files to be included in the container --- examples/architrave/hooks/build | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build index 6e18d0f1..02f60744 100644 --- a/examples/architrave/hooks/build +++ b/examples/architrave/hooks/build @@ -34,6 +34,12 @@ echo "[---] DESCRIPTION: ${DESCRIPTION}" docker build \ --network=host \ --file Dockerfile \ + --build-arg APPLICATION=${APPLICATION} \ + --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + --build-arg REVISION=$(git rev-parse --short HEAD) \ + --build-arg PACKAGE="${GITHUB_USERREPO}" \ + --build-arg DESCRIPTION="${DESCRIPTION}" \ + --build-arg VERSION=$(git describe --tags --always) \ -t ${IMAGE_NAME} \ . From d3e19709695604f6509c21c0e01065f6acd29589 Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 15 Sep 2020 10:56:57 +0200 Subject: [PATCH 24/84] no rel path for files to be included in the container --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 4a6856b4..c90b0409 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -28,7 +28,7 @@ ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= ENV S3_BATCH_MNT=/mnt/batch -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ From d38d598ddf53f74e9f59f95cb094256667792975 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Sep 2020 09:25:14 +0200 Subject: [PATCH 25/84] rm deps for downloading during batch exec --- examples/architrave/Dockerfile | 24 ++++++++++++------------ examples/architrave/run.sh | 29 ++++------------------------- 2 files changed, 16 insertions(+), 37 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index c90b0409..4f4e0812 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -2,10 +2,8 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR_ARCHITRAVE=. 
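The repeated changes to ADD_BASE_DIR_ARCHITRAVE in these patches come down to the Docker build context: ADD sources are resolved relative to the context directory, not to the Dockerfile location. A sketch of the two equivalent invocations, with a placeholder private-data directory that must live inside the chosen context:

```
#!/bin/bash
# Sketch: same Dockerfile, two build contexts.

# 1) Context = repository root, so ADD paths carry the examples/architrave prefix.
docker build \
  --file examples/architrave/Dockerfile \
  --build-arg ADD_BASE_DIR_ARCHITRAVE=examples/architrave \
  --build-arg ADD_PRIVATE_BASE_DIR=private \
  -t scar-architrave .

# 2) Context = the example directory itself (the Docker Hub case), plain relative paths.
cd examples/architrave
docker build \
  --file Dockerfile \
  --build-arg ADD_BASE_DIR_ARCHITRAVE=. \
  --build-arg ADD_PRIVATE_BASE_DIR=private \
  -t scar-architrave .
```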
ARG ADD_PRIVATE_BASE_DIR -ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' +ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl wget ' - -ENV VERSION=1.8 ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch ENV EXEC_TYPE=lambda @@ -18,20 +16,20 @@ ENV APP_PARAMS="" ENV MPI_PARAMS='-np 1 --debug-daemons' ENV JOB_DIR=/root/exec/ ENV SCRATCH_DIR=/root/scratch -ENV AWS_ACCESS_KEY='' -ENV AWS_SECRET_ACCESS_KEY='' -ENV AWS_REGION='us-east-1' -ENV AWS_OUTPUT='json' -ENV S3_BUCKET="s3://scar-architrave" -ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz -ENV S3_BATCH_PRIVATE_REL_PATH= +#ENV AWS_ACCESS_KEY='' +#ENV AWS_SECRET_ACCESS_KEY='' +#ENV AWS_REGION='us-east-1' +#ENV AWS_OUTPUT='json' +#ENV S3_BUCKET="s3://scar-architrave" +#ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz +#ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= ENV S3_BATCH_MNT=/mnt/batch ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES wget p7zip-full \ + && apt-get install -y $BUILD_PACKAGES p7zip-full \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support --disable-doc \ @@ -44,6 +42,8 @@ RUN apt-get update \ && ulimit -n 1024 \ && rm -rf /tmp/* /var/lib/apt/lists/* \ && chmod 755 /opt/mpi-run.sh \ - && chmod 755 /opt/run.sh + && chmod 755 /opt/run.sh \ + && echo $(date) > /build_date \ + && echo "Build date: $(cat /build_date)" CMD /opt/run.sh diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 46563ea6..db72989f 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -1,5 +1,7 @@ #!/bin/bash +echo "Build date: $(cat /build_date)" + if [ "${EXEC_TYPE,,}" = 'lambda' ]; then export OMPI_MCA_plm_rsh_agent=/bin/false mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} @@ -13,35 +15,12 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" - #wget -q -P /tmp --no-check-certificate --no-proxy 'http://scar-architrave.s3.amazonaws.com/awscli-exe-linux-x86_64.zip' - wget -P /tmp https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - echo "Download awscli complete" - 7z x -aoa -o/tmp/ /tmp/awscli-exe-linux-x86_64.zip - chmod +x /tmp/aws/install - /tmp/aws/install - echo "Version of dist: ${VERSION}" - mkdir ~/.aws/ - ## S3 OPTIMIZATION - aws configure set default.s3.max_concurrent_requests 30 - aws configure set default.s3.max_queue_size 10000 - aws configure set default.s3.multipart_threshold 64MB - aws configure set default.s3.multipart_chunksize 16MB - aws configure set default.s3.max_bandwidth 4096MB/s - aws configure set default.s3.addressing_style path - printf '%s\n' '[default]' "aws_access_key_id=${AWS_ACCESS_KEY}" "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials - printf '%s\n' '[default]' "region=${AWS_REGION}" "output=${AWS_OUTPUT}" > ~/.aws/config - #aws s3 cp $S3_INPUT/common $SCRATCH_DIR - echo "Install batch only dependencies from S3" mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} - aws s3 cp ${S3_BUCKET}/${S3_BATCH_DEPS_REL_PATH} /tmp - tar -zxf /tmp/deps.tar.gz -C /tmp - dpkg -i /tmp/*.deb + dpkg -i 
${S3_BATCH_MNT}/deps/*.deb echo "Add private data from S3" - #rm -rf /tmp/* - aws s3 cp ${S3_BUCKET}/${S3_BATCH_PRIVATE_REL_PATH} /tmp - 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z + 7z x -aoa -p${PRIVATE_PASSWD} -o/opt ${S3_BATCH_MNT}/*.7z echo "Configure ssh" sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd From 5aa54b8b87bccc25cb249312b8b7c6fa150f7707 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Sep 2020 09:28:36 +0200 Subject: [PATCH 26/84] rm hooks --- examples/architrave/hooks/build | 52 --------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 examples/architrave/hooks/build diff --git a/examples/architrave/hooks/build b/examples/architrave/hooks/build deleted file mode 100644 index 02f60744..00000000 --- a/examples/architrave/hooks/build +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# hooks/build -# https://docs.docker.com/docker-cloud/builds/advanced/ - -## $IMAGE_NAME var is injected into the build so the tag is correct. -echo "[***] Build hook starting..." - -# $(echo "index.docker.io/user/respository" | cut -d '/' -f 3) = "repository" -APPLICATION=$(echo "${DOCKER_REPO}" | cut -d '/' -f 3) - -echo "[---] DOCKERFILE_PATH: ${DOCKERFILE_PATH}" -echo "[---] DOCKER_REPO: ${DOCKER_REPO}" -echo "[---] IMAGE_NAME: ${IMAGE_NAME}" -echo "[---] APPLICATION: ${APPLICATION}" - -# $(echo "index.docker.io/user/repository" | cut -d '/' -f 2-3) = "user/repository" -# otherwise, you will need to set ENVIRONMENT VARIABLES for your build. -if [ -z $GITHUB_USERREPO ]; then - GITHUB_USERREPO=$(echo "${DOCKER_REPO}" | cut -d '/' -f 2-3) -fi - -# Set description from github -DESCRIPTION=$(curl -s https://api.github.com/repos/${GITHUB_USERREPO} \ - | grep '"description".*' \ - | head -n 1 \ - | cut -d '"' -f 4) - -echo "[---] GITHUB_USERREPO: ${GITHUB_USERREPO}" -echo "[---] DESCRIPTION: ${DESCRIPTION}" - - - -## Build the prime image at the end. -docker build \ - --network=host \ - --file Dockerfile \ - --build-arg APPLICATION=${APPLICATION} \ - --build-arg BUILD_RFC3339=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - --build-arg REVISION=$(git rev-parse --short HEAD) \ - --build-arg PACKAGE="${GITHUB_USERREPO}" \ - --build-arg DESCRIPTION="${DESCRIPTION}" \ - --build-arg VERSION=$(git describe --tags --always) \ - -t ${IMAGE_NAME} \ - . - -## Push the default arch image so manifest-tool can find it -docker push ${IMAGE_NAME} - -## Tag the default image so dockerhub can push it. -docker tag ${IMAGE_NAME} ${IMAGE_NAME} - -echo "[***] ...build hook complete." From d26543e4f348fbd06cefb9cd00900ab0e653412b Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Sep 2020 09:48:41 +0200 Subject: [PATCH 27/84] default path for private dir --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 4f4e0812..a5546240 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=. 
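With the dependency download moved out of the container, the runtime packages are installed offline from the shared mount. A minimal sketch, assuming the bundle was produced with the apt-rdepends recipe in the README and that the Batch host exposes it under /mnt/batch:

```
#!/bin/bash
# Sketch: install pre-fetched runtime .debs with no network access.
S3_BATCH_MNT=/mnt/batch

mkdir -p ${S3_BATCH_MNT}/deps
tar -zxf ${S3_BATCH_MNT}/deps.tar.gz -C ${S3_BATCH_MNT}/deps
# The bundle already contains every transitive dependency, so a single
# dpkg pass is enough and no `apt-get -f install` (hence no network) is needed.
dpkg -i ${S3_BATCH_MNT}/deps/*.deb
```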
+ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl wget ' From 13cc945c7213134f6661ca8bf3c8752866156435 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Sep 2020 13:54:06 +0200 Subject: [PATCH 28/84] upd uploading result --- examples/architrave/Dockerfile | 1 + examples/architrave/README.md | 2 +- examples/architrave/mpi-run.sh | 4 ++-- examples/architrave/run.sh | 4 ++++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index a5546240..475bac6d 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -25,6 +25,7 @@ ENV SCRATCH_DIR=/root/scratch #ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= ENV S3_BATCH_MNT=/mnt/batch +ENV S3_OUTPUT="s3://scar-architrave/output" ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ diff --git a/examples/architrave/README.md b/examples/architrave/README.md index 0821d948..69ddb231 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -70,7 +70,7 @@ In the running container: ``` # determine all of the dependencies needed by the packages we want to install: apt update && apt install -y apt-rdepends && \ -apt-rdepends openssh-server openssh-client iproute2 | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ +apt-rdepends openssh-server openssh-client iproute2 inotify-tools | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ apt-get --purge autoremove -y apt-rdepends && \ # filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 1bc49ee1..aff9d3dd 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -86,8 +86,8 @@ wait_for_nodes () { sleep 2 #if [ "${NODE_TYPE}" = 'main' ]; then - env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* - aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz + # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* + # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz #fi log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index db72989f..02113c3a 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -17,6 +17,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} + mkdir ${S3_BATCH_MNT}/output dpkg -i ${S3_BATCH_MNT}/deps/*.deb echo "Add private data from S3" @@ -44,6 +45,9 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Running app" /opt/mpi-run.sh + + echo ${S3_OUTPUT} > ${S3_BATCH_MNT}/exec/docker_done + while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "test"; break; }; done else echo "ERROR: unknown execution type '${EXEC_TYPE}'" exit 1 # terminate and indicate error From aaf1926788c6ea30537739ed649f7701cbca49a4 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Sep 2020 11:02:47 +0200 Subject: [PATCH 29/84] execute command on host after container done different handling for master/slave on communication with host --- examples/architrave/mpi-run.sh | 5 +++++ examples/architrave/run.sh | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index aff9d3dd..aaf65702 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -93,6 +93,10 @@ wait_for_nodes () { log "done! goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" echo "0" > $AWS_BATCH_EXIT_CODE_FILE kill $(cat /tmp/supervisord.pid) + #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done + echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done + echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" > ${S3_BATCH_MNT}/exec/docker_done + while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done exit 0 } @@ -116,6 +120,7 @@ report_to_master () { echo "Sleeping 5 seconds and trying again" done log "done! 
goodbye" + touch ${S3_BATCH_MNT}/exec/docker_done exit 0 } diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 02113c3a..007c3b16 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -46,8 +46,6 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Running app" /opt/mpi-run.sh - echo ${S3_OUTPUT} > ${S3_BATCH_MNT}/exec/docker_done - while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "test"; break; }; done else echo "ERROR: unknown execution type '${EXEC_TYPE}'" exit 1 # terminate and indicate error From 158fe6e1a037ccfcd5b64c1604c7690ced8e603e Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Sep 2020 11:36:09 +0200 Subject: [PATCH 30/84] start ssh service in the running script --- examples/architrave/run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 007c3b16..fce83d67 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -42,6 +42,9 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then ssh-add ${SSHDIR}/id_rsa chmod +x ${APP_BIN} + service ssh status + service ssh restart + service ssh status echo "Running app" /opt/mpi-run.sh From 4ec50d41a70251b483868708b673cf2a324069f5 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Sep 2020 13:05:57 +0200 Subject: [PATCH 31/84] added cert to connect between nodes --- examples/architrave/Dockerfile | 1 - examples/architrave/run.sh | 20 ++++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 475bac6d..ca2ba252 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -7,7 +7,6 @@ ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch ENV EXEC_TYPE=lambda -ENV SSHDIR=/root/.ssh ENV EXAMPLE_FILE=/opt/examples/example ENV TMP_OUTPUT_DIR=/tmp diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index fce83d67..63782281 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -1,6 +1,7 @@ #!/bin/bash echo "Build date: $(cat /build_date)" +echo "Runing as: ${USER} home @ ${HOME}" if [ "${EXEC_TYPE,,}" = 'lambda' ]; then export OMPI_MCA_plm_rsh_agent=/bin/false @@ -10,7 +11,6 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then # The following comment line will be replaced with the necessary env vars: #=ENV_VARS= - export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" @@ -27,19 +27,19 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd echo "export VISIBLE=now" >> /etc/profile echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers - mkdir -p ${SSHDIR} - touch ${SSHDIR}/sshd_config - ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' - cp ${SSHDIR}/ssh_host_rsa_key.pub ${SSHDIR}/authorized_keys - cp ${SSHDIR}/ssh_host_rsa_key ${SSHDIR}/id_rsa - echo " IdentityFile ${SSHDIR}/id_rsa" >> /etc/ssh/ssh_config + mkdir -p ${HOME}/.ssh + touch ${HOME}/.ssh/sshd_config + #ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' + cat /opt/ssh_host_rsa_key.pub > ${HOME}/.ssh/authorized_keys + cp /opt/ssh_host_rsa_key ${HOME}/.ssh/id_rsa + echo " IdentityFile ${HOME}/.ssh/id_rsa" >> 
/etc/ssh/ssh_config echo "Host *" >> /etc/ssh/ssh_config echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config - chmod -R 600 ${SSHDIR}/* - chown -R ${USER}:${USER} ${SSHDIR}/ + chmod -R 600 ${HOME}/.ssh/* + chown -R ${USER}:${USER} ${HOME}/.ssh/ # check if ssh agent is running or not, if not, run eval `ssh-agent -s` - ssh-add ${SSHDIR}/id_rsa + ssh-add ${HOME}/id_rsa chmod +x ${APP_BIN} service ssh status From bda8ba38b5952909116401be411b58cdbf29d1f6 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Sep 2020 13:30:18 +0200 Subject: [PATCH 32/84] push LDs to slaves and allow root exec --- examples/architrave/mpi-run.sh | 2 +- examples/architrave/run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index aaf65702..8b672c9a 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -81,7 +81,7 @@ wait_for_nodes () { cd $SCRATCH_DIR mkdir output - mpirun --mca btl_tcp_if_include eth0 --debug-daemons --machinefile ${HOST_FILE_PATH}-deduped \ + mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --allow-run-as-root --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_PARAMS} sleep 2 diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 63782281..70816296 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -39,7 +39,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then chown -R ${USER}:${USER} ${HOME}/.ssh/ # check if ssh agent is running or not, if not, run eval `ssh-agent -s` - ssh-add ${HOME}/id_rsa + ssh-add ${HOME}/.ssh/id_rsa chmod +x ${APP_BIN} service ssh status From c19c44b981f559e6b4ee7a96001d326853352be1 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Sep 2020 13:51:54 +0200 Subject: [PATCH 33/84] rm allow as root --- examples/architrave/mpi-run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 8b672c9a..4368e83f 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -81,7 +81,8 @@ wait_for_nodes () { cd $SCRATCH_DIR mkdir output - mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --allow-run-as-root --machinefile ${HOST_FILE_PATH}-deduped \ + # --allow-run-as-root + mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_PARAMS} sleep 2 From 4a3ed59497724ab6523836e379215b146a9d2ba8 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Sep 2020 12:08:55 +0200 Subject: [PATCH 34/84] add signalling for master execution termination --- examples/architrave/mpi-run.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 4368e83f..c920cca6 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -81,7 +81,7 @@ wait_for_nodes () { cd $SCRATCH_DIR mkdir output - # --allow-run-as-root + # --allow-run-as-root mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_PARAMS} sleep 2 @@ -96,7 +96,9 @@ wait_for_nodes () { kill $(cat /tmp/supervisord.pid) #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done - echo "/usr/local/bin/aws s3 cp 
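The switch to a key pair baked into the image means every node in the Batch cluster can accept every other node's SSH connection without prompts. A sketch of the idea, assuming /opt/ssh_host_rsa_key and /opt/ssh_host_rsa_key.pub were added at build time as above:

```
#!/bin/bash
# Sketch: password-less SSH between MPI nodes sharing one pre-generated key.
# The key pair can be created once with: ssh-keygen -t rsa -f ssh_host_rsa_key -N ''
mkdir -p ${HOME}/.ssh
cat /opt/ssh_host_rsa_key.pub > ${HOME}/.ssh/authorized_keys   # accept the shared key
cp /opt/ssh_host_rsa_key ${HOME}/.ssh/id_rsa                   # offer the shared key

# Node IPs are ephemeral inside the cluster, so skip host-key verification.
{
  echo "Host *"
  echo "  StrictHostKeyChecking no"
  echo "  IdentityFile ${HOME}/.ssh/id_rsa"
} >> /etc/ssh/ssh_config

chmod -R 600 ${HOME}/.ssh/*
eval "$(ssh-agent -s)" && ssh-add ${HOME}/.ssh/id_rsa
service ssh restart
```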
/mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" > ${S3_BATCH_MNT}/exec/docker_done + echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done + log "Signaling children to exit" + cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch /mnt/batch/mpi/master_done" while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done exit 0 } @@ -120,8 +122,10 @@ report_to_master () { do echo "Sleeping 5 seconds and trying again" done - log "done! goodbye" touch ${S3_BATCH_MNT}/exec/docker_done + + echo "Wait for master to finish" + while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done exit 0 } From 08b9e1a56ba9870f842e1ac9a3adf34d2ad5b7dd Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 23 Sep 2020 13:01:14 +0200 Subject: [PATCH 35/84] timing mpirun --- examples/architrave/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 70816296..9b5453e4 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -5,7 +5,7 @@ echo "Runing as: ${USER} home @ ${HOME}" if [ "${EXEC_TYPE,,}" = 'lambda' ]; then export OMPI_MCA_plm_rsh_agent=/bin/false - mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS} + { time mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log elif [ "${EXEC_TYPE,,}" = 'batch' ]; then From 915471ae9323c0344800116d4011504679cbaf76 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 24 Sep 2020 11:09:40 +0200 Subject: [PATCH 36/84] control of execution moved to run --- examples/architrave/run.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 9b5453e4..15a7179b 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -15,6 +15,13 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" + mkdir -p /mnt/batch/exec + rm -rf /mnt/batch/exec/* + mkdir -p /mnt/batch/output + rm -rf /mnt/batch/output/* + mkdir -p /mnt/batch/mpi + rm -rf /mnt/batch/mpi/* + mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} mkdir ${S3_BATCH_MNT}/output From 9b002d69bd62f481ec5a563feb050457cdd39789 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 24 Sep 2020 11:53:50 +0200 Subject: [PATCH 37/84] timing mpi execution in batch mode --- examples/architrave/mpi-run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index c920cca6..2ec6bef5 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -82,8 +82,8 @@ wait_for_nodes () { cd $SCRATCH_DIR mkdir output # --allow-run-as-root - mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_PARAMS} + { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ + ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log sleep 2 #if [ "${NODE_TYPE}" = 'main' ]; then From 
6d23b83cbc152b5075dc493475b042a97349748f Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 24 Sep 2020 13:19:41 +0200 Subject: [PATCH 38/84] do not remove inodewatch dirs --- examples/architrave/run.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 15a7179b..706ec8d0 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -15,13 +15,10 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" - mkdir -p /mnt/batch/exec rm -rf /mnt/batch/exec/* - mkdir -p /mnt/batch/output rm -rf /mnt/batch/output/* - mkdir -p /mnt/batch/mpi rm -rf /mnt/batch/mpi/* - + mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} mkdir ${S3_BATCH_MNT}/output From 6fffd86ce31642d18a49fbbd09e33fbe09577c58 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 9 Nov 2020 12:34:14 +0100 Subject: [PATCH 39/84] move deps download to container --- examples/architrave/run.sh | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 706ec8d0..1ba5da47 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -14,10 +14,35 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" + + cd /tmp + wget -nc https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + unzip awscli-exe-linux-x86_64.zip + chmod +x aws/install + ./aws/install + + /usr/local/bin/aws configure set default.s3.max_concurrent_requests 30 + /usr/local/bin/aws configure set default.s3.max_queue_size 10000 + /usr/local/bin/aws configure set default.s3.multipart_threshold 64MB + /usr/local/bin/aws configure set default.s3.multipart_chunksize 16MB + /usr/local/bin/aws configure set default.s3.max_bandwidth 4096MB/s + /usr/local/bin/aws configure set default.s3.addressing_style path + + + mkdir -p ${S3_BATCH_MNT}/deps + mkdir -p ${S3_BATCH_MNT}/exec + rm -rf ${S3_BATCH_MNT}/exec/* + mkdir -p ${S3_BATCH_MNT}/output + rm -rf ${S3_BATCH_MNT}/output/* + mkdir -p ${S3_BATCH_MNT}/mpi + rm -rf ${S3_BATCH_MNT}/mpi/* + /usr/local/bin/aws s3 cp s3://scar-architrave/batch/private.7z ${S3_BATCH_MNT} + /usr/local/bin/aws s3 cp s3://scar-architrave/batch/deps.tar.gz ${S3_BATCH_MNT} + tar -zxf ${S3_BATCH_MNT}/deps.tar.gz -C ${S3_BATCH_MNT}/deps - rm -rf /mnt/batch/exec/* - rm -rf /mnt/batch/output/* - rm -rf /mnt/batch/mpi/* + #rm -rf /mnt/batch/exec/* + #rm -rf /mnt/batch/output/* + #rm -rf /mnt/batch/mpi/* mkdir ${SCRATCH_DIR} mkdir ${JOB_DIR} From 74912f79d31e16b172ce33b327ac5613814b80a6 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 11 Nov 2020 12:23:29 +0100 Subject: [PATCH 40/84] add wget to img --- examples/architrave/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index ca2ba252..0a67fea3 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -2,7 +2,7 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave ARG ADD_PRIVATE_BASE_DIR -ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake 
build-essential gfortran curl wget ' +ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch @@ -29,7 +29,7 @@ ENV S3_OUTPUT="s3://scar-architrave/output" ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES p7zip-full \ + && apt-get install -y $BUILD_PACKAGES p7zip-full wget \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support --disable-doc \ From 6672245a07edefa56c53bf8ea7ce065052a71cf0 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 12 Nov 2020 12:19:45 +0100 Subject: [PATCH 41/84] add unzip dep --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index 0a67fea3..ae995a2e 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -29,7 +29,7 @@ ENV S3_OUTPUT="s3://scar-architrave/output" ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES p7zip-full wget \ + && apt-get install -y $BUILD_PACKAGES p7zip-full wget unzip \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support --disable-doc \ From bf9021d50db6b094835b4e24f68f571372df12c9 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 19 Nov 2020 09:04:03 +0100 Subject: [PATCH 42/84] rm config_path from function config rm full path of init_script from function config --- scar/providers/aws/functioncode.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scar/providers/aws/functioncode.py b/scar/providers/aws/functioncode.py index 50d75f57..2979eca7 100644 --- a/scar/providers/aws/functioncode.py +++ b/scar/providers/aws/functioncode.py @@ -16,16 +16,25 @@ from typing import Dict from zipfile import ZipFile +import ntpath from scar.providers.aws.udocker import Udocker from scar.providers.aws.validators import AWSValidator from scar.exceptions import exception import scar.logger as logger from scar.utils import FileUtils +def clean_function_config(function_cfg: Dict): + # Rm full path from the init_script + if function_cfg.get('init_script', True): + function_cfg['init_script'] = ntpath.basename(function_cfg['init_script']) + # Rm the config path + function_cfg.pop('config_path', None) + return function_cfg def create_function_config(resources_info): function_cfg = {'storage_providers': FileUtils.load_tmp_config_file().get('storage_providers', {})} function_cfg.update(resources_info.get('lambda')) + clean_function_config(function_cfg) return function_cfg From 0063fe08e81c38366034c8de9d816b3f11498d6e Mon Sep 17 00:00:00 2001 From: asalic Date: Tue, 24 Nov 2020 13:03:55 +0100 Subject: [PATCH 43/84] disable waiting --- examples/architrave/mpi-run.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 2ec6bef5..bf38359d 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -95,11 +95,11 @@ 
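Before patch 43 comments it out, the master and the children coordinate through marker files created on the shared mount and watched with inotifywait (inotify-tools is added to the dependency list earlier in the series). A reduced sketch of that handshake; the directory layout follows the scripts above and the function names are illustrative:

```
#!/bin/bash
# Sketch: file-based signalling over the shared /mnt/batch mount.
# run.sh clears ${S3_BATCH_MNT}/mpi beforehand, so the marker cannot pre-date the watch.
S3_BATCH_MNT=/mnt/batch

signal_children() {      # master: tell the children the MPI run is over
  touch ${S3_BATCH_MNT}/mpi/master_done
}

wait_for_master() {      # child: block until the master drops its marker file
  while inotifywait ${S3_BATCH_MNT}/mpi -e create; do
    echo "Master finished, exiting"
    break
  done
}
```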
wait_for_nodes () { echo "0" > $AWS_BATCH_EXIT_CODE_FILE kill $(cat /tmp/supervisord.pid) #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done - echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done - echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done - log "Signaling children to exit" - cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch /mnt/batch/mpi/master_done" - while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done + #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done + #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done + #log "Signaling children to exit" + #cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch /mnt/batch/mpi/master_done" + #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done exit 0 } @@ -122,10 +122,10 @@ report_to_master () { do echo "Sleeping 5 seconds and trying again" done - touch ${S3_BATCH_MNT}/exec/docker_done + #touch ${S3_BATCH_MNT}/exec/docker_done - echo "Wait for master to finish" - while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done + #echo "Wait for master to finish" + #while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! 
goodbye"; break; }; done exit 0 } From fcbee7dfd95aa744b565dd5a38f00206404a11d0 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 25 Nov 2020 13:35:05 +0100 Subject: [PATCH 44/84] separate params --- examples/architrave/Dockerfile | 5 +++-- examples/architrave/mpi-run.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index ae995a2e..f431ccd9 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -8,10 +8,11 @@ ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch ENV EXEC_TYPE=lambda -ENV EXAMPLE_FILE=/opt/examples/example +ENV APP_IN_FILE=/opt/examples/example ENV TMP_OUTPUT_DIR=/tmp ENV APP_BIN=/opt/simest -ENV APP_PARAMS="" +ENV APP_PARAMS1="" +ENV APP_PARAMS2="" ENV MPI_PARAMS='-np 1 --debug-daemons' ENV JOB_DIR=/root/exec/ ENV SCRATCH_DIR=/root/scratch diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index bf38359d..5d709301 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -83,7 +83,7 @@ wait_for_nodes () { mkdir output # --allow-run-as-root { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log + ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log sleep 2 #if [ "${NODE_TYPE}" = 'main' ]; then From 394b66783cd10be0ba772a6fb101ec1b506fccd5 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 2 Dec 2020 09:58:15 +0100 Subject: [PATCH 45/84] show comp result --- examples/architrave/mpi-run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 5d709301..02186405 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -85,6 +85,8 @@ wait_for_nodes () { { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log sleep 2 + echo 'Exec output:' + cat ${S3_BATCH_MNT}/output/time.log #if [ "${NODE_TYPE}" = 'main' ]; then # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* From 4084e4c6204012137dea19089bc04ac19990779c Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Dec 2020 10:52:20 +0100 Subject: [PATCH 46/84] add basic mpi example --- examples/mpi-example/Dockerfile | 35 +++ examples/mpi-example/LICENSE | 201 ++++++++++++++++++ examples/mpi-example/README.md | 82 +++++++ examples/mpi-example/mpi-run.sh | 150 +++++++++++++ examples/mpi-example/run.sh | 49 +++++ examples/mpi-example/run_batch.sh | 1 + .../mpi-example/scar-architrave-batch.yaml | 18 ++ .../mpi-example/scar-architrave-lambda.yaml | 15 ++ 8 files changed, 551 insertions(+) create mode 100644 examples/mpi-example/Dockerfile create mode 100644 examples/mpi-example/LICENSE create mode 100644 examples/mpi-example/README.md create mode 100644 examples/mpi-example/mpi-run.sh create mode 100644 examples/mpi-example/run.sh create mode 100644 examples/mpi-example/run_batch.sh create mode 100644 examples/mpi-example/scar-architrave-batch.yaml create mode 100644 examples/mpi-example/scar-architrave-lambda.yaml diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile new file mode 100644 
index 00000000..7d29df4d --- /dev/null +++ b/examples/mpi-example/Dockerfile @@ -0,0 +1,35 @@ +FROM debian:stretch-slim + +ARG ADD_BASE_DIR_ARCHITRAVE=examples/mpi-example +ARG ADD_PRIVATE_BASE_DIR +ARG BUILD_PACKAGES=' git make gcc g++ iproute2 cmake build-essential gfortran curl ' + +ENV DEBIAN_FRONTEND=noninteractive +## Set to either lambda or batch +ENV EXEC_TYPE=lambda + +ENV APP_PARAMS="" +ENV MPI_PARAMS='-np 1 --debug-daemons --allow-run-as-root' +ENV GIT_REPO=https://github.com/mpitutorial/mpitutorial +ENV GIT_REPO_REL_PATH_SRC=mpitutorial/tutorials/mpi-hello-world/code +ENV GIT_REPO_REL_PATH_EXEC=mpitutorial/tutorials/mpi-hello-world/code/mpi_hello_world +ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC + +ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh /opt/ + +RUN apt-get update \ + && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev \ + && cd /opt/ \ + && git clone $GIT_REPO \ + && cd /opt/$GIT_REPO_REL_PATH_SRC \ + && make \ + && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ + && apt-get autoremove --purge -y \ + && ulimit -n 1024 \ + && chmod 755 /opt/$GIT_REPO_REL_PATH_EXEC \ + && chmod 755 /opt/mpi-run.sh \ + && chmod 755 /opt/run.sh \ + && echo $(date) > /build_date \ + && echo "Build date: $(cat /build_date)" + +CMD /opt/run.sh diff --git a/examples/mpi-example/LICENSE b/examples/mpi-example/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/examples/mpi-example/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/examples/mpi-example/README.md b/examples/mpi-example/README.md new file mode 100644 index 00000000..69ddb231 --- /dev/null +++ b/examples/mpi-example/README.md @@ -0,0 +1,82 @@ +# architrave +Running a commercial app in a Docker container on Lambda and Batch + +## Building the containers + +Due to the differences between Amazon Batch and Lambda and our choice to have one Dockerfile for both, there are some things one has to take into account when running the containers with SCAR. + +### Lambda + +We included all necessary operations in the Dockerfile, therefore leaving the runtime execution populated only with the application execution itself. +The Docker image doesn't have to be public, we can build it locally. 
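+Once the build below finishes, a quick sanity check is to confirm the tag exists and to inspect the environment defaults baked into the image (a minimal sketch; it only assumes the `architrave` tag used in the build command below):
+
+```
+docker image inspect architrave:latest --format '{{json .Config.Env}}'
+```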
+ +``` +# The base dir is the root context for Docker +# We assume that you cloned this repo in the BASE_DIR +export BASE_DIR=/tmp +# The base dir with the private bits; It must be a child of BASE_DIR +export ADD_PRIVATE_BASE_DIR=architrave + +docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg ADD_PRIVATE_BASE_DIR="$ADD_PRIVATE_BASE_DIR" -f "$BASE_DIR/scar/examples/architrave/Dockerfile" --label architrave -t architrave "$BASE_DIR" +``` + +Take into account that the input files must be located in the __ADD_PRIVATE_BASE_DIR__ directory. +e.g. if you have something like `$BASE_DIR/$ADD_PRIVATE_BASE_DIR/examples/example_input.file`, then you the example input ends up on the following path: `/opt/examples/example_input.file` + +If you want to run the container locally before launching it on Amazon Lambda, you can use the following: + +``` +# This is the path inside the container where the binary can be found +# it is the relative path of ADD_PRIVATE_BASE_DIR without the root +# e.g. for architrave/path/path2/execute_me (where ADD_PRIVATE_BASE_DIR=architrave) is path/path2/execute_me +export APP_BIN=/opt/ +# The full list of params needed for the app, don't forget the (double) quotes when there are spaces +export APP_PARAMS= + +# Mount the results dir you specify in the APP_PARAMS env variable to +docker run -d -e EXEC_TYPE=lambda -e APP_BIN="$APP_BIN" -e APP_PARAMS="$APP_PARAMS" --name architrave_local -v /tmp/architrave-result:/ architrave:latest +``` + +#### Build context + +You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: + +``` +# Ignore everything +** + +# Allow files and directories +!/architrave/** +!/scar/examples/architrave/** + +# Ignore unnecessary files inside allowed directories +# This should go after the allowed directories +**/scar-architrave-batch.yaml +**/scar-architrave-lambda.yaml +**/README.md +**/LICENSE +``` + +### Batch + +#### Batch additional required packages on S3 + +Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). 
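+Pre-creating the mount directory on the host keeps it owned by your user instead of root (a minor convenience only; `/tmp/deps` simply matches the bind mount used in the command below):
+
+```
+mkdir -p /tmp/deps
+```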
+ +`docker run -it -v /tmp/deps:/tmp/deps debian:stretch-slim` + +In the running container: + +``` +# determine all of the dependencies needed by the packages we want to install: +apt update && apt install -y apt-rdepends && \ +apt-rdepends openssh-server openssh-client iproute2 inotify-tools | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ +apt-get --purge autoremove -y apt-rdepends && \ +# filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) +apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ +grep -F -v -f /tmp/deps_installed.lst /tmp/deps_tmp.lst > /tmp/deps.lst && \ +# download the list of packages, but don't install them +cd /tmp/deps && apt-get download $(cat /tmp/deps.lst) && \ +# Create the list of deps in a file; This file is used to download the required deps from an S3 bucket +ls -1 /tmp/deps > /tmp/deps/deps_batch.lst +``` diff --git a/examples/mpi-example/mpi-run.sh b/examples/mpi-example/mpi-run.sh new file mode 100644 index 00000000..a09ffa9e --- /dev/null +++ b/examples/mpi-example/mpi-run.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +cd $JOB_DIR + +#PATH="$PATH:/opt/openmpi/bin/" +BASENAME="${0##*/}" +log () { + echo "${BASENAME} - ${1}" +} +HOST_FILE_PATH="/tmp/hostfile" +AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" + +#aws s3 cp $S3_INPUT $SCRATCH_DIR +#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR + +sleep 2 + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat <&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +# Set child by default switch to main if on main node container +NODE_TYPE="child" +if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then + log "Running synchronize as the main node" + NODE_TYPE="main" +fi + + +# wait for all nodes to report +wait_for_nodes () { + log "Running as master node" + + touch $HOST_FILE_PATH + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "master details -> $ip:$availablecores" + echo "$ip slots=$availablecores" >> $HOST_FILE_PATH + + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] + do + log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 1 second" + sleep 1 + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + done + # Make the temporary file executable and run it with any given arguments + log "All nodes successfully joined" + + # remove duplicates if there are any. + awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped + cat $HOST_FILE_PATH-deduped + log "executing main MPIRUN workflow" + + cd $SCRATCH_DIR + mkdir output + # --allow-run-as-root + { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ + ${APP_BIN} ${APP_PARAMS} }; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log + sleep 2 + echo 'Exec output:' + cat ${S3_BATCH_MNT}/output/time.log + + #if [ "${NODE_TYPE}" = 'main' ]; then + # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* + # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz + #fi + + log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" + echo "0" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) + #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done + #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done + #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done + #log "Signaling children to exit" + #cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch /mnt/batch/mpi/master_done" + #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done + exit 0 +} + + +# Fetch and run a script +report_to_master () { + # get own ip and num cpus + # + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" + until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" + do + echo "Sleeping 5 seconds and trying again" + done + #touch ${S3_BATCH_MNT}/exec/docker_done + + #echo "Wait for master to finish" + #while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done + exit 0 +} + + +# Main - dispatch user request to appropriate function +log $NODE_TYPE +case $NODE_TYPE in + main) + wait_for_nodes "${@}" + ;; + + child) + report_to_master "${@}" + ;; + + *) + log $NODE_TYPE + usage "Could not determine node type. 
Expected (main/child)" + ;; +esac diff --git a/examples/mpi-example/run.sh b/examples/mpi-example/run.sh new file mode 100644 index 00000000..52a65c95 --- /dev/null +++ b/examples/mpi-example/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +echo "Build date: $(cat /build_date)" +echo "Runing as: ${USER} home @ ${HOME}" + +if [ "${EXEC_TYPE,,}" = 'lambda' ]; then + echo 'Run lambda' + export OMPI_MCA_plm_rsh_agent=/bin/false + { time mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log + +elif [ "${EXEC_TYPE,,}" = 'batch' ]; then + echo 'Run batch' + + apt update + apt install -y openssh-server openssh-client + echo "Configure ssh" + sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + echo "export VISIBLE=now" >> /etc/profile + echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + mkdir -p ${HOME}/.ssh + touch ${HOME}/.ssh/sshd_config + #ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' + cat /opt/ssh_host_rsa_key.pub > ${HOME}/.ssh/authorized_keys + cp /opt/ssh_host_rsa_key ${HOME}/.ssh/id_rsa + echo " IdentityFile ${HOME}/.ssh/id_rsa" >> /etc/ssh/ssh_config + echo "Host *" >> /etc/ssh/ssh_config + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config + chmod -R 600 ${HOME}/.ssh/* + chown -R ${USER}:${USER} ${HOME}/.ssh/ + # check if ssh agent is running or not, if not, run + eval `ssh-agent -s` + ssh-add ${HOME}/.ssh/id_rsa + + chmod +x ${APP_BIN} + service ssh status + service ssh restart + service ssh status + + export AWS_BATCH_JOB_NODE_INDEX=0 + export AWS_BATCH_JOB_NUM_NODES=1 + export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 + + echo "Running app" + /opt/mpi-run.sh + +else + echo "ERROR: unknown execution type '${EXEC_TYPE}'" + exit 1 # terminate and indicate error +fi diff --git a/examples/mpi-example/run_batch.sh b/examples/mpi-example/run_batch.sh new file mode 100644 index 00000000..7d22f90e --- /dev/null +++ b/examples/mpi-example/run_batch.sh @@ -0,0 +1 @@ +bash $INPUT_FILE_PATH diff --git a/examples/mpi-example/scar-architrave-batch.yaml b/examples/mpi-example/scar-architrave-batch.yaml new file mode 100644 index 00000000..22ecf8e0 --- /dev/null +++ b/examples/mpi-example/scar-architrave-batch.yaml @@ -0,0 +1,18 @@ +functions: + aws: + - lambda: + name: scar-mpi-example + log_level: DEBUG + init_script: run_batch.sh + execution_mode: batch + container: + image: asalic/scar-mpi-example + environment: + Variables: + EXEC_TYPE: batch + input: + - storage_provider: s3 + path: scar-architrave/input + output: + - storage_provider: s3 + path: scar-architrave/output diff --git a/examples/mpi-example/scar-architrave-lambda.yaml b/examples/mpi-example/scar-architrave-lambda.yaml new file mode 100644 index 00000000..ff4a098e --- /dev/null +++ b/examples/mpi-example/scar-architrave-lambda.yaml @@ -0,0 +1,15 @@ +functions: + aws: + - lambda: + name: scar-mpi-example + run_script: /tmp/scar_run_init.sh + container: + image_file: /tmp/scar-mpi-example-docker-img.tar.gz + environment: + Variables: + EXEC_TYPE: lambda + deployment: + bucket: scar-architrave + output: + - storage_provider: s3 + path: scar-architrave/output From 9601dbb4bdc0beb4d8b2e117ca38789cda321da9 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Dec 2020 11:32:57 +0100 Subject: [PATCH 47/84] tst --- examples/mpi-example/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile index 7d29df4d..aa440e4f 100644 --- a/examples/mpi-example/Dockerfile +++ 
b/examples/mpi-example/Dockerfile @@ -17,6 +17,7 @@ ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh /opt/ + RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev \ && cd /opt/ \ From c8b2b0de190ab73643c1c9e7798fb652baa7998c Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Dec 2020 11:49:14 +0100 Subject: [PATCH 48/84] tst --- examples/mpi-example/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile index aa440e4f..7d29df4d 100644 --- a/examples/mpi-example/Dockerfile +++ b/examples/mpi-example/Dockerfile @@ -17,7 +17,6 @@ ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh /opt/ - RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev \ && cd /opt/ \ From 9e5760a6f42820f513cae0c99395822b723038d4 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 7 Dec 2020 18:06:01 +0100 Subject: [PATCH 49/84] add waiting for exec rn fdl files --- ...{scar-architrave-batch.yaml => batch.yaml} | 0 ...car-architrave-lambda.yaml => lambda.yaml} | 0 examples/mpi-example/mpi-run.sh | 28 +++++++++++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) rename examples/mpi-example/{scar-architrave-batch.yaml => batch.yaml} (100%) rename examples/mpi-example/{scar-architrave-lambda.yaml => lambda.yaml} (100%) diff --git a/examples/mpi-example/scar-architrave-batch.yaml b/examples/mpi-example/batch.yaml similarity index 100% rename from examples/mpi-example/scar-architrave-batch.yaml rename to examples/mpi-example/batch.yaml diff --git a/examples/mpi-example/scar-architrave-lambda.yaml b/examples/mpi-example/lambda.yaml similarity index 100% rename from examples/mpi-example/scar-architrave-lambda.yaml rename to examples/mpi-example/lambda.yaml diff --git a/examples/mpi-example/mpi-run.sh b/examples/mpi-example/mpi-run.sh index a09ffa9e..27d0ee61 100644 --- a/examples/mpi-example/mpi-run.sh +++ b/examples/mpi-example/mpi-run.sh @@ -10,6 +10,11 @@ log () { HOST_FILE_PATH="/tmp/hostfile" AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" +BATCH_SIGNAL_DIR=/tmp/batch +if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi +mkdir -p ${BATCH_SIGNAL_DIR}master_done +mkdir -p ${BATCH_SIGNAL_DIR}/workers_done + #aws s3 cp $S3_INPUT $SCRATCH_DIR #tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR @@ -83,10 +88,10 @@ wait_for_nodes () { mkdir output # --allow-run-as-root { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_PARAMS} }; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log + ${APP_BIN} ${APP_PARAMS} }; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' - cat ${S3_BATCH_MNT}/output/time.log + cat ${TMP_OUTPUT_DIR}/time.log #if [ "${NODE_TYPE}" = 'main' ]; then # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* @@ -99,8 +104,18 @@ wait_for_nodes () { #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done - #log "Signaling children to exit" - #cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 
ssh {} "touch /mnt/batch/mpi/master_done" + log "Signaling children to exit" + cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch ${BATCH_SIGNAL_DIR}/master_done/done" + + log "Wait for children to finish their execution" + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$num_finished" ] + do + log "$num_finished out of $AWS_BATCH_JOB_NUM_NODES nodes are done, check again in 1 second" + sleep 1 + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + done + #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done exit 0 } @@ -126,8 +141,9 @@ report_to_master () { done #touch ${S3_BATCH_MNT}/exec/docker_done - #echo "Wait for master to finish" - #while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done + echo "Wait for master to finish" + while inotifywait ${BATCH_SIGNAL_DIR}/master_done -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done + ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "touch ${BATCH_SIGNAL_DIR}/workers_done/${ip}" exit 0 } From f3d1d6160978c4691e0fc28737cecdebd88e54c6 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 9 Dec 2020 10:34:03 +0100 Subject: [PATCH 50/84] switch output/input folders --- examples/mpi-example/batch.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mpi-example/batch.yaml b/examples/mpi-example/batch.yaml index 22ecf8e0..50248018 100644 --- a/examples/mpi-example/batch.yaml +++ b/examples/mpi-example/batch.yaml @@ -12,7 +12,7 @@ functions: EXEC_TYPE: batch input: - storage_provider: s3 - path: scar-architrave/input + path: scar-mpi-example/input output: - storage_provider: s3 - path: scar-architrave/output + path: scar-mpi-example/output From 57b795cc81ef660023bc07655b935bca51168b3b Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 9 Dec 2020 11:51:22 +0100 Subject: [PATCH 51/84] updated waiting algo save output to TMP folder monitored by the supervisor --- examples/architrave/mpi-run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh index 02186405..cc61a67e 100755 --- a/examples/architrave/mpi-run.sh +++ b/examples/architrave/mpi-run.sh @@ -83,10 +83,10 @@ wait_for_nodes () { mkdir output # --allow-run-as-root { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${S3_BATCH_MNT}/output/time.log + ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' - cat ${S3_BATCH_MNT}/output/time.log + cat ${TMP_OUTPUT_DIR}/time.log #if [ "${NODE_TYPE}" = 'main' ]; then # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* From 0ceebc30d3d07c6396d366b1a24dd303784c6805 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 9 Dec 2020 11:52:35 +0100 Subject: [PATCH 52/84] rm amazon specific vars --- examples/mpi-example/run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/mpi-example/run.sh b/examples/mpi-example/run.sh index 52a65c95..d2453ca3 100644 --- a/examples/mpi-example/run.sh +++ b/examples/mpi-example/run.sh @@ -36,9 +36,9 
@@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then service ssh restart service ssh status - export AWS_BATCH_JOB_NODE_INDEX=0 - export AWS_BATCH_JOB_NUM_NODES=1 - export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 +# export AWS_BATCH_JOB_NODE_INDEX=0 +# export AWS_BATCH_JOB_NUM_NODES=1 +# export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 echo "Running app" /opt/mpi-run.sh From 944dd3e4ec6eaee6865b4c5b964fefd137ff8620 Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 9 Dec 2020 11:53:16 +0100 Subject: [PATCH 53/84] add support for multi-node parallel jobs --- scar/parser/cfgfile.py | 5 ++ scar/providers/aws/batchfunction.py | 76 +++++++++++++++++++---------- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/scar/parser/cfgfile.py b/scar/parser/cfgfile.py index ce47d5a3..25fb2858 100644 --- a/scar/parser/cfgfile.py +++ b/scar/parser/cfgfile.py @@ -131,6 +131,11 @@ "log_retention_policy_in_days": 30 }, "batch": { + "multi_node_parallel": { + "enabled": False, + "number_nodes": 10, + "main_node_index": 0 + }, "boto_profile": "default", "region": "us-east-1", "vcpus": 1, diff --git a/scar/providers/aws/batchfunction.py b/scar/providers/aws/batchfunction.py index f9caaf1c..ef37721b 100644 --- a/scar/providers/aws/batchfunction.py +++ b/scar/providers/aws/batchfunction.py @@ -151,33 +151,41 @@ def _get_creations_job_queue_args(self): def _get_job_definition_args(self): job_def_args = { - 'jobDefinitionName': self.function_name, - 'type': 'container', - 'containerProperties': { - 'image': self.resources_info.get('lambda').get('container').get('image'), - 'memory': int(self.batch.get('memory')), - 'vcpus': int(self.batch.get('vcpus')), - 'command': [ - '/bin/sh', - '-c', - 'echo $EVENT | /opt/faas-supervisor/bin/supervisor' - ], - 'volumes': [ - { - 'host': { - 'sourcePath': '/opt/faas-supervisor/bin' - }, - 'name': 'supervisor-bin' - } - ], - 'environment': [{'name': key, 'value': value} for key, value in self.resources_info['batch']['environment']['Variables'].items()], - 'mountPoints': [ - { - 'containerPath': '/opt/faas-supervisor/bin', - 'sourceVolume': 'supervisor-bin' - } - ] - } + 'jobDefinitionName': self.function_name + } + if self.batch.get('multi_node_parallel').get('enabled'): + job_def_args['nodeProperties'] = self._get_node_properties_multi_node_args() + job_def_args['type'] = 'multinode' + else: + job_def_args['containerProperties'] = self._get_container_properties_single_node_args() + job_def_args['type'] = 'container' + return job_def_args + + def _get_container_properties_single_node_args(self): + job_def_args = { + 'image': self.resources_info.get('lambda').get('container').get('image'), + 'memory': int(self.batch.get('memory')), + 'vcpus': int(self.batch.get('vcpus')), + 'command': [ + '/bin/sh', + '-c', + 'echo $EVENT | /opt/faas-supervisor/bin/supervisor' + ], + 'volumes': [ + { + 'host': { + 'sourcePath': '/opt/faas-supervisor/bin' + }, + 'name': 'supervisor-bin' + } + ], + 'environment': [{'name': key, 'value': value} for key, value in self.resources_info['batch']['environment']['Variables'].items()], + 'mountPoints': [ + { + 'containerPath': '/opt/faas-supervisor/bin', + 'sourceVolume': 'supervisor-bin' + } + ] } if self.batch.get('enable_gpu'): job_def_args['containerProperties']['resourceRequirements'] = [ @@ -188,6 +196,20 @@ def _get_job_definition_args(self): ] return job_def_args + def _get_node_properties_multi_node_args(self): + targetNodes = self.batch.get('multi_node_parallel').get('number_nodes') - 1 + job_def_args = { + "numNodes": 
int(self.batch.get('multi_node_parallel').get('number_nodes')), + "mainNode": int(self.batch.get('multi_node_parallel').get('main_node_index')), + "nodeRangeProperties": [ + { + "targetNodes": "0:" + str(targetNodes), + "container": self._get_container_properties_single_node_args() + } + ]#[self._get_node_node_range_property_multi_node_args(target_nodes) for target_nodes in self.batch.get('multi_node_parallel').get('target_nodes')] + } + return job_def_args + def _get_state_and_status_of_compute_env(self): creation_args = self._get_describe_compute_env_args() response = self.client.describe_compute_environments(**creation_args) From 14f06f1c9e9612380cfd9748a33c72520cf24eac Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 9 Dec 2020 14:00:00 +0100 Subject: [PATCH 54/84] add specific funcion_config for batch --- scar/providers/aws/functioncode.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scar/providers/aws/functioncode.py b/scar/providers/aws/functioncode.py index 2979eca7..fd39679d 100644 --- a/scar/providers/aws/functioncode.py +++ b/scar/providers/aws/functioncode.py @@ -35,6 +35,12 @@ def create_function_config(resources_info): function_cfg = {'storage_providers': FileUtils.load_tmp_config_file().get('storage_providers', {})} function_cfg.update(resources_info.get('lambda')) clean_function_config(function_cfg) + print(resources_info) + # Add Batch specific info + if resources_info.get('lambda').get("execution_mode") == "batch": + function_cfg.update({"batch": { + "multi_node_parallel": resources_info.get('batch').get("multi_node_parallel") + }}) return function_cfg From 2a2c390c93b45be1f6771133acc338ae65c8b92c Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 12:00:04 +0100 Subject: [PATCH 55/84] rm target node group end index for parallel multi node batch jobs --- scar/providers/aws/batchfunction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scar/providers/aws/batchfunction.py b/scar/providers/aws/batchfunction.py index ef37721b..9cfadedd 100644 --- a/scar/providers/aws/batchfunction.py +++ b/scar/providers/aws/batchfunction.py @@ -203,7 +203,7 @@ def _get_node_properties_multi_node_args(self): "mainNode": int(self.batch.get('multi_node_parallel').get('main_node_index')), "nodeRangeProperties": [ { - "targetNodes": "0:" + str(targetNodes), + "targetNodes": "0:", #+ str(targetNodes), "container": self._get_container_properties_single_node_args() } ]#[self._get_node_node_range_property_multi_node_args(target_nodes) for target_nodes in self.batch.get('multi_node_parallel').get('target_nodes')] From fb62594e23c82ac125c76339f56ca3deef193664 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 12:01:22 +0100 Subject: [PATCH 56/84] rm additional execution script integrate full funcionality in the run script --- examples/mpi-example/mpi-run.sh | 166 -------------------------- examples/mpi-example/run.sh | 199 +++++++++++++++++++++++++++++++- 2 files changed, 195 insertions(+), 170 deletions(-) delete mode 100644 examples/mpi-example/mpi-run.sh diff --git a/examples/mpi-example/mpi-run.sh b/examples/mpi-example/mpi-run.sh deleted file mode 100644 index 27d0ee61..00000000 --- a/examples/mpi-example/mpi-run.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -cd $JOB_DIR - -#PATH="$PATH:/opt/openmpi/bin/" -BASENAME="${0##*/}" -log () { - echo "${BASENAME} - ${1}" -} -HOST_FILE_PATH="/tmp/hostfile" -AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" - -BATCH_SIGNAL_DIR=/tmp/batch -if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi 
-mkdir -p ${BATCH_SIGNAL_DIR}master_done -mkdir -p ${BATCH_SIGNAL_DIR}/workers_done - -#aws s3 cp $S3_INPUT $SCRATCH_DIR -#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR - -sleep 2 - -usage () { - if [ "${#@}" -ne 0 ]; then - log "* ${*}" - log - fi - cat <&2 - log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) -} - -# Set child by default switch to main if on main node container -NODE_TYPE="child" -if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then - log "Running synchronize as the main node" - NODE_TYPE="main" -fi - - -# wait for all nodes to report -wait_for_nodes () { - log "Running as master node" - - touch $HOST_FILE_PATH - ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) - - if [ -x "$(command -v nvidia-smi)" ] ; then - NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) - availablecores=$NUM_GPUS - else - availablecores=$(nproc) - fi - - log "master details -> $ip:$availablecores" - echo "$ip slots=$availablecores" >> $HOST_FILE_PATH - - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] - do - log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 1 second" - sleep 1 - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - done - # Make the temporary file executable and run it with any given arguments - log "All nodes successfully joined" - - # remove duplicates if there are any. - awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped - cat $HOST_FILE_PATH-deduped - log "executing main MPIRUN workflow" - - cd $SCRATCH_DIR - mkdir output - # --allow-run-as-root - { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_PARAMS} }; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log - sleep 2 - echo 'Exec output:' - cat ${TMP_OUTPUT_DIR}/time.log - - #if [ "${NODE_TYPE}" = 'main' ]; then - # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* - # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz - #fi - - log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" - echo "0" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) - #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done - #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done - #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done - log "Signaling children to exit" - cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch ${BATCH_SIGNAL_DIR}/master_done/done" - - log "Wait for children to finish their execution" - num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) - while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$num_finished" ] - do - log "$num_finished out of $AWS_BATCH_JOB_NUM_NODES nodes are done, check again in 1 second" - sleep 1 - num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) - done - - #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done - exit 0 -} - - -# Fetch and run a script -report_to_master () { - # get own ip and num cpus - # - ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) - - if [ -x "$(command -v nvidia-smi)" ] ; then - NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) - availablecores=$NUM_GPUS - else - availablecores=$(nproc) - fi - - log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" - until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" - do - echo "Sleeping 5 seconds and trying again" - done - #touch ${S3_BATCH_MNT}/exec/docker_done - - echo "Wait for master to finish" - while inotifywait ${BATCH_SIGNAL_DIR}/master_done -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done - ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "touch ${BATCH_SIGNAL_DIR}/workers_done/${ip}" - exit 0 -} - - -# Main - dispatch user request to appropriate function -log $NODE_TYPE -case $NODE_TYPE in - main) - wait_for_nodes "${@}" - ;; - - child) - report_to_master "${@}" - ;; - - *) - log $NODE_TYPE - usage "Could not determine node type. 
Expected (main/child)" - ;; -esac diff --git a/examples/mpi-example/run.sh b/examples/mpi-example/run.sh index d2453ca3..27025751 100644 --- a/examples/mpi-example/run.sh +++ b/examples/mpi-example/run.sh @@ -1,5 +1,138 @@ #!/bin/bash +# wait for all nodes to report +wait_for_nodes () { + log "Running as master node" + + touch $HOST_FILE_PATH + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "master details -> $ip:$availablecores" + echo "$ip slots=$availablecores" >> $HOST_FILE_PATH + + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + i=0 + numCyclesWait=30 + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] && [ "$i" -lt "$numCyclesWait" ] + do + log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 3 seconds" + sleep 3 + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + ((i=i+1)) + done + + if [ "$i" -eq "$numCyclesWait" ]; then + echo "children did not join" + exit 1 + fi + + # Make the temporary file executable and run it with any given arguments + log "All nodes successfully joined" + + # remove duplicates if there are any. + awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped + cat $HOST_FILE_PATH-deduped + log "executing main MPIRUN workflow" + + # --allow-run-as-root + { time mpirun --allow-run-as-root --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ + ${APP_BIN} ${APP_PARAMS} }; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log + sleep 2 + echo 'Exec output:' + cat ${TMP_OUTPUT_DIR}/time.log + + #if [ "${NODE_TYPE}" = 'main' ]; then + # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* + # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz + #fi + + #log "done! 
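+    # (each node contributes one "ip slots=N" line; worker reports arrive over ssh and
+    # may be retried, so awk '!a[$0]++' keeps only the first occurrence of every line,
+    # leaving a single entry per node in the machinefile handed to mpirun)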
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" + #echo "0" > $AWS_BATCH_EXIT_CODE_FILE + #kill $(cat /tmp/supervisord.pid) + #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done + #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done + #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done + log "Signaling children to exit" + cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch ${BATCH_SIGNAL_DIR}/master_done/done" + + log "Wait for children to finish their execution" + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$((num_finished+1))" ] + do + log "$num_finished out of $AWS_BATCH_JOB_NUM_NODES nodes are done, check again in 1 second" + sleep 1 + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + done + + #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done + exit 0 +} + +# Fetch and run a script +report_to_master () { + # get own ip and num cpus + # + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" +# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv +# sleep 15 +# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv + + until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" + do + echo "Sleeping 5 seconds and trying again" + sleep 5 + done + + echo "Wait for master to finish" + while inotifywait ${BATCH_SIGNAL_DIR}/master_done -e create; do { echo "Child ${ip} has finished its execution, done! 
goodbye"; break; }; done + ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "touch ${BATCH_SIGNAL_DIR}/workers_done/${ip}" + exit 0 +} + +# Standard function to print an error and exit with a failing return code +error_exit () { + log "${BASENAME} - ${1}" >&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat <> /etc/profile @@ -25,23 +172,67 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo " IdentityFile ${HOME}/.ssh/id_rsa" >> /etc/ssh/ssh_config echo "Host *" >> /etc/ssh/ssh_config echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config + echo "PermitRootLogin without-password" >> /etc/ssh/sshd_config + #sed -i -e 's/#PermitRootLogin yes/PermitRootLogin yes/' /etc/ssh/sshd_config + #cat /etc/ssh/sshd_config chmod -R 600 ${HOME}/.ssh/* chown -R ${USER}:${USER} ${HOME}/.ssh/ # check if ssh agent is running or not, if not, run eval `ssh-agent -s` - ssh-add ${HOME}/.ssh/id_rsa chmod +x ${APP_BIN} service ssh status service ssh restart service ssh status - + ssh-add ${HOME}/.ssh/id_rsa + service ssh restart # export AWS_BATCH_JOB_NODE_INDEX=0 # export AWS_BATCH_JOB_NUM_NODES=1 # export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 echo "Running app" - /opt/mpi-run.sh + + #/opt/mpi-run.sh + + +#PATH="$PATH:/opt/openmpi/bin/" +BASENAME="${0##*/}" +HOST_FILE_PATH="/tmp/hostfile" +AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" + +BATCH_SIGNAL_DIR=/tmp/batch +if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi +mkdir -p ${BATCH_SIGNAL_DIR}/master_done +mkdir -p ${BATCH_SIGNAL_DIR}/workers_done + +#aws s3 cp $S3_INPUT $SCRATCH_DIR +#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR + +sleep 2 + +# Set child by default switch to main if on main node container +NODE_TYPE="child" +if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then + log "Running synchronize as the main node" + NODE_TYPE="main" +fi + + +# Main - dispatch user request to appropriate function +log $NODE_TYPE +case $NODE_TYPE in + main) + wait_for_nodes "${@}" + ;; + + child) + report_to_master "${@}" + ;; + + *) log $NODE_TYPE + usage "Could not determine node type. 
Expected (main/child)" + ;; +esac else echo "ERROR: unknown execution type '${EXEC_TYPE}'" From a4090a8dff84544d8b6d95ea447d315ecde5b9bf Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 12:05:01 +0100 Subject: [PATCH 57/84] rm additional run script, integrate everything in the script called by the supervisor --- examples/architrave/mpi-run.sh | 150 ----------------------- examples/architrave/run.sh | 210 +++++++++++++++++++++++++++++---- 2 files changed, 189 insertions(+), 171 deletions(-) delete mode 100755 examples/architrave/mpi-run.sh diff --git a/examples/architrave/mpi-run.sh b/examples/architrave/mpi-run.sh deleted file mode 100755 index cc61a67e..00000000 --- a/examples/architrave/mpi-run.sh +++ /dev/null @@ -1,150 +0,0 @@ -#!/bin/bash - -cd $JOB_DIR - -#PATH="$PATH:/opt/openmpi/bin/" -BASENAME="${0##*/}" -log () { - echo "${BASENAME} - ${1}" -} -HOST_FILE_PATH="/tmp/hostfile" -AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" - -#aws s3 cp $S3_INPUT $SCRATCH_DIR -#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR - -sleep 2 - -usage () { - if [ "${#@}" -ne 0 ]; then - log "* ${*}" - log - fi - cat <&2 - log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) -} - -# Set child by default switch to main if on main node container -NODE_TYPE="child" -if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then - log "Running synchronize as the main node" - NODE_TYPE="main" -fi - - -# wait for all nodes to report -wait_for_nodes () { - log "Running as master node" - - touch $HOST_FILE_PATH - ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) - - if [ -x "$(command -v nvidia-smi)" ] ; then - NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) - availablecores=$NUM_GPUS - else - availablecores=$(nproc) - fi - - log "master details -> $ip:$availablecores" - echo "$ip slots=$availablecores" >> $HOST_FILE_PATH - - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] - do - log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 1 second" - sleep 1 - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - done - # Make the temporary file executable and run it with any given arguments - log "All nodes successfully joined" - - # remove duplicates if there are any. - awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped - cat $HOST_FILE_PATH-deduped - log "executing main MPIRUN workflow" - - cd $SCRATCH_DIR - mkdir output - # --allow-run-as-root - { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log - sleep 2 - echo 'Exec output:' - cat ${TMP_OUTPUT_DIR}/time.log - - #if [ "${NODE_TYPE}" = 'main' ]; then - # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* - # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz - #fi - - log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" - echo "0" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) - #echo "#!/bin/bash" > ${S3_BATCH_MNT}/exec/docker_done - #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > ${S3_BATCH_MNT}/exec/docker_done - #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> ${S3_BATCH_MNT}/exec/docker_done - #log "Signaling children to exit" - #cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch /mnt/batch/mpi/master_done" - #while inotifywait ${S3_BATCH_MNT}/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done - exit 0 -} - - -# Fetch and run a script -report_to_master () { - # get own ip and num cpus - # - ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) - - if [ -x "$(command -v nvidia-smi)" ] ; then - NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) - availablecores=$NUM_GPUS - else - availablecores=$(nproc) - fi - - log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" - until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" - do - echo "Sleeping 5 seconds and trying again" - done - #touch ${S3_BATCH_MNT}/exec/docker_done - - #echo "Wait for master to finish" - #while inotifywait ${S3_BATCH_MNT}/mpi -e create; do { echo "Master has finished its execution, done! goodbye"; break; }; done - exit 0 -} - - -# Main - dispatch user request to appropriate function -log $NODE_TYPE -case $NODE_TYPE in - main) - wait_for_nodes "${@}" - ;; - - child) - report_to_master "${@}" - ;; - - *) - log $NODE_TYPE - usage "Could not determine node type. Expected (main/child)" - ;; -esac diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 1ba5da47..d4cd8a00 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -1,5 +1,138 @@ #!/bin/bash +# wait for all nodes to report +wait_for_nodes () { + log "Running as master node" + + touch $HOST_FILE_PATH + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "master details -> $ip:$availablecores" + echo "$ip slots=$availablecores" >> $HOST_FILE_PATH + + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + i=0 + numCyclesWait=30 + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] && [ "$i" -lt "$numCyclesWait" ] + do + log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 3 seconds" + sleep 3 + lines=$(sort $HOST_FILE_PATH|uniq|wc -l) + ((i=i+1)) + done + + if [ "$i" -eq "$numCyclesWait" ]; then + echo "children did not join" + exit 1 + fi + + # Make the temporary file executable and run it with any given arguments + log "All nodes successfully joined" + + # remove duplicates if there are any. 
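+    # (the deduped file is used as the OpenMPI machinefile below: one "<ip> slots=<n>"
+    # entry per host, where slots caps, by default, how many MPI ranks mpirun places on that host)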
+ awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped + cat $HOST_FILE_PATH-deduped + log "executing main MPIRUN workflow" + + # --allow-run-as-root + { time mpirun --allow-run-as-root --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ + ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; }; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log + sleep 2 + echo 'Exec output:' + cat ${TMP_OUTPUT_DIR}/time.log + + #if [ "${NODE_TYPE}" = 'main' ]; then + # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* + # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz + #fi + + #log "done! goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" + #echo "0" > $AWS_BATCH_EXIT_CODE_FILE + #kill $(cat /tmp/supervisord.pid) + #echo "#!/bin/bash" > /tmp/exec/docker_done + #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > /tmp/exec/docker_done + #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> /tmp/exec/docker_done + log "Signaling children to exit" + cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch ${BATCH_SIGNAL_DIR}/master_done/done" + + log "Wait for children to finish their execution" + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$((num_finished+1))" ] + do + log "$num_finished out of $AWS_BATCH_JOB_NUM_NODES nodes are done, check again in 1 second" + sleep 1 + num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) + done + + #while inotifywait /tmp/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done + exit 0 +} + +# Fetch and run a script +report_to_master () { + # get own ip and num cpus + # + ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) + + if [ -x "$(command -v nvidia-smi)" ] ; then + NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) + availablecores=$NUM_GPUS + else + availablecores=$(nproc) + fi + + log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" +# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv +# sleep 15 +# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv + + until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" + do + echo "Sleeping 5 seconds and trying again" + sleep 5 + done + + echo "Wait for master to finish" + while inotifywait ${BATCH_SIGNAL_DIR}/master_done -e create; do { echo "Child ${ip} has finished its execution, done! 
goodbye"; break; }; done + ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "touch ${BATCH_SIGNAL_DIR}/workers_done/${ip}" + exit 0 +} + +# Standard function to print an error and exit with a failing return code +error_exit () { + log "${BASENAME} - ${1}" >&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat < Date: Mon, 14 Dec 2020 12:31:35 +0100 Subject: [PATCH 58/84] rm external mpi script --- examples/architrave/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index f431ccd9..cd21d061 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -27,7 +27,7 @@ ENV PRIVATE_PASSWD= ENV S3_BATCH_MNT=/mnt/batch ENV S3_OUTPUT="s3://scar-architrave/output" -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ +ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES p7zip-full wget unzip \ From e30da3cded01c88062abc42e7ebababe05af3f85 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 13:34:38 +0100 Subject: [PATCH 59/84] add locales support upd README name of ssh keys are now changeable --- examples/mpi-example/Dockerfile | 5 +- examples/mpi-example/README.md | 86 +++++---------------------------- examples/mpi-example/batch.yaml | 1 + examples/mpi-example/run.sh | 4 +- 4 files changed, 19 insertions(+), 77 deletions(-) diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile index 7d29df4d..afcd5146 100644 --- a/examples/mpi-example/Dockerfile +++ b/examples/mpi-example/Dockerfile @@ -14,11 +14,14 @@ ENV GIT_REPO=https://github.com/mpitutorial/mpitutorial ENV GIT_REPO_REL_PATH_SRC=mpitutorial/tutorials/mpi-hello-world/code ENV GIT_REPO_REL_PATH_EXEC=mpitutorial/tutorials/mpi-hello-world/code/mpi_hello_world ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC +ENV SSH_PRIV_FILE_KEY=ssh_host_rsa_key +ENV SSH_PUB_FILE_KEY=ssh_host_rsa_key.pub ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev \ + && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev locales \ + && locale-gen en_US.UTF-8 \ && cd /opt/ \ && git clone $GIT_REPO \ && cd /opt/$GIT_REPO_REL_PATH_SRC \ diff --git a/examples/mpi-example/README.md b/examples/mpi-example/README.md index 69ddb231..71f249c7 100644 --- a/examples/mpi-example/README.md +++ b/examples/mpi-example/README.md @@ -1,82 +1,20 @@ -# architrave -Running a commercial app in a Docker container on Lambda and Batch +# MPI with SCAR +Running a distributed MPI app in a Docker container on Lambda and Batch -## Building the containers +There are two different modes of running for this example: lambda through the __lambda.yaml__ and batch through __batch.yaml__. +This example uses the S3 bucket __scar-mpi-example__ for the input (the execution trigger) and the output of the job execution. +It is created automatically once a job is initialized, along with the input and output folders. -Due to the differences between Amazon Batch and Lambda and our choice to have one Dockerfile for both, there are some things one has to take into account when running the containers with SCAR. 
+The steps to run the example are: -### Lambda +* Init the function using scar -We included all necessary operations in the Dockerfile, therefore leaving the runtime execution populated only with the application execution itself. -The Docker image doesn't have to be public, we can build it locally. +`scar init -f .yaml` -``` -# The base dir is the root context for Docker -# We assume that you cloned this repo in the BASE_DIR -export BASE_DIR=/tmp -# The base dir with the private bits; It must be a child of BASE_DIR -export ADD_PRIVATE_BASE_DIR=architrave +* [Batch only] Generate private/public keys used by ssh to communicate between nodes. The default names can be changed using the env variables SSH_PRIV_FILE_KEY and SSH_PUB_FILE_KEY -docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg ADD_PRIVATE_BASE_DIR="$ADD_PRIVATE_BASE_DIR" -f "$BASE_DIR/scar/examples/architrave/Dockerfile" --label architrave -t architrave "$BASE_DIR" -``` +* [Batch only] Upload a tar.gz archive file with the public and private keys in the root of the archive to the root of the bucket __scar-mpi-example__ -Take into account that the input files must be located in the __ADD_PRIVATE_BASE_DIR__ directory. -e.g. if you have something like `$BASE_DIR/$ADD_PRIVATE_BASE_DIR/examples/example_input.file`, then you the example input ends up on the following path: `/opt/examples/example_input.file` +* Upload a file to the bucket __scar-mpi-example/input__ -If you want to run the container locally before launching it on Amazon Lambda, you can use the following: - -``` -# This is the path inside the container where the binary can be found -# it is the relative path of ADD_PRIVATE_BASE_DIR without the root -# e.g. for architrave/path/path2/execute_me (where ADD_PRIVATE_BASE_DIR=architrave) is path/path2/execute_me -export APP_BIN=/opt/ -# The full list of params needed for the app, don't forget the (double) quotes when there are spaces -export APP_PARAMS= - -# Mount the results dir you specify in the APP_PARAMS env variable to -docker run -d -e EXEC_TYPE=lambda -e APP_BIN="$APP_BIN" -e APP_PARAMS="$APP_PARAMS" --name architrave_local -v /tmp/architrave-result:/ architrave:latest -``` - -#### Build context - -You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: - -``` -# Ignore everything -** - -# Allow files and directories -!/architrave/** -!/scar/examples/architrave/** - -# Ignore unnecessary files inside allowed directories -# This should go after the allowed directories -**/scar-architrave-batch.yaml -**/scar-architrave-lambda.yaml -**/README.md -**/LICENSE -``` - -### Batch - -#### Batch additional required packages on S3 - -Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). 
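
> Editor's note: the bullet list above describes the whole lifecycle (init, trigger, collect results). A minimal sketch of that happy path is shown here, assuming SCAR and the AWS CLI are installed and configured; the trigger file name is a placeholder, since any object uploaded to the watched input folder starts a job.

```bash
#!/bin/bash
# Sketch of the workflow described in the steps above.
set -euo pipefail

# Deploy the function/job definition (batch.yaml or lambda.yaml).
scar init -f batch.yaml

# Trigger one execution by dropping a file into the watched input folder.
echo "start" > /tmp/trigger.txt
aws s3 cp /tmp/trigger.txt s3://scar-mpi-example/input/trigger.txt

# Once the job finishes, the results appear under the output folder.
aws s3 ls s3://scar-mpi-example/output/
aws s3 sync s3://scar-mpi-example/output/ /tmp/mpi-results/
```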
- -`docker run -it -v /tmp/deps:/tmp/deps debian:stretch-slim` - -In the running container: - -``` -# determine all of the dependencies needed by the packages we want to install: -apt update && apt install -y apt-rdepends && \ -apt-rdepends openssh-server openssh-client iproute2 inotify-tools | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ -apt-get --purge autoremove -y apt-rdepends && \ -# filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) -apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ -grep -F -v -f /tmp/deps_installed.lst /tmp/deps_tmp.lst > /tmp/deps.lst && \ -# download the list of packages, but don't install them -cd /tmp/deps && apt-get download $(cat /tmp/deps.lst) && \ -# Create the list of deps in a file; This file is used to download the required deps from an S3 bucket -ls -1 /tmp/deps > /tmp/deps/deps_batch.lst -``` +* The results are uploaded to __scar-mpi-example/output__ once the execution has been successfully finalized diff --git a/examples/mpi-example/batch.yaml b/examples/mpi-example/batch.yaml index 50248018..92e1cdfc 100644 --- a/examples/mpi-example/batch.yaml +++ b/examples/mpi-example/batch.yaml @@ -10,6 +10,7 @@ functions: environment: Variables: EXEC_TYPE: batch + PYTHONIOENCODING: utf-8 input: - storage_provider: s3 path: scar-mpi-example/input diff --git a/examples/mpi-example/run.sh b/examples/mpi-example/run.sh index 27025751..cb1f065c 100644 --- a/examples/mpi-example/run.sh +++ b/examples/mpi-example/run.sh @@ -167,8 +167,8 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then mkdir -p ${HOME}/.ssh touch ${HOME}/.ssh/sshd_config #ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' - cat /opt/ssh_host_rsa_key.pub > ${HOME}/.ssh/authorized_keys - cp /opt/ssh_host_rsa_key ${HOME}/.ssh/id_rsa + cat /opt/${SSH_PUB_FILE_KEY} > ${HOME}/.ssh/authorized_keys + cp /opt/${SSH_PRIV_FILE_KEY} ${HOME}/.ssh/id_rsa echo " IdentityFile ${HOME}/.ssh/id_rsa" >> /etc/ssh/ssh_config echo "Host *" >> /etc/ssh/ssh_config echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config From c88ecd3b4c06678b9191d684eb6be2affde2bf1e Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 13:39:15 +0100 Subject: [PATCH 60/84] rm references to mpi-run.sh --- examples/architrave/Dockerfile | 1 - examples/mpi-example/Dockerfile | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index cd21d061..be12013f 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -42,7 +42,6 @@ RUN apt-get update \ && dpkg --force-all -i /tmp/*.deb \ && ulimit -n 1024 \ && rm -rf /tmp/* /var/lib/apt/lists/* \ - && chmod 755 /opt/mpi-run.sh \ && chmod 755 /opt/run.sh \ && echo $(date) > /build_date \ && echo "Build date: $(cat /build_date)" diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile index afcd5146..bc528eab 100644 --- a/examples/mpi-example/Dockerfile +++ b/examples/mpi-example/Dockerfile @@ -17,7 +17,7 @@ ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC ENV SSH_PRIV_FILE_KEY=ssh_host_rsa_key ENV SSH_PUB_FILE_KEY=ssh_host_rsa_key.pub -ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/mpi-run.sh /opt/ +ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev locales \ @@ -30,7 +30,6 @@ RUN apt-get update \ && apt-get autoremove --purge -y \ && ulimit 
-n 1024 \ && chmod 755 /opt/$GIT_REPO_REL_PATH_EXEC \ - && chmod 755 /opt/mpi-run.sh \ && chmod 755 /opt/run.sh \ && echo $(date) > /build_date \ && echo "Build date: $(cat /build_date)" From f8101582f5a5c72d342ee0ac23a6bbad512254c5 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 14 Dec 2020 17:36:45 +0100 Subject: [PATCH 61/84] add more locale settings --- examples/mpi-example/Dockerfile | 6 ++++++ examples/mpi-example/batch.yaml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/mpi-example/Dockerfile b/examples/mpi-example/Dockerfile index bc528eab..290384ba 100644 --- a/examples/mpi-example/Dockerfile +++ b/examples/mpi-example/Dockerfile @@ -17,11 +17,16 @@ ENV APP_BIN=/opt/$GIT_REPO_REL_PATH_EXEC ENV SSH_PRIV_FILE_KEY=ssh_host_rsa_key ENV SSH_PUB_FILE_KEY=ssh_host_rsa_key.pub +ENV LANG en_US.UTF-8 + ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev locales \ && locale-gen en_US.UTF-8 \ + && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ + && dpkg-reconfigure --frontend=noninteractive locales \ + && update-locale LANG=en_US.UTF-8 \ && cd /opt/ \ && git clone $GIT_REPO \ && cd /opt/$GIT_REPO_REL_PATH_SRC \ @@ -34,4 +39,5 @@ RUN apt-get update \ && echo $(date) > /build_date \ && echo "Build date: $(cat /build_date)" + CMD /opt/run.sh diff --git a/examples/mpi-example/batch.yaml b/examples/mpi-example/batch.yaml index 92e1cdfc..75fcd54b 100644 --- a/examples/mpi-example/batch.yaml +++ b/examples/mpi-example/batch.yaml @@ -10,7 +10,7 @@ functions: environment: Variables: EXEC_TYPE: batch - PYTHONIOENCODING: utf-8 + PYTHONIOENCODING: utf8 input: - storage_provider: s3 path: scar-mpi-example/input From c833ddd15bd61e4d8eff3f4bed60f3abfe06ce3c Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Dec 2020 11:00:19 +0100 Subject: [PATCH 62/84] renamed mpi basic example --- examples/{mpi-example => mpi}/Dockerfile | 0 examples/{mpi-example => mpi}/LICENSE | 0 examples/{mpi-example => mpi}/README.md | 0 examples/{mpi-example => mpi}/batch.yaml | 0 examples/{mpi-example => mpi}/lambda.yaml | 0 examples/{mpi-example => mpi}/run.sh | 9 +++++++-- examples/{mpi-example => mpi}/run_batch.sh | 0 7 files changed, 7 insertions(+), 2 deletions(-) rename examples/{mpi-example => mpi}/Dockerfile (100%) rename examples/{mpi-example => mpi}/LICENSE (100%) rename examples/{mpi-example => mpi}/README.md (100%) rename examples/{mpi-example => mpi}/batch.yaml (100%) rename examples/{mpi-example => mpi}/lambda.yaml (100%) rename examples/{mpi-example => mpi}/run.sh (96%) rename examples/{mpi-example => mpi}/run_batch.sh (100%) diff --git a/examples/mpi-example/Dockerfile b/examples/mpi/Dockerfile similarity index 100% rename from examples/mpi-example/Dockerfile rename to examples/mpi/Dockerfile diff --git a/examples/mpi-example/LICENSE b/examples/mpi/LICENSE similarity index 100% rename from examples/mpi-example/LICENSE rename to examples/mpi/LICENSE diff --git a/examples/mpi-example/README.md b/examples/mpi/README.md similarity index 100% rename from examples/mpi-example/README.md rename to examples/mpi/README.md diff --git a/examples/mpi-example/batch.yaml b/examples/mpi/batch.yaml similarity index 100% rename from examples/mpi-example/batch.yaml rename to examples/mpi/batch.yaml diff --git a/examples/mpi-example/lambda.yaml b/examples/mpi/lambda.yaml similarity index 100% rename from examples/mpi-example/lambda.yaml rename to examples/mpi/lambda.yaml diff --git 
a/examples/mpi-example/run.sh b/examples/mpi/run.sh similarity index 96% rename from examples/mpi-example/run.sh rename to examples/mpi/run.sh index cb1f065c..3ce16806 100644 --- a/examples/mpi-example/run.sh +++ b/examples/mpi/run.sh @@ -146,8 +146,13 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then apt update apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client - wget -nc https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - unzip awscli-exe-linux-x86_64.zip + + #locale-gen en_US.UTF-8 + #sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen + #dpkg-reconfigure --frontend=noninteractive locales + #update-locale LANG=en_US.UTF-8 + wget -nc -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + unzip -qq awscli-exe-linux-x86_64.zip chmod +x aws/install ./aws/install /usr/local/bin/aws configure set default.s3.max_concurrent_requests 30 diff --git a/examples/mpi-example/run_batch.sh b/examples/mpi/run_batch.sh similarity index 100% rename from examples/mpi-example/run_batch.sh rename to examples/mpi/run_batch.sh From 3b69ae17992e1abeeb848cdff5199b8db5efc2bd Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Dec 2020 11:01:25 +0100 Subject: [PATCH 63/84] upd lambda architrave --- examples/architrave/Dockerfile | 15 +++++---------- examples/architrave/README.md | 17 ++++++++++++++++- .../{scar-architrave-batch.yaml => batch.yaml} | 0 ...{scar-architrave-lambda.yaml => lambda.yaml} | 5 +++-- examples/architrave/run.sh | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) rename examples/architrave/{scar-architrave-batch.yaml => batch.yaml} (100%) rename examples/architrave/{scar-architrave-lambda.yaml => lambda.yaml} (71%) diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile index be12013f..f742363d 100755 --- a/examples/architrave/Dockerfile +++ b/examples/architrave/Dockerfile @@ -2,7 +2,7 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave ARG ADD_PRIVATE_BASE_DIR -ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl ' +ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl locales ' ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch @@ -13,24 +13,19 @@ ENV TMP_OUTPUT_DIR=/tmp ENV APP_BIN=/opt/simest ENV APP_PARAMS1="" ENV APP_PARAMS2="" -ENV MPI_PARAMS='-np 1 --debug-daemons' -ENV JOB_DIR=/root/exec/ -ENV SCRATCH_DIR=/root/scratch -#ENV AWS_ACCESS_KEY='' -#ENV AWS_SECRET_ACCESS_KEY='' -#ENV AWS_REGION='us-east-1' -#ENV AWS_OUTPUT='json' #ENV S3_BUCKET="s3://scar-architrave" #ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz #ENV S3_BATCH_PRIVATE_REL_PATH= ENV PRIVATE_PASSWD= -ENV S3_BATCH_MNT=/mnt/batch -ENV S3_OUTPUT="s3://scar-architrave/output" ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES p7zip-full wget unzip \ + && locale-gen en_US.UTF-8 \ + && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ + && dpkg-reconfigure --frontend=noninteractive locales \ + && update-locale LANG=en_US.UTF-8 \ && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ && cd /tmp/openmpi-1.4.3/ \ && ./configure --disable-pty-support --disable-doc \ diff --git a/examples/architrave/README.md b/examples/architrave/README.md index 69ddb231..98be1dd5 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -57,6 
+57,21 @@ You can ignore everything but the private files and those from ##scar/examples/a **/LICENSE ``` +#### Execution + +Due to the fact that we included the private files in our image, we have to launch it from a private location. +For the sake of this example, we use the environment where we built the image in the previously steps. +First, we dump the docker image and compress i t with gzip. + +`sudo docker save asalic/scar-architrave | gzip > /tmp/architrave-docker-img.gz` + +The image we just dumped should have less than 256MB. +With scar installed, we can now create the lambda function using the example yaml file included with this example as a base: + +`scar init -f lambda.yaml` + + + ### Batch #### Batch additional required packages on S3 @@ -70,7 +85,7 @@ In the running container: ``` # determine all of the dependencies needed by the packages we want to install: apt update && apt install -y apt-rdepends && \ -apt-rdepends openssh-server openssh-client iproute2 inotify-tools | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ +apt-rdepends openssh-server openssh-client iproute2 inotify-tools locales | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ apt-get --purge autoremove -y apt-rdepends && \ # filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ diff --git a/examples/architrave/scar-architrave-batch.yaml b/examples/architrave/batch.yaml similarity index 100% rename from examples/architrave/scar-architrave-batch.yaml rename to examples/architrave/batch.yaml diff --git a/examples/architrave/scar-architrave-lambda.yaml b/examples/architrave/lambda.yaml similarity index 71% rename from examples/architrave/scar-architrave-lambda.yaml rename to examples/architrave/lambda.yaml index fae9e1d0..2f7e1318 100755 --- a/examples/architrave/scar-architrave-lambda.yaml +++ b/examples/architrave/lambda.yaml @@ -2,9 +2,10 @@ functions: aws: - lambda: name: scar-architrave - run_script: /tmp/scar_run_init.sh + init_script: /tmp/run.sh + run_script: /tmp/run.sh container: - image_file: /tmp/architrave-docker-img.tar.gz + image_file: /tmp/architrave-docker.img environment: Variables: EXEC_TYPE: lambda diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index d4cd8a00..3058f353 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -138,7 +138,7 @@ echo "Runing as: ${USER} home @ ${HOME}" if [ "${EXEC_TYPE,,}" = 'lambda' ]; then export OMPI_MCA_plm_rsh_agent=/bin/false - { time mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log + { time mpirun -np 1 --debug-daemons ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log elif [ "${EXEC_TYPE,,}" = 'batch' ]; then From 3430bb6808a0871a81c6ae781517eb6cbfa3599f Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 16 Dec 2020 11:01:51 +0100 Subject: [PATCH 64/84] check if init_script key present --- scar/providers/aws/functioncode.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scar/providers/aws/functioncode.py b/scar/providers/aws/functioncode.py index fd39679d..3541d454 100644 --- a/scar/providers/aws/functioncode.py +++ b/scar/providers/aws/functioncode.py @@ -25,7 +25,7 @@ def clean_function_config(function_cfg: Dict): # Rm full path from the init_script - if function_cfg.get('init_script', True): + if 
'init_script' in function_cfg and function_cfg.get('init_script', True): function_cfg['init_script'] = ntpath.basename(function_cfg['init_script']) # Rm the config path function_cfg.pop('config_path', None) @@ -35,7 +35,6 @@ def create_function_config(resources_info): function_cfg = {'storage_providers': FileUtils.load_tmp_config_file().get('storage_providers', {})} function_cfg.update(resources_info.get('lambda')) clean_function_config(function_cfg) - print(resources_info) # Add Batch specific info if resources_info.get('lambda').get("execution_mode") == "batch": function_cfg.update({"batch": { From 91f34a7c6f51d6c02ee50fea0ac206aa7cfc06dd Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 17 Dec 2020 12:39:59 +0100 Subject: [PATCH 65/84] rename helper --- examples/architrave/run_helper.sh | 5 +++++ examples/architrave/scar_run_init.sh | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100755 examples/architrave/run_helper.sh delete mode 100755 examples/architrave/scar_run_init.sh diff --git a/examples/architrave/run_helper.sh b/examples/architrave/run_helper.sh new file mode 100755 index 00000000..b4a5ceac --- /dev/null +++ b/examples/architrave/run_helper.sh @@ -0,0 +1,5 @@ +export APP_PARAMS1='' +export APP_PARAMS2='' +export APP_BIN='' +export APP_IN_FILE='' +bash /opt/run.sh diff --git a/examples/architrave/scar_run_init.sh b/examples/architrave/scar_run_init.sh deleted file mode 100755 index 19a470fb..00000000 --- a/examples/architrave/scar_run_init.sh +++ /dev/null @@ -1,2 +0,0 @@ -export APP_PARAMS="" -/opt/run_batch.sh From 8b8dccedfa1a6031948c9d0007f6c51db9acbc15 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 17 Dec 2020 12:40:37 +0100 Subject: [PATCH 66/84] fix functions order, missing end curly brace --- examples/architrave/run.sh | 67 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 3058f353..649f7d45 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -1,4 +1,36 @@ #!/bin/bash +echo "Executing as AWS ${EXEC_TYPE}" +echo "Build date: $(cat /build_date)" +echo "Runing as: ${USER} home @ ${HOME}" +echo "Running with interpreter: $(readlink -f $(which sh))" + +log () { + echo "${BASENAME} - ${1}" +} + +# Standard function to print an error and exit with a failing return code +error_exit () { + log "${BASENAME} - ${1}" >&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat <&1 | cat > ${TMP_OUTPUT_DIR}/time.log + ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' cat ${TMP_OUTPUT_DIR}/time.log @@ -105,40 +137,9 @@ report_to_master () { exit 0 } -# Standard function to print an error and exit with a failing return code -error_exit () { - log "${BASENAME} - ${1}" >&2 - log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) -} - -usage () { - if [ "${#@}" -ne 0 ]; then - log "* ${*}" - log - fi - cat <&1 | cat > $TMP_OUTPUT_DIR/time.log + { time mpirun -np 1 --debug-daemons ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log elif [ "${EXEC_TYPE,,}" = 'batch' ]; then From 38b368b5923506a6a6bd80471c7436a259e0c674 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 17 Dec 2020 12:41:30 +0100 Subject: [PATCH 67/84] upd names scripts examples conf launchers 
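
> Editor's note: the run_helper.sh wrapper introduced above is meant to be customized before use. The sketch below shows a hypothetical filled-in copy; every exported value is a placeholder, and only the variable names and the final call to /opt/run.sh come from the example itself (run.sh passes them to the binary as `${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}`).

```bash
#!/bin/bash
# Hypothetical filled-in run_helper.sh; values below are placeholders.
export APP_BIN='/opt/simest'                            # binary path inside the image
export APP_IN_FILE='/opt/examples/example_input.file'   # placeholder input file
export APP_PARAMS1=''                                   # parameters placed before the output dir
export APP_PARAMS2=''                                   # parameters placed after the output dir
bash /opt/run.sh
```

The launch configuration files point at a wrapper like this through their init_script / run_script entries.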
--- examples/architrave/batch.yaml | 2 +- examples/architrave/lambda.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/architrave/batch.yaml b/examples/architrave/batch.yaml index 0daaf9be..1e53ee6e 100755 --- a/examples/architrave/batch.yaml +++ b/examples/architrave/batch.yaml @@ -3,7 +3,7 @@ functions: - lambda: name: scar-architrave log_level: DEBUG - init_script: run_batch.sh + init_script: run_helper.sh execution_mode: batch container: image: asalic/scar-architrave diff --git a/examples/architrave/lambda.yaml b/examples/architrave/lambda.yaml index 2f7e1318..44281fa6 100755 --- a/examples/architrave/lambda.yaml +++ b/examples/architrave/lambda.yaml @@ -2,8 +2,8 @@ functions: aws: - lambda: name: scar-architrave - init_script: /tmp/run.sh - run_script: /tmp/run.sh + init_script: /tmp/run_lambda.sh + run_script: /tmp/run_lambda.sh container: image_file: /tmp/architrave-docker.img environment: From ceb7d46185df98baf465ed967b29497ec15981e5 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 17 Dec 2020 17:34:37 +0100 Subject: [PATCH 68/84] fix support for batch --- examples/architrave/batch.yaml | 7 ++++++- examples/architrave/run.sh | 2 +- examples/architrave/run_helper.sh | 6 ++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/architrave/batch.yaml b/examples/architrave/batch.yaml index 1e53ee6e..d4fbf9ec 100755 --- a/examples/architrave/batch.yaml +++ b/examples/architrave/batch.yaml @@ -3,7 +3,7 @@ functions: - lambda: name: scar-architrave log_level: DEBUG - init_script: run_helper.sh + init_script: /tmp/run_batch.sh execution_mode: batch container: image: asalic/scar-architrave @@ -16,3 +16,8 @@ functions: output: - storage_provider: s3 path: scar-architrave/output + batch: + multi_node_parallel: + enabled: false + number_nodes: 2 + main_node_index: 0 diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 649f7d45..d0a802bb 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -163,7 +163,7 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then /usr/local/bin/aws configure set default.s3.addressing_style path - # mkdir -p /tmp/deps + mkdir -p /tmp/deps # mkdir -p /tmp/exec # rm -rf /tmp/exec/* # mkdir -p /tmp/output diff --git a/examples/architrave/run_helper.sh b/examples/architrave/run_helper.sh index b4a5ceac..64e37369 100755 --- a/examples/architrave/run_helper.sh +++ b/examples/architrave/run_helper.sh @@ -1,3 +1,9 @@ +# Uncomment PRIVATE_PASSWD, AWS_BATCH_JOB_NUM_NODES,AWS_BATCH_JOB_NODE_INDEX, and AWS_BATCH_JOB_MAIN_NODE_INDEX when running batch on single node +# export AWS_BATCH_JOB_NUM_NODES=1 +# export AWS_BATCH_JOB_NODE_INDEX=0 +# export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 + +export PRIVATE_PASSWD='' export APP_PARAMS1='' export APP_PARAMS2='' export APP_BIN='' From 4074ea787997484beb95f390d8f347625241c080 Mon Sep 17 00:00:00 2001 From: asalic Date: Thu, 17 Dec 2020 18:13:28 +0100 Subject: [PATCH 69/84] silent transfer files fix old mpi lib flag run as root --- examples/architrave/run.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index d0a802bb..88eebae4 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -73,8 +73,9 @@ wait_for_nodes () { cat $HOST_FILE_PATH-deduped log "executing main MPIRUN workflow" + chmod +x ${APP_BIN} # --allow-run-as-root - { time mpirun --allow-run-as-root --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile 
${HOST_FILE_PATH}-deduped \ + { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' @@ -150,8 +151,8 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" cd /tmp - wget -nc https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - unzip awscli-exe-linux-x86_64.zip + wget -nc -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip + unzip -qq awscli-exe-linux-x86_64.zip chmod +x aws/install ./aws/install From 1ee092c86f045c5137e046298d4b9c960b5db402 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Dec 2020 12:15:58 +0100 Subject: [PATCH 70/84] upd mpi example to run with lambda and batch --- examples/mpi/batch.yaml | 15 ++++++--- examples/mpi/lambda.yaml | 10 +++--- examples/mpi/run.sh | 65 +++++++++++++++++++------------------- examples/mpi/run_helper.sh | 11 +++++++ 4 files changed, 59 insertions(+), 42 deletions(-) create mode 100644 examples/mpi/run_helper.sh diff --git a/examples/mpi/batch.yaml b/examples/mpi/batch.yaml index 75fcd54b..23ab5f54 100644 --- a/examples/mpi/batch.yaml +++ b/examples/mpi/batch.yaml @@ -1,19 +1,24 @@ functions: aws: - lambda: - name: scar-mpi-example + name: scar-mpi log_level: DEBUG - init_script: run_batch.sh + init_script: /tmp/run_batch.sh execution_mode: batch container: - image: asalic/scar-mpi-example + image: asalic/scar-mpi environment: Variables: EXEC_TYPE: batch PYTHONIOENCODING: utf8 input: - storage_provider: s3 - path: scar-mpi-example/input + path: scar-mpi/input output: - storage_provider: s3 - path: scar-mpi-example/output + path: scar-mpi/output + batch: + multi_node_parallel: + enabled: true + number_nodes: 3 + main_node_index: 0 diff --git a/examples/mpi/lambda.yaml b/examples/mpi/lambda.yaml index ff4a098e..958633f8 100644 --- a/examples/mpi/lambda.yaml +++ b/examples/mpi/lambda.yaml @@ -1,15 +1,15 @@ functions: aws: - lambda: - name: scar-mpi-example - run_script: /tmp/scar_run_init.sh + name: scar-mpi + run_script: /tmp/run_helper.sh container: - image_file: /tmp/scar-mpi-example-docker-img.tar.gz + image_file: /tmp/scar-mpi.img environment: Variables: EXEC_TYPE: lambda deployment: - bucket: scar-architrave + bucket: scar-mpi output: - storage_provider: s3 - path: scar-architrave/output + path: scar-mpi/output diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index 3ce16806..1cf7e3ea 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -1,4 +1,36 @@ #!/bin/bash +echo "Executing as AWS ${EXEC_TYPE}" +echo "Build date: $(cat /build_date)" +echo "Runing as: ${USER} home @ ${HOME}" +echo "Running with interpreter: $(readlink -f $(which sh))" + +log () { + echo "${BASENAME} - ${1}" +} + +# Standard function to print an error and exit with a failing return code +error_exit () { + log "${BASENAME} - ${1}" >&2 + log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE + kill $(cat /tmp/supervisord.pid) +} + +usage () { + if [ "${#@}" -ne 0 ]; then + log "* ${*}" + log + fi + cat <&1 | cat > ${TMP_OUTPUT_DIR}/time.log + ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' cat ${TMP_OUTPUT_DIR}/time.log @@ -105,37 +137,6 @@ report_to_master () { exit 0 } -# Standard function to print an error and exit with a failing return code -error_exit () { - log "${BASENAME} - ${1}" 
>&2 - log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) -} - -usage () { - if [ "${#@}" -ne 0 ]; then - log "* ${*}" - log - fi - cat < Date: Fri, 18 Dec 2020 12:16:38 +0100 Subject: [PATCH 71/84] fix names and upd README with intructions --- examples/architrave/README.md | 39 +++++++++++++++++++++++++++++-- examples/architrave/batch.yaml | 4 ++-- examples/architrave/run_helper.sh | 2 +- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/examples/architrave/README.md b/examples/architrave/README.md index 98be1dd5..42eab463 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -63,18 +63,26 @@ Due to the fact that we included the private files in our image, we have to laun For the sake of this example, we use the environment where we built the image in the previously steps. First, we dump the docker image and compress i t with gzip. -`sudo docker save asalic/scar-architrave | gzip > /tmp/architrave-docker-img.gz` +`sudo docker save scar-architrave > /tmp/architrave-docker.img` The image we just dumped should have less than 256MB. +Before launch, check the RAM and timeout set in the SCAR configuration file, this example requires at least 1.2GB RAM and 15 seconds. +The architrave's folder includes an example launch configuration yaml file used to init and run the application on lambda, __lambda.yaml__. +Please modify the env variables needed by the application, uncomment the export of the **INPUT_FILE_PATH** env variable, and set the correct path of the __run_helper.sh__ launch script in the launch configuration file. +This intermediate script launches __run.sh__ (that is found in the Docker image) that actually executes the application on Amazon Lambda or Batch. With scar installed, we can now create the lambda function using the example yaml file included with this example as a base: `scar init -f lambda.yaml` +To execute the function simply run: +`scar run -f lambda.yaml` + +Depending on the output Amazon S3 bucket/folder you have selected in the __lambda.yaml__ launch configuration, you should find the output files of the application and a __time.log__ file containing the execution log of the application. ### Batch -#### Batch additional required packages on S3 +#### Batch additional packages required on S3 Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). @@ -94,4 +102,31 @@ grep -F -v -f /tmp/deps_installed.lst /tmp/deps_tmp.lst > /tmp/deps.lst && \ cd /tmp/deps && apt-get download $(cat /tmp/deps.lst) && \ # Create the list of deps in a file; This file is used to download the required deps from an S3 bucket ls -1 /tmp/deps > /tmp/deps/deps_batch.lst + ``` + +Since __/tmp/deps__ is shared between the host and the container, the downloaded debs can now be added to __deps.tar.gz__ archive and uploaded to an Amazon S3 bucket (defaults to **scar-architrave/batch**). +The same S3 bucket/folder should contain a 7z archive called __private.7z__ that contains the architrave executable, the example(s), and the private/public ssh keys used for communication between the nodes. +If the 7z is protected by a password, set the password via the env variable **PRIVATE_PASSWD** in the __run_helper.sh__ script. +The ssh keys should be named __ssh_host_rsa_key.pub__ for the public key and __ssh_host_rsa_key__ for the private key. + + +#### Batch execution + +Amazon batch execution is based on events. 
+In the example launch configuration file __batch.yaml__, the **input** section specifies an Amazon S3 bucket/folder that is monitorized for changes. +Please modify the env variables needed by the application and set the correct path of the __run_helper.sh__ launch script in the launch configuration file. + +There are two modes to execute on batch: single node or parallel multinode. +For the former case, be sure that **functions.aws.batch.multi_node_parallel.enabled** to false, and uncomment the export of **AWS_BATCH_JOB_NUM_NODES**, **AWS_BATCH_JOB_NODE_INDEX**, and **AWS_BATCH_JOB_MAIN_NODE_INDEX** in the __run_helper.sh__. +For the latter execution mode, enable the variable in __batch.yaml__ and leave the three env variables commented out. + +Once everything is set, use SCAR to init the deployment: + +`scar init -f batch.yaml` + +Nest, start the execution by uploading the customized __run_helper.sh__ script to S3 (using the default S3 bucket ): + +`aws s3 cp run_helper.sh s3://scar-architrave/input` + +This script gets executed by __run_batch.sh__. diff --git a/examples/architrave/batch.yaml b/examples/architrave/batch.yaml index d4fbf9ec..98b9ae7d 100755 --- a/examples/architrave/batch.yaml +++ b/examples/architrave/batch.yaml @@ -18,6 +18,6 @@ functions: path: scar-architrave/output batch: multi_node_parallel: - enabled: false - number_nodes: 2 + enabled: true + number_nodes: 3 main_node_index: 0 diff --git a/examples/architrave/run_helper.sh b/examples/architrave/run_helper.sh index 64e37369..b2442dce 100755 --- a/examples/architrave/run_helper.sh +++ b/examples/architrave/run_helper.sh @@ -1,4 +1,4 @@ -# Uncomment PRIVATE_PASSWD, AWS_BATCH_JOB_NUM_NODES,AWS_BATCH_JOB_NODE_INDEX, and AWS_BATCH_JOB_MAIN_NODE_INDEX when running batch on single node +# Uncomment AWS_BATCH_JOB_NUM_NODES,AWS_BATCH_JOB_NODE_INDEX, and AWS_BATCH_JOB_MAIN_NODE_INDEX when running batch on single node # export AWS_BATCH_JOB_NUM_NODES=1 # export AWS_BATCH_JOB_NODE_INDEX=0 # export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 From cf1ee0a33c207493db85a7856f1bb9c4fe3ca524 Mon Sep 17 00:00:00 2001 From: asalic Date: Fri, 18 Dec 2020 12:40:26 +0100 Subject: [PATCH 72/84] fix path base dir --- examples/mpi/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index 290384ba..4ca6dd17 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=examples/mpi-example +ARG ADD_BASE_DIR_ARCHITRAVE=examples/mpi ARG ADD_PRIVATE_BASE_DIR ARG BUILD_PACKAGES=' git make gcc g++ iproute2 cmake build-essential gfortran curl ' From a375e47ce0384f8fcb9f101487da811d6da0ca12 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 10:33:07 +0100 Subject: [PATCH 73/84] upd readme revert to older mpi to work in lambda --- examples/mpi/Dockerfile | 19 ++++++++----- examples/mpi/README.md | 55 ++++++++++++++++++++++++++++++++++---- examples/mpi/batch.yaml | 2 +- examples/mpi/run.sh | 8 +++--- examples/mpi/run_helper.sh | 5 +--- 5 files changed, 69 insertions(+), 20 deletions(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index 4ca6dd17..c245f2e1 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -1,15 +1,13 @@ FROM debian:stretch-slim -ARG ADD_BASE_DIR_ARCHITRAVE=examples/mpi -ARG ADD_PRIVATE_BASE_DIR -ARG BUILD_PACKAGES=' git make gcc g++ iproute2 cmake build-essential gfortran curl ' +ARG ADD_BASE_DIR=examples/mpi +ARG BUILD_PACKAGES=' wget locales git make 
gcc g++ iproute2 cmake build-essential gfortran curl ' ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch ENV EXEC_TYPE=lambda ENV APP_PARAMS="" -ENV MPI_PARAMS='-np 1 --debug-daemons --allow-run-as-root' ENV GIT_REPO=https://github.com/mpitutorial/mpitutorial ENV GIT_REPO_REL_PATH_SRC=mpitutorial/tutorials/mpi-hello-world/code ENV GIT_REPO_REL_PATH_EXEC=mpitutorial/tutorials/mpi-hello-world/code/mpi_hello_world @@ -19,20 +17,29 @@ ENV SSH_PUB_FILE_KEY=ssh_host_rsa_key.pub ENV LANG en_US.UTF-8 -ADD ${ADD_BASE_DIR_ARCHITRAVE}/run.sh /opt/ +ADD ${ADD_BASE_DIR}/run.sh ${ADD_BASE_DIR}/debs.lst /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES openmpi-bin libopenmpi-dev locales \ + && apt-get install -y $BUILD_PACKAGES \ && locale-gen en_US.UTF-8 \ && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ && dpkg-reconfigure --frontend=noninteractive locales \ && update-locale LANG=en_US.UTF-8 \ + && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ + && cd /tmp/openmpi-1.4.3/ \ + && ./configure --disable-pty-support --disable-doc \ + && make -j`nproc` \ + && make install \ + && ldconfig \ + #&& wget -q -P /tmp -i /opt/debs.lst \ && cd /opt/ \ && git clone $GIT_REPO \ && cd /opt/$GIT_REPO_REL_PATH_SRC \ && make \ && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ && apt-get autoremove --purge -y \ + #&& dpkg --force-all -i /tmp/*.deb \ + && rm -rf /tmp/* /var/lib/apt/lists/* \ && ulimit -n 1024 \ && chmod 755 /opt/$GIT_REPO_REL_PATH_EXEC \ && chmod 755 /opt/run.sh \ diff --git a/examples/mpi/README.md b/examples/mpi/README.md index 71f249c7..f9a08b0a 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -1,20 +1,65 @@ # MPI with SCAR + + Running a distributed MPI app in a Docker container on Lambda and Batch There are two different modes of running for this example: lambda through the __lambda.yaml__ and batch through __batch.yaml__. -This example uses the S3 bucket __scar-mpi-example__ for the input (the execution trigger) and the output of the job execution. +This example uses the S3 bucket __scar-mpi__ for the input (the execution trigger) and the output of the job execution. It is created automatically once a job is initialized, along with the input and output folders. The steps to run the example are: +* Clone the repository (__/tmp__ for our example) + +`git clone https://github.com/grycap/scar /tmp` + +* Create a docker ignore if you plan to run Amazon Lambda or Batch with an uploaded image to DockerHub. The contents of the ignore are listed in this README. + +* [Lambda/Batch when upload to DockerHub] Build the the Docker image locally + +`docker build --build-arg ADD_BASE_DIR=scar/examples/mpi --label scar-mpi -t scar-mpi -f /tmp/scar/examples/mpi/Dockerfile /tmp` + +* [Lambda/Batch when upload local image to DockerHub] Dump the Docker image and upload it to DockerHub in case of Amazon Batch with container built locally. + +`sudo docker save scar-mpi > /tmp/scar-mpi.img` + +* Prepare __run_helper.yaml__. Follow the instructions inside the file. + +* [Batch] Set the Docker repo/image in __batch.yaml__ + * Init the function using scar `scar init -f .yaml` -* [Batch only] Generate private/public keys used by ssh to communicate between nodes. The default names can be changed using the env variables SSH_PRIV_FILE_KEY and SSH_PUB_FILE_KEY +* [Batch] Generate private/public keys used by ssh to communicate between nodes. 
The default names can be changed using the env variables SSH_PRIV_FILE_KEY and SSH_PUB_FILE_KEY + +* [Batch] Upload a tar.gz archive file with the public and private keys in the root of the archive to the root of the bucket __scar-mpi__ + +* [Batch] Upload the modified __run_helper.yaml__ file to the bucket __scar-mpi/input__ + +* [Lambda] Run the function using SCAR + +`scar run -f lambda.yaml` + +* The results are uploaded automatically to __scar-mpi/output__ once the execution has been successfully finalized (__time.log__ for our example with the application log). The log file is also displayed in the console, so SCAR should show the result. + +## Git ignore examples + +Use these lines to create a __.dockerignore__ file in __/tmp__. +This is needed to avoid including unnecessary files during the image building -* [Batch only] Upload a tar.gz archive file with the public and private keys in the root of the archive to the root of the bucket __scar-mpi-example__ +``` +# Ignore everything +** -* Upload a file to the bucket __scar-mpi-example/input__ +# Allow files and directories +!/scar/examples/mpi/** -* The results are uploaded to __scar-mpi-example/output__ once the execution has been successfully finalized +# Ignore unnecessary files inside allowed directories +# This should go after the allowed directories +**/batch.yaml +**/lambda.yaml +**/run_helper.sh +**/README.md +**/LICENSE +``` diff --git a/examples/mpi/batch.yaml b/examples/mpi/batch.yaml index 23ab5f54..0b124e95 100644 --- a/examples/mpi/batch.yaml +++ b/examples/mpi/batch.yaml @@ -6,7 +6,7 @@ functions: init_script: /tmp/run_batch.sh execution_mode: batch container: - image: asalic/scar-mpi + image: environment: Variables: EXEC_TYPE: batch diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index 1cf7e3ea..9b4142e5 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -3,6 +3,7 @@ echo "Executing as AWS ${EXEC_TYPE}" echo "Build date: $(cat /build_date)" echo "Runing as: ${USER} home @ ${HOME}" echo "Running with interpreter: $(readlink -f $(which sh))" +echo "Running MPI binary: ${APP_BIN}" log () { echo "${BASENAME} - ${1}" @@ -73,8 +74,7 @@ wait_for_nodes () { cat $HOST_FILE_PATH-deduped log "executing main MPIRUN workflow" - # --allow-run-as-root - { time mpirun --allow-run-as-root --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ + { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log sleep 2 echo 'Exec output:' @@ -140,8 +140,8 @@ report_to_master () { if [ "${EXEC_TYPE,,}" = 'lambda' ]; then echo 'Run lambda' export OMPI_MCA_plm_rsh_agent=/bin/false - { time mpirun ${MPI_PARAMS} ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log - + { time mpirun -np 1 --debug-daemons ${APP_BIN} ${APP_PARAMS}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log + cat $TMP_OUTPUT_DIR/time.log elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo 'Run batch' diff --git a/examples/mpi/run_helper.sh b/examples/mpi/run_helper.sh index b2442dce..953c2e16 100644 --- a/examples/mpi/run_helper.sh +++ b/examples/mpi/run_helper.sh @@ -3,9 +3,6 @@ # export AWS_BATCH_JOB_NODE_INDEX=0 # export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 -export PRIVATE_PASSWD='' -export APP_PARAMS1='' -export APP_PARAMS2='' +export APP_PARAMS='' export APP_BIN='' -export APP_IN_FILE='' bash /opt/run.sh From d081b925c99f5b00d99d97c3cb1792560b9bc7f7 Mon Sep 17 
00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 10:38:13 +0100 Subject: [PATCH 74/84] rm old deps --- examples/mpi/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index c245f2e1..3667b568 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -17,7 +17,7 @@ ENV SSH_PUB_FILE_KEY=ssh_host_rsa_key.pub ENV LANG en_US.UTF-8 -ADD ${ADD_BASE_DIR}/run.sh ${ADD_BASE_DIR}/debs.lst /opt/ +ADD ${ADD_BASE_DIR}/run.sh /opt/ RUN apt-get update \ && apt-get install -y $BUILD_PACKAGES \ @@ -31,14 +31,12 @@ RUN apt-get update \ && make -j`nproc` \ && make install \ && ldconfig \ - #&& wget -q -P /tmp -i /opt/debs.lst \ && cd /opt/ \ && git clone $GIT_REPO \ && cd /opt/$GIT_REPO_REL_PATH_SRC \ && make \ && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ && apt-get autoremove --purge -y \ - #&& dpkg --force-all -i /tmp/*.deb \ && rm -rf /tmp/* /var/lib/apt/lists/* \ && ulimit -n 1024 \ && chmod 755 /opt/$GIT_REPO_REL_PATH_EXEC \ From 81447e063ef97695e7a1aa4ac5bfbc8a05a04e75 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 11:39:08 +0100 Subject: [PATCH 75/84] upd README enable locale edit --- examples/mpi/README.md | 10 +++++++--- examples/mpi/run.sh | 10 +++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/examples/mpi/README.md b/examples/mpi/README.md index f9a08b0a..5ae94d0e 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -15,7 +15,7 @@ The steps to run the example are: * Create a docker ignore if you plan to run Amazon Lambda or Batch with an uploaded image to DockerHub. The contents of the ignore are listed in this README. -* [Lambda/Batch when upload to DockerHub] Build the the Docker image locally +* [Lambda/Batch when upload to DockerHub] Build the the Docker image locally. **ADD_BASE_DIR** should be set as the relative path of the Github repo where the whole examples is found. Docker uses this to access additional files that are needed to be added inside the image (like __run.sh__). `docker build --build-arg ADD_BASE_DIR=scar/examples/mpi --label scar-mpi -t scar-mpi -f /tmp/scar/examples/mpi/Dockerfile /tmp` @@ -25,7 +25,9 @@ The steps to run the example are: * Prepare __run_helper.yaml__. Follow the instructions inside the file. -* [Batch] Set the Docker repo/image in __batch.yaml__ +* [Batch] Adjust the location of __run_batch.sh__ in __batch.yaml__ + +* [Batch] Since AWS batch has to get the image from DockerHub, set the Docker repo/image in __batch.yaml__. You can create and automated build based on a Github repo by setting the **Build Context** as __/__ and the **Dockerfile location** as __examples/mpi/Dockerfile__ when asked by DockerHub. * Init the function using scar @@ -35,7 +37,9 @@ The steps to run the example are: * [Batch] Upload a tar.gz archive file with the public and private keys in the root of the archive to the root of the bucket __scar-mpi__ -* [Batch] Upload the modified __run_helper.yaml__ file to the bucket __scar-mpi/input__ +* [Batch multinode parallel] Be sure that you have access to the internet. You could create a private network for the nodes and a NAT EC2 instance both in the same security group. 
+ +* [Batch] Upload the modified __run_helper.yaml__ file to the bucket __scar-mpi/input__ to start the execution * [Lambda] Run the function using SCAR diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index 9b4142e5..348ca692 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -146,12 +146,12 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo 'Run batch' apt update - apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client + apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client locales - #locale-gen en_US.UTF-8 - #sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen - #dpkg-reconfigure --frontend=noninteractive locales - #update-locale LANG=en_US.UTF-8 + locale-gen en_US.UTF-8 + sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen + dpkg-reconfigure --frontend=noninteractive locales + update-locale LANG=en_US.UTF-8 wget -nc -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip unzip -qq awscli-exe-linux-x86_64.zip chmod +x aws/install From deefdcef4de619aa623d928fa48a7536f0dbda7b Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 13:02:09 +0100 Subject: [PATCH 76/84] upd README --- examples/mpi/Dockerfile | 2 +- examples/mpi/README.md | 2 +- examples/mpi/run.sh | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index 3667b568..22f650a3 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -20,7 +20,7 @@ ENV LANG en_US.UTF-8 ADD ${ADD_BASE_DIR}/run.sh /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES \ + && apt-get install -y $BUILD_PACKAGES perl \ && locale-gen en_US.UTF-8 \ && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ && dpkg-reconfigure --frontend=noninteractive locales \ diff --git a/examples/mpi/README.md b/examples/mpi/README.md index 5ae94d0e..c8c451cd 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -37,7 +37,7 @@ The steps to run the example are: * [Batch] Upload a tar.gz archive file with the public and private keys in the root of the archive to the root of the bucket __scar-mpi__ -* [Batch multinode parallel] Be sure that you have access to the internet. You could create a private network for the nodes and a NAT EC2 instance both in the same security group. +* [Batch] Be sure that you have access to the internet. You could create a private network for the node(s) and a NAT EC2 instance, both in the same security group. 
* [Batch] Upload the modified __run_helper.yaml__ file to the bucket __scar-mpi/input__ to start the execution diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index 348ca692..ebbd303b 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -146,8 +146,12 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo 'Run batch' apt update - apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client locales + apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client locales perl + export LANGUAGE=en_US.UTF-8 + export LANG=en_US.UTF-8 + export LC_ALL=en_US.UTF-8 + export LC_CTYPE=en_US.UTF-8 locale-gen en_US.UTF-8 sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen dpkg-reconfigure --frontend=noninteractive locales From 3f187d727fbcaeeac1cb7301c896491a39cf1940 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 13:02:58 +0100 Subject: [PATCH 77/84] don't remove locales --- examples/mpi/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index 22f650a3..b6c02d28 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -1,7 +1,7 @@ FROM debian:stretch-slim ARG ADD_BASE_DIR=examples/mpi -ARG BUILD_PACKAGES=' wget locales git make gcc g++ iproute2 cmake build-essential gfortran curl ' +ARG BUILD_PACKAGES=' wget git make gcc g++ iproute2 cmake build-essential gfortran curl ' ENV DEBIAN_FRONTEND=noninteractive ## Set to either lambda or batch @@ -20,7 +20,7 @@ ENV LANG en_US.UTF-8 ADD ${ADD_BASE_DIR}/run.sh /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES perl \ + && apt-get install -y $BUILD_PACKAGES perl locales \ && locale-gen en_US.UTF-8 \ && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ && dpkg-reconfigure --frontend=noninteractive locales \ From 67f1da01dff4bdfd01728cfbc8d6b1530980c91c Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 17:15:53 +0100 Subject: [PATCH 78/84] disable locale set during exec --- examples/mpi/Dockerfile | 2 +- examples/mpi/run.sh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/mpi/Dockerfile b/examples/mpi/Dockerfile index b6c02d28..4691667d 100644 --- a/examples/mpi/Dockerfile +++ b/examples/mpi/Dockerfile @@ -20,7 +20,7 @@ ENV LANG en_US.UTF-8 ADD ${ADD_BASE_DIR}/run.sh /opt/ RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES perl locales \ + && apt-get install -q -o=Dpkg::Use-Pty=0 -y $BUILD_PACKAGES perl locales \ && locale-gen en_US.UTF-8 \ && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ && dpkg-reconfigure --frontend=noninteractive locales \ diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index ebbd303b..d06b6dc3 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -146,16 +146,16 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then echo 'Run batch' apt update - apt install -y inotify-tools iproute2 wget unzip openssh-server openssh-client locales perl + apt install -q -o=Dpkg::Use-Pty=0 -y inotify-tools iproute2 wget unzip openssh-server openssh-client export LANGUAGE=en_US.UTF-8 export LANG=en_US.UTF-8 export LC_ALL=en_US.UTF-8 export LC_CTYPE=en_US.UTF-8 - locale-gen en_US.UTF-8 - sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen - dpkg-reconfigure --frontend=noninteractive locales - update-locale LANG=en_US.UTF-8 + #locale-gen en_US.UTF-8 + #sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen + #dpkg-reconfigure 
--frontend=noninteractive locales + #update-locale LANG=en_US.UTF-8 wget -nc -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip unzip -qq awscli-exe-linux-x86_64.zip chmod +x aws/install From d8a39919b842454cbdbadc4af7eb14445ac5308f Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 17:25:47 +0100 Subject: [PATCH 79/84] add section describing Batch multinode parallel jobs --- docs/source/batch.rst | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/source/batch.rst b/docs/source/batch.rst index c7aa7a00..bd12ac95 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -4,14 +4,14 @@ AWS Batch Integration ======================= AWS Batch allows to efficiently execute batch computing jobs on AWS by dynamically provisioning the required underlying EC2 instances on which Docker-based jobs are executed. -SCAR allows to transparently integrate the execution of the jobs through `AWS Batch `_. +SCAR allows to transparently integrate the execution of the jobs through `AWS Batch `_. Three execution modes are now available in SCAR: * `lambda`: This is the default execution mode. All executions will be run on AWS Lambda. * `lambda-batch`: Executions will be run on AWS Lambda. If the default timeout is reached, then the execution is automatically delegated to AWS Batch. * `batch`: Executions will be automatically diverted to AWS Batch. -This way, you can use AWS Lambda as a highly-scalable cache for burts of short computational jobs while longer executions can be automatically delegated to AWS Batch. +This way, you can use AWS Lambda as a highly-scalable cache for burts of short computational jobs while longer executions can be automatically delegated to AWS Batch. The very same `programming model `_ is maintained regardless of the service employed to perform the computation. Set up your configuration file @@ -47,12 +47,12 @@ The variables responsible for batch configuration are:: }, "service_role": "arn:aws:iam::{account_id}:role/service-role/AWSBatchServiceRole" } - -Since AWS Batch deploys Amazon EC2 instances, the REQUIRED variables are: + +Since AWS Batch deploys Amazon EC2 instances, the REQUIRED variables are: * `security_group_ids`: The EC2 security group that is associated with the instances launched in the compute environment. This allows to define the inbound and outbound network rules in order to allow or disallow TCP/UDP traffic generated from (or received by) the EC2 instance. You can choose the default VPC security group. * `subnets`: The VPC subnet(s) identifier(s) on which the EC2 instances will be deployed. This allows to use multiple Availability Zones for enhanced fault-tolerance. -The remaining variables have default values that should be enough to manage standard batch jobs. +The remaining variables have default values that should be enough to manage standard batch jobs. The default `fdl file `_ explains briefly the remaining Batch variables and how are they used. Additional info about the variables and the different values that can be assigned can be found in the `AWS API Documentation `_. @@ -60,7 +60,7 @@ Additional info about the variables and the different values that can be assigne Set up your Batch IAM role -------------------------- -The default IAM role used in the creation of the EC2 for the Batch Compute Environment is **arn:aws:iam::$ACCOUNT_ID:instance-profile/**ecsInstanceRole****. 
Thus, if you want to provide S3 access to your Batch jobs you have to specify the corresponding policies in the aforementioned role. 
+The default IAM role used in the creation of the EC2 for the Batch Compute Environment is **arn:aws:iam::$ACCOUNT_ID:instance-profile/**ecsInstanceRole****. Thus, if you want to provide S3 access to your Batch jobs you have to specify the corresponding policies in the aforementioned role.
 
 If you have a role already configured, you can set it in the configuration file by changing the variable `batch.compute_resources.instance_role`.
 
@@ -105,10 +105,10 @@ And trigger the execution of the function by uploading a file to be processed to
 
 SCAR automatically creates the compute environment in AWS Batch and submits a job to be executed. Input and output data files are transparently managed as well according to the programming model.
 
-The CloudWatch logs will reveal the execution of the Lambda function as well as the execution of the AWS Batch job. 
-Notice that whenever the execution of the AWS Batch job has finished, the EC2 instances will be eventually terminated. 
+The CloudWatch logs will reveal the execution of the Lambda function as well as the execution of the AWS Batch job.
+Notice that whenever the execution of the AWS Batch job has finished, the EC2 instances will be eventually terminated.
 Also, the number of EC2 instances will increase and shrink to handle the incoming number of jobs.
- 
+
 Combine AWS Lambda and AWS Batch executions
 -------------------------------------------
 As explained in the section :doc:`/prog_model`, if you define an output bucket as the input bucket of another function, a workflow can be created.
@@ -127,3 +127,15 @@ To create the AWS Batch job, the Lambda function defines a Job with the payload
 The payload limit can be avoided by redefining the script used and passing the large payload files using another service (e.g. S3 or some bash command like 'wget' or 'curl' to download the information at execution time).
 As we did with the plant classification example, where a `bootstrap script `_ was used to download the `executed script `_.
 Also, AWS Batch does not allow overriding the container entrypoint, so containers with an entrypoint defined cannot execute a user script.
+
+Multinode parallel jobs
+-----------------------
+You can execute multinode parallel jobs in AWS Batch by enabling this mode either in the scar.cfg file or in the configuration file of the job (functions->aws->batch->multi_node_parallel->enabled).
+You can also set the number of nodes and the index of the main node.
+Please take into account that the index of the main node ranges from 0 to the number of nodes minus 1.
+
+We included two examples of MPI jobs that can be executed as multinode parallel jobs.
+The first one, architrave, allows the execution of a commercial MPI application on AWS.
+The second example, mpi, is just a hello world from each CPU/node available for execution.
+Both also work on Amazon Lambda and on single-node Batch; you can use the included configuration files as a starting point.
+For more details, please check the README.md that comes with each example.
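
> Editor's note: a minimal multinode-parallel job description, assembled from the keys used by the bundled batch.yaml examples, might look like the sketch below. The Docker image name is a placeholder you must replace, the bucket paths follow the mpi example, and the indentation mirrors the example files.

```bash
#!/bin/bash
# Sketch only: write a minimal multinode-parallel Batch configuration
# and deploy it with SCAR.
set -euo pipefail

cat > /tmp/mpi-batch.yaml <<'EOF'
functions:
  aws:
  - lambda:
      name: scar-mpi
      init_script: /tmp/run_batch.sh
      execution_mode: batch
      container:
        image: your-dockerhub-user/scar-mpi   # placeholder
      environment:
        Variables:
          EXEC_TYPE: batch
      input:
      - storage_provider: s3
        path: scar-mpi/input
      output:
      - storage_provider: s3
        path: scar-mpi/output
      batch:
        multi_node_parallel:
          enabled: true
          number_nodes: 3
          main_node_index: 0
EOF

scar init -f /tmp/mpi-batch.yaml
```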
From 009234885408bf3ff2e040d9d84fd0cda3d388e3 Mon Sep 17 00:00:00 2001 From: asalic Date: Mon, 21 Dec 2020 17:26:59 +0100 Subject: [PATCH 80/84] upd README add support for locales --- examples/architrave/README.md | 6 ++++-- examples/architrave/batch.yaml | 2 +- examples/architrave/run.sh | 20 -------------------- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/examples/architrave/README.md b/examples/architrave/README.md index 42eab463..b0eaa62a 100755 --- a/examples/architrave/README.md +++ b/examples/architrave/README.md @@ -51,8 +51,9 @@ You can ignore everything but the private files and those from ##scar/examples/a # Ignore unnecessary files inside allowed directories # This should go after the allowed directories -**/scar-architrave-batch.yaml -**/scar-architrave-lambda.yaml +**/batch.yaml +**/lambda.yaml +**/run_helper.sh **/README.md **/LICENSE ``` @@ -116,6 +117,7 @@ The ssh keys should be named __ssh_host_rsa_key.pub__ for the public key and __s Amazon batch execution is based on events. In the example launch configuration file __batch.yaml__, the **input** section specifies an Amazon S3 bucket/folder that is monitorized for changes. Please modify the env variables needed by the application and set the correct path of the __run_helper.sh__ launch script in the launch configuration file. +Set the Docker repo/image in __batch.yaml__. There are two modes to execute on batch: single node or parallel multinode. For the former case, be sure that **functions.aws.batch.multi_node_parallel.enabled** to false, and uncomment the export of **AWS_BATCH_JOB_NUM_NODES**, **AWS_BATCH_JOB_NODE_INDEX**, and **AWS_BATCH_JOB_MAIN_NODE_INDEX** in the __run_helper.sh__. diff --git a/examples/architrave/batch.yaml b/examples/architrave/batch.yaml index 98b9ae7d..c001c4e2 100755 --- a/examples/architrave/batch.yaml +++ b/examples/architrave/batch.yaml @@ -6,7 +6,7 @@ functions: init_script: /tmp/run_batch.sh execution_mode: batch container: - image: asalic/scar-architrave + image: environment: Variables: EXEC_TYPE: batch diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh index 88eebae4..8d788074 100644 --- a/examples/architrave/run.sh +++ b/examples/architrave/run.sh @@ -165,23 +165,9 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then mkdir -p /tmp/deps - # mkdir -p /tmp/exec - # rm -rf /tmp/exec/* - # mkdir -p /tmp/output - # rm -rf /tmp/output/* - # mkdir -p /tmp/mpi - # rm -rf /tmp/mpi/* /usr/local/bin/aws s3 cp s3://scar-architrave/batch/private.7z /tmp /usr/local/bin/aws s3 cp s3://scar-architrave/batch/deps.tar.gz /tmp tar -zxf /tmp/deps.tar.gz -C /tmp/deps - - #rm -rf /mnt/batch/exec/* - #rm -rf /mnt/batch/output/* - #rm -rf /mnt/batch/mpi/* - - #mkdir ${SCRATCH_DIR} - #mkdir ${JOB_DIR} - #mkdir /tmp/output dpkg -i /tmp/deps/*.deb echo "Add private data from S3" @@ -209,8 +195,6 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then service ssh status service ssh restart service ssh status - - #PATH="$PATH:/opt/openmpi/bin/" BASENAME="${0##*/}" HOST_FILE_PATH="/tmp/hostfile" AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" @@ -219,10 +203,6 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi mkdir -p ${BATCH_SIGNAL_DIR}/master_done mkdir -p ${BATCH_SIGNAL_DIR}/workers_done - - #aws s3 cp $S3_INPUT $SCRATCH_DIR - #tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR - sleep 2 # Set child by default switch to main if on main node container From baa1ea003c484d9dd2b8e1ab39f4b3bde1c6f0a7 Mon Sep 17 00:00:00 2001 From: asalic 
Date: Mon, 21 Dec 2020 17:27:31 +0100 Subject: [PATCH 81/84] rm commented instructions --- examples/mpi/run.sh | 66 +++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/examples/mpi/run.sh b/examples/mpi/run.sh index d06b6dc3..cbfdc36b 100644 --- a/examples/mpi/run.sh +++ b/examples/mpi/run.sh @@ -196,53 +196,41 @@ elif [ "${EXEC_TYPE,,}" = 'batch' ]; then service ssh status ssh-add ${HOME}/.ssh/id_rsa service ssh restart -# export AWS_BATCH_JOB_NODE_INDEX=0 -# export AWS_BATCH_JOB_NUM_NODES=1 -# export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 - echo "Running app" + BASENAME="${0##*/}" + HOST_FILE_PATH="/tmp/hostfile" + AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" - #/opt/mpi-run.sh - - -#PATH="$PATH:/opt/openmpi/bin/" -BASENAME="${0##*/}" -HOST_FILE_PATH="/tmp/hostfile" -AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" - -BATCH_SIGNAL_DIR=/tmp/batch -if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi -mkdir -p ${BATCH_SIGNAL_DIR}/master_done -mkdir -p ${BATCH_SIGNAL_DIR}/workers_done + BATCH_SIGNAL_DIR=/tmp/batch + if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi + mkdir -p ${BATCH_SIGNAL_DIR}/master_done + mkdir -p ${BATCH_SIGNAL_DIR}/workers_done -#aws s3 cp $S3_INPUT $SCRATCH_DIR -#tar -xvf $SCRATCH_DIR/*.tar.gz -C $SCRATCH_DIR - -sleep 2 + sleep 2 -# Set child by default switch to main if on main node container -NODE_TYPE="child" -if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then - log "Running synchronize as the main node" - NODE_TYPE="main" -fi + # Set child by default switch to main if on main node container + NODE_TYPE="child" + if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then + log "Running synchronize as the main node" + NODE_TYPE="main" + fi -# Main - dispatch user request to appropriate function -log $NODE_TYPE -case $NODE_TYPE in - main) - wait_for_nodes "${@}" - ;; + # Main - dispatch user request to appropriate function + log $NODE_TYPE + case $NODE_TYPE in + main) + wait_for_nodes "${@}" + ;; - child) - report_to_master "${@}" - ;; + child) + report_to_master "${@}" + ;; - *) log $NODE_TYPE - usage "Could not determine node type. Expected (main/child)" - ;; -esac + *) log $NODE_TYPE + usage "Could not determine node type. 
Expected (main/child)" + ;; + esac else echo "ERROR: unknown execution type '${EXEC_TYPE}'" From 4454a6da59ab4e15589eed65eb91efb5bd48a78b Mon Sep 17 00:00:00 2001 From: asalic Date: Wed, 10 Mar 2021 11:41:32 +0100 Subject: [PATCH 82/84] rm architrave example --- examples/architrave/Dockerfile | 44 ------ examples/architrave/LICENSE | 201 ------------------------- examples/architrave/README.md | 134 ----------------- examples/architrave/batch.yaml | 23 --- examples/architrave/debs.lst | 25 ---- examples/architrave/lambda.yaml | 16 -- examples/architrave/run.sh | 234 ------------------------------ examples/architrave/run_batch.sh | 1 - examples/architrave/run_helper.sh | 11 -- 9 files changed, 689 deletions(-) delete mode 100755 examples/architrave/Dockerfile delete mode 100755 examples/architrave/LICENSE delete mode 100755 examples/architrave/README.md delete mode 100755 examples/architrave/batch.yaml delete mode 100755 examples/architrave/debs.lst delete mode 100755 examples/architrave/lambda.yaml delete mode 100644 examples/architrave/run.sh delete mode 100755 examples/architrave/run_batch.sh delete mode 100755 examples/architrave/run_helper.sh diff --git a/examples/architrave/Dockerfile b/examples/architrave/Dockerfile deleted file mode 100755 index f742363d..00000000 --- a/examples/architrave/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -FROM debian:stretch-slim - -ARG ADD_BASE_DIR_ARCHITRAVE=examples/architrave -ARG ADD_PRIVATE_BASE_DIR -ARG BUILD_PACKAGES=' make gcc g++ iproute2 cmake build-essential gfortran curl locales ' - -ENV DEBIAN_FRONTEND=noninteractive -## Set to either lambda or batch -ENV EXEC_TYPE=lambda - -ENV APP_IN_FILE=/opt/examples/example -ENV TMP_OUTPUT_DIR=/tmp -ENV APP_BIN=/opt/simest -ENV APP_PARAMS1="" -ENV APP_PARAMS2="" -#ENV S3_BUCKET="s3://scar-architrave" -#ENV S3_BATCH_DEPS_REL_PATH=batch/deps.tar.gz -#ENV S3_BATCH_PRIVATE_REL_PATH= -ENV PRIVATE_PASSWD= - -ADD ${ADD_PRIVATE_BASE_DIR} ${ADD_BASE_DIR_ARCHITRAVE}/run.sh ${ADD_BASE_DIR_ARCHITRAVE}/debs.lst /opt/ - -RUN apt-get update \ - && apt-get install -y $BUILD_PACKAGES p7zip-full wget unzip \ - && locale-gen en_US.UTF-8 \ - && sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ - && dpkg-reconfigure --frontend=noninteractive locales \ - && update-locale LANG=en_US.UTF-8 \ - && wget -q --no-check-certificate -qO- https://download.open-mpi.org/release/open-mpi/v1.4/openmpi-1.4.3.tar.bz2 | tar xvfj - -C /tmp/ \ - && cd /tmp/openmpi-1.4.3/ \ - && ./configure --disable-pty-support --disable-doc \ - && make -j8 \ - && make install \ - && wget -q -P /tmp -i /opt/debs.lst \ - && apt-get remove --purge -y $BUILD_PACKAGES gnupg* gnupg-agent* \ - && apt-get autoremove --purge -y \ - && dpkg --force-all -i /tmp/*.deb \ - && ulimit -n 1024 \ - && rm -rf /tmp/* /var/lib/apt/lists/* \ - && chmod 755 /opt/run.sh \ - && echo $(date) > /build_date \ - && echo "Build date: $(cat /build_date)" - -CMD /opt/run.sh diff --git a/examples/architrave/LICENSE b/examples/architrave/LICENSE deleted file mode 100755 index 261eeb9e..00000000 --- a/examples/architrave/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/examples/architrave/README.md b/examples/architrave/README.md deleted file mode 100755 index b0eaa62a..00000000 --- a/examples/architrave/README.md +++ /dev/null @@ -1,134 +0,0 @@ -# architrave -Running a commercial app in a Docker container on Lambda and Batch - -## Building the containers - -Due to the differences between Amazon Batch and Lambda and our choice to have one Dockerfile for both, there are some things one has to take into account when running the containers with SCAR. - -### Lambda - -We included all necessary operations in the Dockerfile, therefore leaving the runtime execution populated only with the application execution itself. -The Docker image doesn't have to be public, we can build it locally. - -``` -# The base dir is the root context for Docker -# We assume that you cloned this repo in the BASE_DIR -export BASE_DIR=/tmp -# The base dir with the private bits; It must be a child of BASE_DIR -export ADD_PRIVATE_BASE_DIR=architrave - -docker build --build-arg ADD_BASE_DIR_ARCHITRAVE=scar/examples/architrave --build-arg ADD_PRIVATE_BASE_DIR="$ADD_PRIVATE_BASE_DIR" -f "$BASE_DIR/scar/examples/architrave/Dockerfile" --label architrave -t architrave "$BASE_DIR" -``` - -Take into account that the input files must be located in the __ADD_PRIVATE_BASE_DIR__ directory. -e.g. if you have something like `$BASE_DIR/$ADD_PRIVATE_BASE_DIR/examples/example_input.file`, then you the example input ends up on the following path: `/opt/examples/example_input.file` - -If you want to run the container locally before launching it on Amazon Lambda, you can use the following: - -``` -# This is the path inside the container where the binary can be found -# it is the relative path of ADD_PRIVATE_BASE_DIR without the root -# e.g. for architrave/path/path2/execute_me (where ADD_PRIVATE_BASE_DIR=architrave) is path/path2/execute_me -export APP_BIN=/opt/ -# The full list of params needed for the app, don't forget the (double) quotes when there are spaces -export APP_PARAMS= - -# Mount the results dir you specify in the APP_PARAMS env variable to -docker run -d -e EXEC_TYPE=lambda -e APP_BIN="$APP_BIN" -e APP_PARAMS="$APP_PARAMS" --name architrave_local -v /tmp/architrave-result:/ architrave:latest -``` - -#### Build context - -You can ignore everything but the private files and those from ##scar/examples/architrave## by creating a `.dockerignore` file in the root of the context with the following content: - -``` -# Ignore everything -** - -# Allow files and directories -!/architrave/** -!/scar/examples/architrave/** - -# Ignore unnecessary files inside allowed directories -# This should go after the allowed directories -**/batch.yaml -**/lambda.yaml -**/run_helper.sh -**/README.md -**/LICENSE -``` - -#### Execution - -Due to the fact that we included the private files in our image, we have to launch it from a private location. -For the sake of this example, we use the environment where we built the image in the previously steps. -First, we dump the docker image and compress i t with gzip. - -`sudo docker save scar-architrave > /tmp/architrave-docker.img` - -The image we just dumped should have less than 256MB. -Before launch, check the RAM and timeout set in the SCAR configuration file, this example requires at least 1.2GB RAM and 15 seconds. -The architrave's folder includes an example launch configuration yaml file used to init and run the application on lambda, __lambda.yaml__. 
-Please modify the env variables needed by the application, uncomment the export of the **INPUT_FILE_PATH** env variable, and set the correct path of the __run_helper.sh__ launch script in the launch configuration file. -This intermediate script launches __run.sh__ (that is found in the Docker image) that actually executes the application on Amazon Lambda or Batch. -With scar installed, we can now create the lambda function using the example yaml file included with this example as a base: - -`scar init -f lambda.yaml` - -To execute the function simply run: - -`scar run -f lambda.yaml` - -Depending on the output Amazon S3 bucket/folder you have selected in the __lambda.yaml__ launch configuration, you should find the output files of the application and a __time.log__ file containing the execution log of the application. - -### Batch - -#### Batch additional packages required on S3 - -Start a Docker container based on the image of the distribution you use __to run on AWS__ the legacy application (not the distribution __of__ the legacy application). - -`docker run -it -v /tmp/deps:/tmp/deps debian:stretch-slim` - -In the running container: - -``` -# determine all of the dependencies needed by the packages we want to install: -apt update && apt install -y apt-rdepends && \ -apt-rdepends openssh-server openssh-client iproute2 inotify-tools locales | sed -E -e 's/^\s*Depends:\s*|^\s*PreDepends:\s*|\s*\(.*\)//g' | sort | uniq > /tmp/deps_tmp.lst &&\ -apt-get --purge autoremove -y apt-rdepends && \ -# filter out already installed packages (since we use the same base distro to get that packages and to run the legacy app) -apt list --installed | sed -E -e 's/\/.*//g' > /tmp/deps_installed.lst && \ -grep -F -v -f /tmp/deps_installed.lst /tmp/deps_tmp.lst > /tmp/deps.lst && \ -# download the list of packages, but don't install them -cd /tmp/deps && apt-get download $(cat /tmp/deps.lst) && \ -# Create the list of deps in a file; This file is used to download the required deps from an S3 bucket -ls -1 /tmp/deps > /tmp/deps/deps_batch.lst - -``` - -Since __/tmp/deps__ is shared between the host and the container, the downloaded debs can now be added to __deps.tar.gz__ archive and uploaded to an Amazon S3 bucket (defaults to **scar-architrave/batch**). -The same S3 bucket/folder should contain a 7z archive called __private.7z__ that contains the architrave executable, the example(s), and the private/public ssh keys used for communication between the nodes. -If the 7z is protected by a password, set the password via the env variable **PRIVATE_PASSWD** in the __run_helper.sh__ script. -The ssh keys should be named __ssh_host_rsa_key.pub__ for the public key and __ssh_host_rsa_key__ for the private key. - - -#### Batch execution - -Amazon batch execution is based on events. -In the example launch configuration file __batch.yaml__, the **input** section specifies an Amazon S3 bucket/folder that is monitorized for changes. -Please modify the env variables needed by the application and set the correct path of the __run_helper.sh__ launch script in the launch configuration file. -Set the Docker repo/image in __batch.yaml__. - -There are two modes to execute on batch: single node or parallel multinode. -For the former case, be sure that **functions.aws.batch.multi_node_parallel.enabled** to false, and uncomment the export of **AWS_BATCH_JOB_NUM_NODES**, **AWS_BATCH_JOB_NODE_INDEX**, and **AWS_BATCH_JOB_MAIN_NODE_INDEX** in the __run_helper.sh__. 
-For the latter execution mode, enable the variable in __batch.yaml__ and leave the three env variables commented out. - -Once everything is set, use SCAR to init the deployment: - -`scar init -f batch.yaml` - -Nest, start the execution by uploading the customized __run_helper.sh__ script to S3 (using the default S3 bucket ): - -`aws s3 cp run_helper.sh s3://scar-architrave/input` - -This script gets executed by __run_batch.sh__. diff --git a/examples/architrave/batch.yaml b/examples/architrave/batch.yaml deleted file mode 100755 index c001c4e2..00000000 --- a/examples/architrave/batch.yaml +++ /dev/null @@ -1,23 +0,0 @@ -functions: - aws: - - lambda: - name: scar-architrave - log_level: DEBUG - init_script: /tmp/run_batch.sh - execution_mode: batch - container: - image: - environment: - Variables: - EXEC_TYPE: batch - input: - - storage_provider: s3 - path: scar-architrave/input - output: - - storage_provider: s3 - path: scar-architrave/output - batch: - multi_node_parallel: - enabled: true - number_nodes: 3 - main_node_index: 0 diff --git a/examples/architrave/debs.lst b/examples/architrave/debs.lst deleted file mode 100755 index 5d97fc7f..00000000 --- a/examples/architrave/debs.lst +++ /dev/null @@ -1,25 +0,0 @@ -http://launchpadlibrarian.net/102057142/gcc-4.6-base_4.6.3-1ubuntu5_amd64.deb -http://launchpadlibrarian.net/79554367/libamd2.2.0_3.4.0-2ubuntu3_amd64.deb -http://launchpadlibrarian.net/79562140/libatlas3gf-base_3.8.4-3build1_amd64.deb -http://launchpadlibrarian.net/93163952/libblacs-mpi1_1.1-31ubuntu1_amd64.deb -http://launchpadlibrarian.net/79432349/libblas3gf_1.2.20110419-2ubuntu1_amd64.deb -http://launchpadlibrarian.net/102057179/libgfortran3_4.6.3-1ubuntu5_amd64.deb -http://launchpadlibrarian.net/86855144/libhdf5-openmpi-1.8.4_1.8.4-patch1-3ubuntu2_amd64.deb -http://launchpadlibrarian.net/57731806/libhypre-2.4.0_2.4.0b-7_amd64.deb -http://launchpadlibrarian.net/83117384/libibverbs1_1.1.5-1ubuntu1_amd64.deb -http://launchpadlibrarian.net/79599965/libmumps-4.9.2_4.9.2.dfsg-7build1_amd64.deb -http://launchpadlibrarian.net/92313905/libnuma1_2.0.8~rc3-1_amd64.deb -http://launchpadlibrarian.net/88908426/libpetsc3.1_3.1.dfsg-11ubuntu1_amd64.deb -http://launchpadlibrarian.net/102057147/libquadmath0_4.6.3-1ubuntu5_amd64.deb -http://launchpadlibrarian.net/88918631/libscalapack-mpi1_1.8.0-7build1_amd64.deb -http://launchpadlibrarian.net/88960741/libscotch-5.1_5.1.12b.dfsg-1_amd64.deb -http://launchpadlibrarian.net/88523913/libspooles2.2_2.2-9_amd64.deb -http://launchpadlibrarian.net/12667499/libsuperlu3_3.0+20070106-3_amd64.deb -http://launchpadlibrarian.net/206369988/libtorque2_2.4.16+dfsg-1+deb7u4build0.12.04.1_amd64.deb -http://launchpadlibrarian.net/79554377/libumfpack5.4.0_3.4.0-2ubuntu3_amd64.deb -http://launchpadlibrarian.net/202507343/libx11-6_1.4.99.1-0ubuntu2.3_amd64.deb -http://launchpadlibrarian.net/202507376/libx11-data_1.4.99.1-0ubuntu2.3_all.deb -http://launchpadlibrarian.net/84788368/libxau6_1.0.6-4_amd64.deb -http://launchpadlibrarian.net/140864111/libxcb1_1.8.1-1ubuntu0.2_amd64.deb -http://launchpadlibrarian.net/84805563/libxdmcp6_1.1.0-4_amd64.deb -http://launchpadlibrarian.net/88909481/mpi-default-bin_1.0.1_amd64.deb diff --git a/examples/architrave/lambda.yaml b/examples/architrave/lambda.yaml deleted file mode 100755 index 44281fa6..00000000 --- a/examples/architrave/lambda.yaml +++ /dev/null @@ -1,16 +0,0 @@ -functions: - aws: - - lambda: - name: scar-architrave - init_script: /tmp/run_lambda.sh - run_script: /tmp/run_lambda.sh - container: - image_file: 
/tmp/architrave-docker.img - environment: - Variables: - EXEC_TYPE: lambda - deployment: - bucket: scar-architrave - output: - - storage_provider: s3 - path: scar-architrave/output diff --git a/examples/architrave/run.sh b/examples/architrave/run.sh deleted file mode 100644 index 8d788074..00000000 --- a/examples/architrave/run.sh +++ /dev/null @@ -1,234 +0,0 @@ -#!/bin/bash -echo "Executing as AWS ${EXEC_TYPE}" -echo "Build date: $(cat /build_date)" -echo "Runing as: ${USER} home @ ${HOME}" -echo "Running with interpreter: $(readlink -f $(which sh))" - -log () { - echo "${BASENAME} - ${1}" -} - -# Standard function to print an error and exit with a failing return code -error_exit () { - log "${BASENAME} - ${1}" >&2 - log "${2:-1}" > $AWS_BATCH_EXIT_CODE_FILE - kill $(cat /tmp/supervisord.pid) -} - -usage () { - if [ "${#@}" -ne 0 ]; then - log "* ${*}" - log - fi - cat < $ip:$availablecores" - echo "$ip slots=$availablecores" >> $HOST_FILE_PATH - - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - i=0 - numCyclesWait=30 - while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$lines" ] && [ "$i" -lt "$numCyclesWait" ] - do - log "$lines out of $AWS_BATCH_JOB_NUM_NODES nodes joined, check again in 3 seconds" - sleep 3 - lines=$(sort $HOST_FILE_PATH|uniq|wc -l) - ((i=i+1)) - done - - if [ "$i" -eq "$numCyclesWait" ]; then - echo "children did not join" - exit 1 - fi - - # Make the temporary file executable and run it with any given arguments - log "All nodes successfully joined" - - # remove duplicates if there are any. - awk '!a[$0]++' $HOST_FILE_PATH > ${HOST_FILE_PATH}-deduped - cat $HOST_FILE_PATH-deduped - log "executing main MPIRUN workflow" - - chmod +x ${APP_BIN} - # --allow-run-as-root - { time mpirun --mca btl_tcp_if_include eth0 --debug-daemons -x PATH -x LD_LIBRARY_PATH --machinefile ${HOST_FILE_PATH}-deduped \ - ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > ${TMP_OUTPUT_DIR}/time.log - sleep 2 - echo 'Exec output:' - cat ${TMP_OUTPUT_DIR}/time.log - - #if [ "${NODE_TYPE}" = 'main' ]; then - # env GZIP=-9 tar -czvf $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $SCRATCH_DIR/output/* - # aws s3 cp $SCRATCH_DIR/batch_output_${AWS_BATCH_JOB_ID}.tar.gz $S3_BUCKET/output/batch_output_${AWS_BATCH_JOB_ID}.tar.gz - #fi - - #log "done! 
goodbye, writing exit code to $AWS_BATCH_EXIT_CODE_FILE and shutting down my supervisord" - #echo "0" > $AWS_BATCH_EXIT_CODE_FILE - #kill $(cat /tmp/supervisord.pid) - #echo "#!/bin/bash" > /tmp/exec/docker_done - #echo "env GZIP=-9 tar -czvf /mnt/batch/output/result.tar.gz /mnt/batch/output/*" > /tmp/exec/docker_done - #echo "/usr/local/bin/aws s3 cp /mnt/batch/output/result.tar.gz s3://scar-architrave/output/result_$(date | tr ' ' _ ).tar.gz" >> /tmp/exec/docker_done - log "Signaling children to exit" - cat ${HOST_FILE_PATH}-deduped | awk -F_ '{print $1}' | xargs -I{} -n1 ssh {} "touch ${BATCH_SIGNAL_DIR}/master_done/done" - - log "Wait for children to finish their execution" - num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) - while [ "$AWS_BATCH_JOB_NUM_NODES" -gt "$((num_finished+1))" ] - do - log "$num_finished out of $AWS_BATCH_JOB_NUM_NODES nodes are done, check again in 1 second" - sleep 1 - num_finished=$(ls ${BATCH_SIGNAL_DIR}/workers_done/|uniq|wc -l) - done - - #while inotifywait /tmp/exec -e create; do { echo "EC2 host post-execution process completed, exiting container"; break; }; done - exit 0 -} - -# Fetch and run a script -report_to_master () { - # get own ip and num cpus - # - ip=$(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1) - - if [ -x "$(command -v nvidia-smi)" ] ; then - NUM_GPUS=$(ls -l /dev/nvidia[0-9] | wc -l) - availablecores=$NUM_GPUS - else - availablecores=$(nproc) - fi - - log "I am a child node -> $ip:$availablecores, reporting to the master node -> ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" -# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv -# sleep 15 -# echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" -vvv - - until echo "$ip slots=$availablecores" | ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "cat >> /$HOST_FILE_PATH" - do - echo "Sleeping 5 seconds and trying again" - sleep 5 - done - - echo "Wait for master to finish" - while inotifywait ${BATCH_SIGNAL_DIR}/master_done -e create; do { echo "Child ${ip} has finished its execution, done! 
goodbye"; break; }; done - ssh ${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS} "touch ${BATCH_SIGNAL_DIR}/workers_done/${ip}" - exit 0 -} - -if [ "${EXEC_TYPE,,}" = 'lambda' ]; then - export OMPI_MCA_plm_rsh_agent=/bin/false - { time mpirun -np 1 --debug-daemons ${APP_BIN} ${APP_IN_FILE} ${APP_PARAMS1} ${TMP_OUTPUT_DIR} ${APP_PARAMS2}; } 2>&1 | cat > $TMP_OUTPUT_DIR/time.log - -elif [ "${EXEC_TYPE,,}" = 'batch' ]; then - -# The following comment line will be replaced with the necessary env vars: -#=ENV_VARS= - export AWS_BATCH_EXIT_CODE_FILE=~/batch_exit_code.file - echo "Running on node index $AWS_BATCH_JOB_NODE_INDEX out of $AWS_BATCH_JOB_NUM_NODES nodes" - echo "Master node index is $AWS_BATCH_JOB_MAIN_NODE_INDEX and its IP is $AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS" - - cd /tmp - wget -nc -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip - unzip -qq awscli-exe-linux-x86_64.zip - chmod +x aws/install - ./aws/install - - /usr/local/bin/aws configure set default.s3.max_concurrent_requests 30 - /usr/local/bin/aws configure set default.s3.max_queue_size 10000 - /usr/local/bin/aws configure set default.s3.multipart_threshold 64MB - /usr/local/bin/aws configure set default.s3.multipart_chunksize 16MB - /usr/local/bin/aws configure set default.s3.max_bandwidth 4096MB/s - /usr/local/bin/aws configure set default.s3.addressing_style path - - - mkdir -p /tmp/deps - /usr/local/bin/aws s3 cp s3://scar-architrave/batch/private.7z /tmp - /usr/local/bin/aws s3 cp s3://scar-architrave/batch/deps.tar.gz /tmp - tar -zxf /tmp/deps.tar.gz -C /tmp/deps - dpkg -i /tmp/deps/*.deb - - echo "Add private data from S3" - 7z x -aoa -p${PRIVATE_PASSWD} -o/opt /tmp/*.7z - - echo "Configure ssh" - sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - echo "export VISIBLE=now" >> /etc/profile - echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers - mkdir -p ${HOME}/.ssh - touch ${HOME}/.ssh/sshd_config - #ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' - cat /opt/ssh_host_rsa_key.pub > ${HOME}/.ssh/authorized_keys - cp /opt/ssh_host_rsa_key ${HOME}/.ssh/id_rsa - echo " IdentityFile ${HOME}/.ssh/id_rsa" >> /etc/ssh/ssh_config - echo "Host *" >> /etc/ssh/ssh_config - echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config - chmod -R 600 ${HOME}/.ssh/* - chown -R ${USER}:${USER} ${HOME}/.ssh/ - # check if ssh agent is running or not, if not, run - eval `ssh-agent -s` - ssh-add ${HOME}/.ssh/id_rsa - - chmod +x ${APP_BIN} - service ssh status - service ssh restart - service ssh status - BASENAME="${0##*/}" - HOST_FILE_PATH="/tmp/hostfile" - AWS_BATCH_EXIT_CODE_FILE="/tmp/batch-exit-code" - - BATCH_SIGNAL_DIR=/tmp/batch - if [ -d "${BATCH_SIGNAL_DIR}" ]; then rm -Rf ${BATCH_SIGNAL_DIR}; fi - mkdir -p ${BATCH_SIGNAL_DIR}/master_done - mkdir -p ${BATCH_SIGNAL_DIR}/workers_done - sleep 2 - - # Set child by default switch to main if on main node container - NODE_TYPE="child" - if [ "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" == "${AWS_BATCH_JOB_NODE_INDEX}" ]; then - log "Running synchronize as the main node" - NODE_TYPE="main" - fi - - - # Main - dispatch user request to appropriate function - log $NODE_TYPE - case $NODE_TYPE in - main) - wait_for_nodes "${@}" - ;; - - child) - report_to_master "${@}" - ;; - - *) log $NODE_TYPE - usage "Could not determine node type. 
Expected (main/child)" - ;; - esac -else - echo "ERROR: unknown execution type '${EXEC_TYPE}'" - exit 1 # terminate and indicate error -fi diff --git a/examples/architrave/run_batch.sh b/examples/architrave/run_batch.sh deleted file mode 100755 index 7d22f90e..00000000 --- a/examples/architrave/run_batch.sh +++ /dev/null @@ -1 +0,0 @@ -bash $INPUT_FILE_PATH diff --git a/examples/architrave/run_helper.sh b/examples/architrave/run_helper.sh deleted file mode 100755 index b2442dce..00000000 --- a/examples/architrave/run_helper.sh +++ /dev/null @@ -1,11 +0,0 @@ -# Uncomment AWS_BATCH_JOB_NUM_NODES,AWS_BATCH_JOB_NODE_INDEX, and AWS_BATCH_JOB_MAIN_NODE_INDEX when running batch on single node -# export AWS_BATCH_JOB_NUM_NODES=1 -# export AWS_BATCH_JOB_NODE_INDEX=0 -# export AWS_BATCH_JOB_MAIN_NODE_INDEX=0 - -export PRIVATE_PASSWD='' -export APP_PARAMS1='' -export APP_PARAMS2='' -export APP_BIN='' -export APP_IN_FILE='' -bash /opt/run.sh From 0f2cd94fd27aa8e2b402ea48ceff37151a73a7c4 Mon Sep 17 00:00:00 2001 From: Sebas Risco Date: Wed, 24 Mar 2021 11:40:28 +0100 Subject: [PATCH 83/84] Update docs --- docs/source/batch.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/batch.rst b/docs/source/batch.rst index bd12ac95..1cda3132 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -134,8 +134,6 @@ You can execute multinode parallel jobs in batch by enabling this mode either in You can also set the number of nodes and the index of the main node. Please take into account that the index of the main node starts from 0 up to the number of nodes -1. -We included two examples of MPI jobs that can be executed as multinode parallel jobs. -The first one, architrave, allows the execution of a commercial MPI application on AWS. -The second example, mpi, is just a hello world from each CPU/node available for execution. +We included an `example `_ of MPI job that can be executed as multinode parallel job, showing a hello world from each CPU/node available for execution. Both work in Amazon Lambda and Batch single node, you can use the included configuration files as a starting point. -For more details, please check the README.md that comes with each example. +For more details, please check the README.md that comes with the example. From 9e6a4cfa89ef073fdbcc605a79b7b10c989444a7 Mon Sep 17 00:00:00 2001 From: Sebas Risco Date: Wed, 24 Mar 2021 11:41:16 +0100 Subject: [PATCH 84/84] Bump --- scar/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scar/version.py b/scar/version.py index 45f1fdec..924fde74 100644 --- a/scar/version.py +++ b/scar/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '4.1.0' +__version__ = '4.2.0'
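For completeness, the end-to-end flow for the multinode parallel documentation above looks roughly as follows. This is only a sketch: the configuration file name, bucket and input file are illustrative placeholders, and the mpi example's README remains the authoritative reference.

```
# Deploy the Lambda function, the Batch compute environment and the S3 triggers
scar init -f batch.yaml

# Batch executions are event-driven: uploading a file to the configured
# input path submits the (multinode parallel) job
aws s3 cp my-input-file s3://my-scar-bucket/input/

# When the job finishes, results appear under the configured output path
aws s3 ls s3://my-scar-bucket/output/
```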