Skip to content

Commit

Permalink
[AIRFLOW-4116] Dockerfile now supports CI image build on DockerHub (#…
Browse files Browse the repository at this point in the history
…4937)

(cherry picked from commit 78c592a)
  • Loading branch information
potiuk committed Jul 20, 2019
1 parent 7bef28a commit edaca51
Show file tree
Hide file tree
Showing 3 changed files with 698 additions and 33 deletions.
226 changes: 193 additions & 33 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
ARG APT_DEPS_IMAGE="airflow-apt-deps"
ARG PYTHON_BASE_IMAGE="python:3.6-slim"
############################################################################################################
# This is base image with APT dependencies needed by Airflow. It is based on a python slim image
# This is the base image with APT dependencies needed by Airflow. It is based on a python slim image
# Parameters:
# PYTHON_BASE_IMAGE - base python image (python:x.y-slim)
############################################################################################################
Expand All @@ -40,7 +40,7 @@ ENV AIRFLOW_VERSION=$AIRFLOW_VERSION
RUN echo "Base image: ${PYTHON_BASE_IMAGE}"
RUN echo "Airflow version: ${AIRFLOW_VERSION}"

# Make sure noninteractie debian install is used and language variab1les set
# Make sure noninteractive debian install is used and language variables set
ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8

Expand All @@ -49,10 +49,11 @@ ARG DEPENDENCIES_EPOCH_NUMBER="1"
# Increase the value below to force reinstalling of all dependencies
ENV DEPENDENCIES_EPOCH_NUMBER=${DEPENDENCIES_EPOCH_NUMBER}

# Install curl and gnupg2 - needed to download nodejs in next step
# Install curl and gnupg2 - needed to download nodejs in the next step
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
curl gnupg2 \
curl \
gnupg2 \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
Expand All @@ -62,22 +63,163 @@ RUN apt-get update \
RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
# Packages to install \
libsasl2-dev freetds-bin build-essential sasl2-bin \
libsasl2-2 libsasl2-dev libsasl2-modules \
default-libmysqlclient-dev apt-utils curl rsync netcat locales \
freetds-dev libkrb5-dev libssl-dev libffi-dev libpq-dev git \
nodejs gosu sudo \
apt-utils \
build-essential \
curl \
dirmngr \
freetds-bin \
freetds-dev \
git \
gosu \
libffi-dev \
libkrb5-dev \
libpq-dev \
libsasl2-2 \
libsasl2-dev \
libsasl2-modules \
libssl-dev \
locales \
netcat \
nodejs \
rsync \
sasl2-bin \
sudo \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install MySQL client from Oracle repositories (Debian installs mariadb).
# The repository signing key is received into a throw-away GNUPGHOME from the first
# key server that answers (the list is shuffled on every build), exported to apt-key,
# and the temporary keyring is removed again before the packages are installed.
# Every step is chained with && so that a failure anywhere aborts the build.
RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \
    && GNUPGHOME="$(mktemp -d)" \
    && export GNUPGHOME \
    && for KEYSERVER in $(shuf -e \
            ha.pool.sks-keyservers.net \
            hkp://p80.pool.sks-keyservers.net:80 \
            keyserver.ubuntu.com \
            hkp://keyserver.ubuntu.com:80 \
            pgp.mit.edu) ; do \
        gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true ; \
    done \
    && gpg --export "${KEY}" | apt-key add - \
    && gpgconf --kill all \
    && rm -rf "${GNUPGHOME}" \
    && apt-key list > /dev/null \
    && echo "deb http://repo.mysql.com/apt/ubuntu/ trusty mysql-5.6" | tee -a /etc/apt/sources.list.d/mysql.list \
    && apt-get update \
    && apt-get install --no-install-recommends -y \
        libmysqlclient-dev \
        mysql-client \
    && apt-get autoremove -yqq --purge \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create the "airflow" user and give it password-less sudo through a dedicated sudoers drop-in.
RUN adduser airflow \
    && printf '%s\n' "airflow ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/airflow \
    && chmod 0440 /etc/sudoers.d/airflow

############################################################################################################
# This is the target image - it installs PIP and NPN dependencies including efficient caching
# This is an image with all APT dependencies needed by CI. It is built on top of the airflow APT image
# Parameters:
# airflow-apt-deps - this is the base image for CI deps image.
############################################################################################################
FROM airflow-apt-deps AS airflow-ci-apt-deps

# Run every RUN instruction of this stage under bash with pipefail, errexit, nounset and xtrace.
SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]

ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/

# Name of the APT dependencies image being built; the RUN steps of this stage
# compare it against "airflow-ci-apt-deps" to decide whether to do any work.
ARG APT_DEPS_IMAGE
ENV APT_DEPS_IMAGE=${APT_DEPS_IMAGE}

RUN echo "${APT_DEPS_IMAGE}"

# Note: the ifs below might be removed once BuildKit becomes usable. It should skip building this
# image automatically if it is not used. For now we still go through all layers below, but they are empty.
# Install the CI-only APT packages when the CI variant of the image is being built.
# The man1/man7 directories are created up front - note missing man directories on debian-stretch:
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199
RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-ci-apt-deps" ]]; then \
        mkdir -pv /usr/share/man/man1 /usr/share/man/man7 \
        && apt-get update \
        && apt-get install --no-install-recommends -y \
            gnupg \
            krb5-user \
            ldap-utils \
            less \
            lsb-release \
            net-tools \
            openjdk-8-jdk \
            openssh-client \
            openssh-server \
            postgresql-client \
            python-selinux \
            sqlite3 \
            tmux \
            unzip \
            vim \
        && apt-get autoremove -yqq --purge \
        && apt-get clean \
        && rm -rf /var/lib/apt/lists/* ; \
    fi

ENV HADOOP_DISTRO=cdh \
HADOOP_MAJOR=5 \
HADOOP_DISTRO_VERSION=5.11.0 \
HADOOP_VERSION=2.6.0 \
HIVE_VERSION=1.1.0
ENV HADOOP_URL=https://archive.cloudera.com/${HADOOP_DISTRO}${HADOOP_MAJOR}/${HADOOP_DISTRO}/${HADOOP_MAJOR}/
ENV HADOOP_HOME=/tmp/hadoop-cdh HIVE_HOME=/tmp/hive

# Create the Hadoop, Hive and MiniCluster directories (CI image only) and make the
# Hive home and the /user tree world-writable.
RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-ci-apt-deps" ]]; then \
        mkdir -pv "${HADOOP_HOME}" "${HIVE_HOME}" \
        && mkdir /tmp/minicluster \
        && mkdir -pv /user/hive/warehouse \
        && chmod -R 777 "${HIVE_HOME}" /user/ ; \
    fi
# Install Hadoop (CI image only): download the distribution tarball and unpack it into HADOOP_HOME.
# --absolute-names is a workaround to avoid this issue https://github.com/docker/hub-feedback/issues/727
# curl runs with --fail so that an HTTP error aborts the build right away instead of
# saving the server's error page as the archive.
RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-ci-apt-deps" ]]; then \
        HADOOP_ARCHIVE_URL="${HADOOP_URL}hadoop-${HADOOP_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \
        && HADOOP_TMP_FILE=/tmp/hadoop.tar.gz \
        && curl -fsSL "${HADOOP_ARCHIVE_URL}" -o "${HADOOP_TMP_FILE}" \
        && tar xzf "${HADOOP_TMP_FILE}" --absolute-names --strip-components 1 -C "${HADOOP_HOME}" \
        && rm "${HADOOP_TMP_FILE}" ; \
    fi

# Install Hive (CI image only): download the distribution tarball and unpack it into HIVE_HOME.
# curl runs with --fail so that an HTTP error aborts the build right away instead of
# saving the server's error page as the archive.
RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-ci-apt-deps" ]]; then \
        HIVE_URL="${HADOOP_URL}hive-${HIVE_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \
        && HIVE_TMP_FILE=/tmp/hive.tar.gz \
        && curl -fsSL "${HIVE_URL}" -o "${HIVE_TMP_FILE}" \
        && tar xzf "${HIVE_TMP_FILE}" --strip-components 1 -C "${HIVE_HOME}" \
        && rm "${HIVE_TMP_FILE}" ; \
    fi

ENV MINICLUSTER_URL=https://github.com/bolkedebruin/minicluster/releases/download/
ENV MINICLUSTER_VER=1.1
# Install MiniCluster TODO: install it differently. Installing to /tmp is probably a bad idea
# Download the MiniCluster release zip and unpack it under /tmp (CI image only).
# curl runs with --fail so that an HTTP error aborts the build right away instead of
# saving the server's error page as the archive.
RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-ci-apt-deps" ]]; then \
        MINICLUSTER_ARCHIVE_URL="${MINICLUSTER_URL}${MINICLUSTER_VER}/minicluster-${MINICLUSTER_VER}-SNAPSHOT-bin.zip" \
        && MINICLUSTER_TMP_FILE=/tmp/minicluster.zip \
        && curl -fsSL "${MINICLUSTER_ARCHIVE_URL}" -o "${MINICLUSTER_TMP_FILE}" \
        && unzip "${MINICLUSTER_TMP_FILE}" -d /tmp \
        && rm "${MINICLUSTER_TMP_FILE}" ; \
    fi

# Put the Hive command line tools on the PATH (key=value form; the space-separated ENV form is legacy).
ENV PATH="${PATH}:/tmp/hive/bin"

############################################################################################################
# This is the target image - it installs PIP and NPM dependencies including efficient caching
# mechanisms - it might be used to build the bare airflow build or CI build
# Parameters:
# APT_DEPS_IMAGE - image with APT dependencies. It might either be base deps image with airflow
Expand All @@ -94,11 +236,22 @@ RUN echo "Airflow version: ${AIRFLOW_VERSION}"
ARG APT_DEPS_IMAGE
ENV APT_DEPS_IMAGE=${APT_DEPS_IMAGE}

ARG AIRFLOW_HOME=/opt/airflow
ARG AIRFLOW_USER=airflow
ENV AIRFLOW_USER=${AIRFLOW_USER}

ARG HOME=/home/airflow
ENV HOME=${HOME}

ARG AIRFLOW_HOME=${HOME}/airflow
ENV AIRFLOW_HOME=${AIRFLOW_HOME}

ARG AIRFLOW_SOURCES=/opt/airflow
ENV AIRFLOW_SOURCES=${AIRFLOW_SOURCES}

# Create AIRFLOW_HOME together with its dags/ and logs/ subdirectories and hand the
# whole tree over to the airflow user. All paths go to a single mkdir call so that no
# stray "mkdir" directory is created by a missing command separator.
RUN mkdir -pv "${AIRFLOW_HOME}/dags" "${AIRFLOW_HOME}/logs" \
    && chown -R "${AIRFLOW_USER}:${AIRFLOW_USER}" "${AIRFLOW_HOME}"

# Increase the value here to force reinstalling Apache Airflow pip dependencies
ARG PIP_DEPENDENCIES_EPOCH_NUMBER="1"
Expand All @@ -113,7 +266,7 @@ ARG CASS_DRIVER_BUILD_CONCURRENCY="8"
ENV CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY}
ENV CASS_DRIVER_NO_CYTHON=${CASS_DRIVER_NO_CYTHON}

# By default PIP install is run without cache to make image smaller
# By default PIP install is run without cache to make the image smaller
ARG PIP_NO_CACHE_DIR="true"
ENV PIP_NO_CACHE_DIR=${PIP_NO_CACHE_DIR}
RUN echo "Pip no cache dir: ${PIP_NO_CACHE_DIR}"
Expand All @@ -125,16 +278,19 @@ RUN echo "Pip version: ${PIP_VERSION}"

RUN pip install --upgrade pip==${PIP_VERSION}

# Airflow sources change frequently but dependency onfiguration won't change that often
# We are copying everything with airflow:airflow user:group even if we use root to run the scripts
# This is fine as root user will be able to use those dirs anyway.

# Airflow sources change frequently but dependency configuration won't change that often
# We copy setup.py and other files needed to perform setup of dependencies
# This way cache here will only be invalidated if any of the
# version/setup configuration change but not when airflow sources change
COPY --chown=airflow:airflow setup.py /opt/airflow/setup.py
COPY --chown=airflow:airflow setup.cfg /opt/airflow/setup.cfg
COPY --chown=airflow:airflow setup.py ${AIRFLOW_SOURCES}/setup.py
COPY --chown=airflow:airflow setup.cfg ${AIRFLOW_SOURCES}/setup.cfg

COPY --chown=airflow:airflow airflow/version.py /opt/airflow/airflow/version.py
COPY --chown=airflow:airflow airflow/__init__.py /opt/airflow/airflow/__init__.py
COPY --chown=airflow:airflow airflow/bin/airflow /opt/airflow/airflow/bin/airflow
COPY --chown=airflow:airflow airflow/version.py ${AIRFLOW_SOURCES}/airflow/version.py
COPY --chown=airflow:airflow airflow/__init__.py ${AIRFLOW_SOURCES}/airflow/__init__.py
COPY --chown=airflow:airflow airflow/bin/airflow ${AIRFLOW_SOURCES}/airflow/bin/airflow

# Airflow Extras installed
ARG AIRFLOW_EXTRAS="all"
Expand All @@ -146,20 +302,18 @@ RUN echo "Installing with extras: ${AIRFLOW_EXTRAS}."
# And this Docker layer will be reused between builds.
RUN pip install --no-use-pep517 -e ".[${AIRFLOW_EXTRAS}]"

COPY --chown=airflow:airflow airflow/www/package.json /opt/airflow/airflow/www/package.json
COPY --chown=airflow:airflow airflow/www/package-lock.json /opt/airflow/airflow/www/package-lock.json
COPY --chown=airflow:airflow airflow/www/package.json ${AIRFLOW_SOURCES}/airflow/www/package.json
COPY --chown=airflow:airflow airflow/www/package-lock.json ${AIRFLOW_SOURCES}/airflow/www/package-lock.json

WORKDIR /opt/airflow/airflow/www
WORKDIR ${AIRFLOW_SOURCES}/airflow/www

# Install necessary NPM dependencies (triggered by changes in package-lock.json)
RUN gosu airflow npm ci
RUN gosu ${AIRFLOW_USER} npm ci

COPY --chown=airflow:airflow airflow/www/ /opt/airflow/airflow/www/
COPY --chown=airflow:airflow airflow/www/ ${AIRFLOW_SOURCES}/airflow/www/

# Package NPM for production
RUN gosu airflow npm run prod

WORKDIR /opt/airflow
RUN gosu ${AIRFLOW_USER} npm run prod

# Always apt-get update/upgrade here to get latest dependencies before
# we redo pip install
Expand All @@ -170,7 +324,9 @@ RUN apt-get update \

# Cache for this line will be automatically invalidated if any
# of airflow sources change
COPY --chown=airflow:airflow . /opt/airflow/
COPY --chown=airflow:airflow . ${AIRFLOW_SOURCES}/

WORKDIR ${AIRFLOW_SOURCES}

# Always apt-get update/upgrade here to get latest dependencies before
# we redo pip install
Expand All @@ -182,16 +338,20 @@ RUN apt-get update \
# Additional python deps to install
ARG ADDITIONAL_PYTHON_DEPS=""

RUN if [ -n "${ADDITIONAL_PYTHON_DEPS}" ]; then \
RUN if [[ -n "${ADDITIONAL_PYTHON_DEPS}" ]]; then \
pip install ${ADDITIONAL_PYTHON_DEPS}; \
fi

USER airflow
COPY --chown=airflow:airflow ./scripts/docker/entrypoint.sh /entrypoint.sh

WORKDIR ${AIRFLOW_HOME}
USER ${AIRFLOW_USER}

COPY --chown=airflow:airflow ./scripts/docker/entrypoint.sh /entrypoint.sh
WORKDIR ${AIRFLOW_SOURCES}

ENV PATH="${HOME}:${PATH}"

EXPOSE 8080

ENTRYPOINT ["/usr/local/bin/dumb-init", "--", "/entrypoint.sh"]

CMD ["--help"]
25 changes: 25 additions & 0 deletions Dockerfile-context
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# We use this small Dockerfile to find out the list of files that are part of the Docker context,
# i.e. not ignored by .dockerignore.
# We need it to fix permissions of files checked out by git to help with cache invalidation on different
# systems that have a different umask. See hooks/build for a detailed explanation.

# Small base image; this image only has to hold a copy of the build context.
FROM alpine:3.9

# Copy the whole build context into /context so that its file list can be inspected.
COPY . /context
Loading

0 comments on commit edaca51

Please sign in to comment.