ARG BASE_VERSION=v5
ARG SPARK_VERSION
ARG HADOOP_VERSION
ARG SCALA_VERSION
ARG JAVA_VERSION
ARG PYTHON_VERSION
FROM dsaidgovsg/spark-k8s-addons:${BASE_VERSION}_${SPARK_VERSION}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}_python-${PYTHON_VERSION} AS base
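# A minimal build sketch; every version below is illustrative, not pinned by this file:
#   docker build \
#     --build-arg SPARK_VERSION=3.1.1 \
#     --build-arg HADOOP_VERSION=3.2.0 \
#     --build-arg SCALA_VERSION=2.12 \
#     --build-arg JAVA_VERSION=11 \
#     --build-arg PYTHON_VERSION=3.8 \
#     --build-arg AIRFLOW_VERSION=2.2 \
#     --build-arg SQLALCHEMY_VERSION=1.3 \
#     -t airflow-pipeline .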
# Airflow will run as root instead of UID 185, the spark user meant for k8s
USER root
# Set up gosu
RUN set -euo pipefail && \
    apt-get update; \
    apt-get install -y --no-install-recommends gosu; \
    rm -rf /var/lib/apt/lists/*; \
    # Verify that the binary works
    gosu nobody true; \
    :
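# gosu is presumably consumed by entrypoint.sh to step down from root to an
# unprivileged user at runtime; that is an assumption about the entrypoint, not
# something this Dockerfile enforces. The trailing ":" above is a shell no-op
# that lets every real command end uniformly with "; \".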
# Set up Hadoop
ARG HADOOP_VERSION
## Other Spark / Airflow related defaults
ARG HADOOP_HOME="/opt/hadoop"
ENV HADOOP_HOME="${HADOOP_HOME}"
ARG HADOOP_CONF_DIR="/opt/hadoop/etc/hadoop"
ENV HADOOP_CONF_DIR="${HADOOP_CONF_DIR}"
RUN set -euo pipefail && \
    mkdir -p "$(dirname "${HADOOP_HOME}")"; \
    curl -LO "https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"; \
    tar xf "hadoop-${HADOOP_VERSION}.tar.gz"; \
    mv "hadoop-${HADOOP_VERSION}" "${HADOOP_HOME}"; \
    rm "hadoop-${HADOOP_VERSION}.tar.gz"; \
    # Install JARs to Hadoop external
    ## AWS S3 JARs
    ## Derive the aws-java-sdk version dynamically from the Hadoop version
    ## Do not use head -n1: it closes the pipe early, which makes the upstream
    ## command exit with code 141 (SIGPIPE) and fail the build under pipefail
    AWS_JAVA_SDK_VERSION="$(curl https://raw.githubusercontent.com/apache/hadoop/branch-${HADOOP_VERSION}/hadoop-project/pom.xml | grep -A1 aws-java-sdk | grep -oE "[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+" | tr "\r\n" " " | cut -d " " -f 1)"; \
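    ## Example of the resolution above: with HADOOP_VERSION=3.2.0 this fetches the
    ## branch-3.2.0 hadoop-project/pom.xml and keeps the first x.y.z match after the
    ## aws-java-sdk property (a 1.11.x release for that branch, as an illustration)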
cd "${HADOOP_HOME}/share/hadoop/hdfs/"; \
curl -LO "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"; \
curl -LO "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar"; \
cd -; \
printf "\
<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n\
<configuration>\n\
<property>\n\
<name>fs.s3a.impl</name>\n\
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>\n\
</property>\n\
</configuration>\n" > ${HADOOP_CONF_DIR}/core-site.xml; \
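    ## With S3AFileSystem registered above, Hadoop/Spark jobs can address buckets
    ## via s3a:// URIs, e.g. `hadoop fs -ls s3a://some-bucket/` (bucket illustrative)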
    ## Google Storage JAR
    cd "${HADOOP_HOME}/share/hadoop/hdfs/"; \
    curl -LO https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar; \
    cd -; \
    cd "${HADOOP_HOME}/share/hadoop/tools/lib"; \
    ## MariaDB JAR
    curl -LO https://downloads.mariadb.com/Connectors/java/connector-java-2.4.0/mariadb-java-client-2.4.0.jar; \
    ## Postgres JDBC JAR
    curl -LO https://jdbc.postgresql.org/download/postgresql-42.6.0.jar; \
    cd -; \
    :
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
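# With ${HADOOP_HOME}/bin on PATH, the hadoop CLI is usable inside the image,
# e.g. `hadoop version`, or the `hadoop fs -ls s3a://...` check sketched above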
# Set up Airflow via poetry
ARG AIRFLOW_VERSION
ENV AIRFLOW_VERSION="${AIRFLOW_VERSION}"
ARG SQLALCHEMY_VERSION
ENV SQLALCHEMY_VERSION="${SQLALCHEMY_VERSION}"
RUN set -euo pipefail && \
    # Airflow and SQLAlchemy
    # Postgres dev prereqs to install Airflow
    apt-get update; \
    apt-get install -y --no-install-recommends build-essential libpq5 libpq-dev; \
    ## These two version numbers can take MAJ.MIN[.PAT]
    if [ -z "${AIRFLOW_VERSION}" ]; then >&2 echo "Please specify AIRFLOW_VERSION" && exit 1; fi; \
    if [ -z "${SQLALCHEMY_VERSION}" ]; then >&2 echo "Please specify SQLALCHEMY_VERSION" && exit 1; fi; \
    AIRFLOW_NORM_VERSION="$(printf "%s.%s" "${AIRFLOW_VERSION}" "*" | cut -d '.' -f1,2,3)"; \
    SQLALCHEMY_NORM_VERSION="$(printf "%s.%s" "${SQLALCHEMY_VERSION}" "*" | cut -d '.' -f1,2,3)"; \
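    ## Normalization example: AIRFLOW_VERSION=2.2 becomes "2.2.*" (any patch),
    ## while AIRFLOW_VERSION=2.2.3 stays "2.2.3" (an exact pin) for poetry below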
pushd "${POETRY_SYSTEM_PROJECT_DIR}"; \
if [[ "${AIRFLOW_NORM_VERSION}" == "2.1.*" ]]; then \
poetry add \
"apache-airflow==${AIRFLOW_NORM_VERSION}" \
"sqlalchemy==${SQLALCHEMY_NORM_VERSION}" \
"boto3" \
"psycopg2" \
# airflow 2.1 does not use markupsafe>=2, nothing to fix
# https://github.com/apache/airflow/blob/v2-1-stable/setup.cfg#L122
; \
popd; \
else \
# Airflow >= 2.2
poetry add \
"apache-airflow==${AIRFLOW_NORM_VERSION}" \
"sqlalchemy==${SQLALCHEMY_NORM_VERSION}" \
"boto3" \
"psycopg2" \
# Fixes ImportError: cannot import name 'soft_unicode' from 'markupsafe'
# https://github.com/dbt-labs/dbt-core/issues/4745#issuecomment-1044354226
"markupsafe==2.0.1" \
; \
popd; \
fi; \
## Clean up dev files and only retain the runtime of Postgres lib
apt-get remove -y build-essential libpq-dev; \
rm -rf /var/lib/apt/lists/*; \
:
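# Sanity-check sketch (hypothetical, assumes a shell inside the built image and
# that the poetry project dir is readable):
#   cd "${POETRY_SYSTEM_PROJECT_DIR}" && poetry run airflow version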
ARG AIRFLOW_HOME=/airflow
ENV AIRFLOW_HOME="${AIRFLOW_HOME}"
# Create the Airflow home
WORKDIR ${AIRFLOW_HOME}
# Copy the entrypoint as root first but allow user to run
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x "/entrypoint.sh"
ENTRYPOINT ["/entrypoint.sh"]
# Less verbose logging
COPY log4j.properties "${SPARK_HOME}/conf/log4j.properties"
# Set up the Airflow DAGs path
ENV AIRFLOW_DAG="${AIRFLOW_HOME}/dags"
RUN mkdir -p "${AIRFLOW_DAG}"
COPY setup_auth.py test_db_conn.py ${AIRFLOW_HOME}/
# All other env vars that do not affect the build go here
ENV PYSPARK_SUBMIT_ARGS="--py-files ${SPARK_HOME}/python/lib/pyspark.zip pyspark-shell"
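# PYSPARK_SUBMIT_ARGS ships pyspark.zip to executors for every pyspark-shell
# session. A minimal run sketch (image tag and command are illustrative, and the
# exact behaviour depends on entrypoint.sh):
#   docker run --rm -it airflow-pipeline:latest bash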