Skip to content

Commit

Permalink
2.9.2 hadoop
Browse files Browse the repository at this point in the history
  • Loading branch information
vnijs committed Feb 17, 2024
1 parent 10e2b6d commit f1beceb
Show file tree
Hide file tree
Showing 9 changed files with 703 additions and 604 deletions.
2 changes: 1 addition & 1 deletion files/scalable_analytics/core-site.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
<value>hdfs://localhost:9100</value>
</property>
</configuration>
7 changes: 0 additions & 7 deletions files/scalable_analytics/hadoop.sh

This file was deleted.

7 changes: 5 additions & 2 deletions files/scalable_analytics/init-dfs.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/bin/bash

$HADOOP_HOME/bin/hdfs namenode -format
echo `${HADOOP_HOME}/bin/hdfs getconf -confKey dfs.datanode.data.dir` | cut -c8- | xargs rm -r
mkdir -p /tmp/hadoop-root/dfs/name
mkdir -p /tmp/hadoop-jovyan/dfs/data
sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties
$HADOOP_HOME/bin/hdfs namenode -format -force
echo `${HADOOP_HOME}/bin/hdfs getconf -confKey dfs.datanode.data.dir` | cut -c8- | xargs rm -r
2 changes: 1 addition & 1 deletion files/setup-hadoop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ curl -sL --retry 3 \
| tar -x --strip-components=1 -C $HADOOP_HOME \
&& rm -rf $HADOOP_HOME/share/doc \
&& chown -R ${NB_USER} $HADOOP_HOME \
&& mkdir "${HADOOP_HOME}/logs"
&& mkdir -p "${HADOOP_HOME}/logs"
302 changes: 3 additions & 299 deletions rsm-msba-arm/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# FROM quay.io/jupyter/pyspark-notebook@sha256:95ef0825db8d4ce411bafe93be230d8ff59d0c2a45038ca1fd4ac4c3c5474ddb
# FROM quay.io/jupyter/pyspark-notebook:aarch64-2023-12-18
FROM quay.io/jupyter/pyspark-notebook@sha256:9863dd81c0b2d047232c60f9b96aabb9a3c561161960eea48761618a891daa35
FROM vnijs/rsm-msba-arm:2.9.1

LABEL Vincent Nijs "[email protected]"

Expand All @@ -13,301 +11,8 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# fixes the issue where sudo requires terminal for password when starting postgres
RUN echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

RUN apt-get update -qq && apt-get -y --no-install-recommends install \
supervisor \
openssh-server \
libcurl4-openssl-dev \
zsh \
vim \
vifm \
wget \
rsync \
lsb-release \
git \
netcat \
htop \
openjdk-17-jdk-headless \
ant \
ca-certificates-java \
&& apt-get clean \
&& update-ca-certificates -f;

ENV CMDSTAN_VERSION="2.33.1"
ENV PANDAS_VERSION="2.1.4"
# ENV PANDAS_VERSION="2.0.3" # pyspark image still using 2.0.3
ENV PYARROW_VERSION="14.0.1"
RUN mamba install --quiet --yes -c conda-forge \
pandas=${PANDAS_VERSION} \
cmdstan=${CMDSTAN_VERSION} \
cmdstanpy \
sqlalchemy \
psycopg2 \
ipython-sql \
beautifulsoup4 \
scikit-learn \
mlxtend \
xgboost \
lightgbm \
graphviz \
lime \
shap \
spacy \
pydotplus \
networkx \
seaborn \
plotnine \
selenium \
sqlalchemy \
pyLDAvis \
python-dotenv \
statsmodels \
linearmodels \
jupyterlab_widgets \
jupytext \
black \
isort \
nltk \
jupyter-server-proxy \
jupyter-rsession-proxy \
streamlit \
xlrd \
openpyxl \
pyarrow=${PYARROW_VERSION} \
python-duckdb \
duckdb-engine \
bash_kernel \
sympy \
simpy \
awscli \
bokeh \
dask-kubernetes \
dask-ml \
findspark \
pyspark \
plotly \
&& python -m bash_kernel.install

# causing issues with 1/12/2023 update
# snowflake-connector-python

COPY files/setup-ml-frameworks.sh setup.sh
RUN chmod 755 setup.sh \
&& ./setup.sh \
&& rm setup.sh

# make system (conda) R the first choice
ENV R_VERSION=4.3.2
ENV TERM=xterm
ENV R_HOME=/opt/conda/lib/R
ENV LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}"
ENV PATH="/usr/local/bin:$PATH"

RUN mamba install --quiet --yes -c conda-forge \
c-compiler \
"r-base>=${R_VERSION}" \
r-curl \
binutils \
libgit2 \
freetype \
libpng \
libtiff \
libjpeg-turbo \
libxml2 \
unixodbc \
rpy2 \
jupyterlab-variableinspector \
jupyterlab_code_formatter \
openssh \
git \
&& ln -s /opt/conda/bin/R /usr/local/bin/R \
&& ln -s /opt/conda/bin/Rscript /usr/local/bin/Rscript

# not available through conda-forge for both arm and amd
# or the conda version is causing issues
RUN pip install \
jupyterlab-skip-traceback \
radian \
polars \
connectorx \
xlsx2csv \
jupysql \
shiny \
shinywidgets \
pyrsm

# catboost # not available for arm64

# connectorx is the default for SQL access in polars but is not built for aarch64,
# so the wheel had to be built manually with a separate Dockerfile
# see https://github.com/sfu-db/connector-x/issues/386
ENV wheel_name=connectorx-0.3.2-cp311-cp311-manylinux_2_34_aarch64.whl
COPY files/connectorx/${wheel_name} ${wheel_name}
RUN pip install ${wheel_name}

RUN echo "R_LIBS_USER='~/.rsm-msba/R/${R_VERSION}'" >> ${R_HOME}/etc/Renviron.site
RUN echo '.libPaths(unique(c(Sys.getenv("R_LIBS_USER"), .libPaths())))' >> ${R_HOME}/etc/Rprofile.site

COPY files/setup-tidyverse.sh setup.sh
RUN chmod +x setup.sh \
&& ./setup.sh \
&& rm setup.sh

# packages needed for radiant and reproducible analysis
COPY files/setup-radiant.sh setup.sh
RUN chmod +x setup.sh \
&& ./setup.sh \
&& rm setup.sh

# tooling for Bayesian Machine Learning class
# COPY files/setup-bml.sh setup.sh
# RUN chmod +x setup.sh \
# && ./setup.sh \
# && rm setup.sh

# adding postgres
# mostly from https://docs.docker.com/engine/examples/postgresql_service/
ENV POSTGRES_VERSION=14

# upgrade to postgres 14
RUN apt -y update && \
apt -y upgrade && \
apt -y install gpgv dirmngr wget vim && \
sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
apt -y update && \
apt-get install -y \
postgresql-${POSTGRES_VERSION} \
postgresql-client-${POSTGRES_VERSION} \
postgresql-contrib-${POSTGRES_VERSION}

# Set up group membership and directory ownership before switching to the postgres user below
RUN addgroup ${NB_USER} postgres \
&& addgroup postgres users \
&& chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/ \
&& chown -R postgres:postgres /var/lib/postgresql/${POSTGRES_VERSION}/ \
&& chmod -R u=rwX,go= /var/lib/postgresql/${POSTGRES_VERSION}/

USER postgres

ARG PGPASSWORD=${PGPASSWORD:-postgres}
ENV PGPASSWORD=${PGPASSWORD}

# create a postgres role for ${NB_USER} with ${PGPASSWORD} (default: "postgres") as the password
# and create a database "rsm-docker" owned by the ${NB_USER} role.
RUN /etc/init.d/postgresql start \
&& psql --command "CREATE USER ${NB_USER} WITH SUPERUSER PASSWORD '${PGPASSWORD}';" \
&& createdb -O ${NB_USER} rsm-docker

COPY files/postgresql.conf /etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf
COPY files/pg_hba.conf /etc/postgresql/${POSTGRES_VERSION}/main/pg_hba.conf

USER root

# populate version number in conf file
RUN sed -i 's/__version__/'"$POSTGRES_VERSION"'/g' /etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf

RUN addgroup ${NB_USER} postgres \
&& chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/main/ \
&& fix-permissions /etc/postgresql/${POSTGRES_VERSION}/main/

# from https://github.com/ucsd-ets/rsm-msba-datahub/blob/master/Dockerfile
# RUN chmod -R 777 /etc/postgresql/${POSTGRES_VERSION}
# RUN chmod -R 777 /var/lib/postgresql/

# oh-my-zsh (need to install wget and curl again ...)
RUN apt-get update -qq && apt-get -y --no-install-recommends install wget curl \
&& sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" \
&& git clone https://github.com/zsh-users/zsh-completions ${ZSH_CUSTOM:=~/.oh-my-zsh/custom}/plugins/zsh-completions \
&& git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
&& git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting \
&& git clone https://github.com/supercrabtree/k ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/k \
&& git clone --depth=1 https://github.com/romkatv/powerlevel10k.git ${ZSH_CUSTOM:-$HOME/.oh-my-zsh/custom}/themes/powerlevel10k \
&& cp -R /home/jovyan/.oh-my-zsh /etc/skel/.oh-my-zsh

COPY files/zshrc /etc/skel/.zshrc
COPY files/p10k.zsh /etc/skel/.p10k.zsh
COPY files/usethis /usr/local/bin/usethis
COPY files/clean.sh /usr/local/bin/clean

# settings for local install of python packages
ARG PYBASE=/home/${NB_USER}/.rsm-msba
ENV PYBASE=${PYBASE}
ENV PYTHONUSERBASE=${PYBASE} \
JUPYTER_PATH=${PYBASE}/share/jupyter \
JUPYTER_DATA_DIR=${PYBASE}/share/jupyter \
JUPYTER_CONFIG_DIR=${PYBASE}/jupyter \
JUPYTER_RUNTIME_DIR=/tmp/jupyter/runtime \
RSTUDIO_WHICH_R=/usr/local/bin/R \
SHELL=/bin/zsh \
ZDOTDIR=/home/${NB_USER}/.rsm-msba/zsh \
CMDSTAN="/opt/cmdstan/cmdstan-${CMDSTAN_VERSION}"

COPY files/install-rstudio.sh setup.sh
RUN chmod 755 setup.sh \
&& ./setup.sh \
&& rm setup.sh

# setup quarto - can be used with Rstudio
# and when connecting to running container
# from VSCode
COPY files/setup-quarto.sh setup.sh
RUN chmod +x setup.sh \
&& ./setup.sh \
&& rm setup.sh

# updating the supervisord.conf file for Jupyter and the notebook_config file
COPY files/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY files/condarc /opt/conda/.condarc
RUN mkdir -p /var/log/supervisor \
&& fix-permissions /var/log/supervisor \
&& fix-permissions /etc/supervisor/conf.d/ \
&& fix-permissions "${CONDA_DIR}"

# copy base conda environment management script
COPY files/ccenv.sh /usr/local/bin/ccenv
COPY files/cl.sh /usr/local/bin/cl
COPY files/cr.sh /usr/local/bin/cr
COPY files/ci.sh /usr/local/bin/ci
COPY files/ce.sh /usr/local/bin/ce

# Copy the launch script into the image
COPY launch-${DOCKERHUB_NAME}.sh /opt/launch.sh
COPY files/setup.sh /usr/local/bin/setup
RUN fix-permissions /etc/skel \
&& fix-permissions /usr/local/bin \
&& chmod 755 /usr/local/bin/*

# get pgweb
RUN wget -O pgweb.zip https://github.com/sosedoff/pgweb/releases/download/v0.11.11/pgweb_linux_arm64_v7.zip \
&& unzip pgweb.zip -d pgweb_dir \
&& rm pgweb.zip \
&& mv pgweb_dir/* /usr/local/bin/pgweb \
&& rm -rf pgweb_dir

# setting up jupyter-server-proxy extensions pgweb, gitgadget, and radiant
RUN pip install git+https://github.com/vnijs/jupyter-pgweb-proxy.git \
&& pip install git+https://github.com/vnijs/jupyter-gitgadget-proxy.git \
&& pip install git+https://github.com/vnijs/jupyter-radiant-proxy.git

# packages need for radiant a reproducible analysis
COPY files/setup-extra.sh setup.sh
RUN chmod +x setup.sh \
&& ./setup.sh \
&& rm setup.sh

RUN mamba update --yes pandoc \
&& mamba clean --all -f -y \
&& fix-permissions "${CONDA_DIR}" \
&& fix-permissions "/home/${NB_USER}"

# packages needed for arrow
COPY files/setup-arrow.sh setup.sh
RUN chmod +x setup.sh \
&& ./setup.sh \
&& rm setup.sh
RUN apt-get update -qq && apt-get -y --no-install-recommends install lsof
RUN pip install pyrsm --upgrade

# setup hadoop
ENV JAVA_HOME "/usr/lib/jvm/java-17-openjdk-arm64/"
Expand All @@ -325,7 +30,6 @@ ADD files/scalable_analytics/hdfs-site.xml $HADOOP_HOME/etc/hadoop/
ADD files/scalable_analytics/init-dfs.sh /opt/hadoop/
ADD files/scalable_analytics/start-dfs.sh /opt/hadoop/
ADD files/scalable_analytics/stop-dfs.sh /opt/hadoop/
ADD files/scalable_analytics/hadoop.sh /usr/bin/hadoop
RUN chown -R ${NB_USER} ${HADOOP_HOME} \
&& chmod 755 ${HADOOP_HOME}/*.sh \
&& chmod 755 /usr/bin/hadoop
Expand Down
Loading

0 comments on commit f1beceb

Please sign in to comment.