diff --git a/examples/nfs_venv.sh b/examples/nfs_venv.sh
new file mode 100644
index 0000000..2dc1b67
--- /dev/null
+++ b/examples/nfs_venv.sh
@@ -0,0 +1,44 @@
+#!/bin/bash -x
+
+#
+# Install a python virtualenv on an NFS mount. If the mount fails,
+# fall back to a local install.
+#
+# Installing python libraries on an NFS mount has the following advantages over
+# installing them locally on each node:
+# 1. It allows for faster cluster startup and upscaling, since the libraries only
+#    need to be installed once. This is especially pertinent for libraries with
+#    compiled components, like numpy, scipy, etc.
+# 2. One can install new libraries or upgrade existing ones at runtime, and the
+#    changes are immediately available on all of the cluster's nodes.
+#
+
+source /usr/lib/hustler/bin/qubole-bash-lib.sh
+source /usr/lib/bootstrap-functions/misc/mount_nfs.sh
+source /usr/lib/bootstrap-functions/misc/python_venv.sh
+
+mount_nfs_volume "fs-7abdefa3.efs.us-east-1.amazonaws.com:/" /mnt/efs
+
+if [[ $? == 0 ]]; then
+  is_master=$(nodeinfo is_master)
+  cluster_id=$(nodeinfo cluster_id)
+  # Use the cluster id so we can install different virtualenvs for
+  # different clusters
+  install_location="/mnt/efs/${cluster_id}/py36"
+
+  # Symlink to the same path as a local install so we can
+  # use the virtualenv in Zeppelin
+  symlink=/usr/lib/virtualenv/py36
+
+  if [[ "$is_master" != "1" ]]; then
+    ln -s "$install_location" "$symlink"
+    hadoop_use_venv "$install_location"
+    # Install only from the master; worker nodes just need the
+    # configuration change to use the new virtualenv
+    exit 0
+  fi
+  install_python_venv "36" "$install_location"
+  ln -s "$install_location" "$symlink"
+else
+  install_python_venv
+fi
diff --git a/misc/mount_nfs.sh b/misc/mount_nfs.sh
new file mode 100644
index 0000000..4ad0168
--- /dev/null
+++ b/misc/mount_nfs.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+source /usr/lib/hustler/bin/qubole-bash-lib.sh
+
+#
+# Instructions for an AWS EFS mount:
+# 1. After creating the EFS file system, create a security group.
+# 2. Create an inbound traffic rule for this security group that allows traffic on
+#    port 2049 (NFS) from this security group, as described here:
+#    https://docs.aws.amazon.com/efs/latest/ug/accessing-fs-create-security-groups.html
+# 3. Add this security group as a persistent security group for the cluster from which
+#    you want to mount the EFS store, as described here:
+#    http://docs.qubole.com/en/latest/admin-guide/how-to-topics/persistent-security-group.html
+#
+# TODO: add instructions for Azure file share
+#
+
+function mount_nfs_volume() {
+  nfs_export=$1
+  mountpoint=$2
+
+  mkdir -p "$mountpoint"
+
+  is_master=$(nodeinfo is_master)
+  # The master mounts read-write because the virtualenv is installed from
+  # there; workers only need read access to the shared libraries
+  if [[ $is_master == "1" ]]; then
+    mount -v -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 "$nfs_export" "$mountpoint"
+  else
+    mount -v -t nfs4 -o nfsvers=4.1,ro,rsize=1048576,hard,timeo=600,retrans=2 "$nfs_export" "$mountpoint"
+  fi
+}
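A minimal sketch of advantage 2 from examples/nfs_venv.sh above: once the virtualenv lives on the shared mount, a library installed at runtime from the master becomes visible to every node through the workers' read-only mounts. The path pattern comes from nfs_venv.sh; the cluster id (1234) and the library are hypothetical stand-ins.

    # Run on the master node at any time after bootstrap; the cluster id
    # shown (1234) is hypothetical -- substitute your cluster's own.
    source /mnt/efs/1234/py36/bin/activate
    pip install pandas
    # Workers share the same NFS-backed site-packages, so the new library
    # is usable cluster-wide without reinstalling or restarting nodes.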
diff --git a/misc/python_venv.sh b/misc/python_venv.sh
new file mode 100644
index 0000000..f584bca
--- /dev/null
+++ b/misc/python_venv.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+
+#
+# This function activates the new virtualenv, so you can install
+# any libraries you want with "pip install" after calling it.
+#
+# Alternatively, you can use a requirements file. For example,
+# to use a requirements file stored in S3 or Azure Blob Store, run
+#
+#   /usr/lib/hadoop2/bin/hadoop fs -get {s3|wasb}://path/to/requirements/file /tmp/requirements.txt
+#   pip install -r /tmp/requirements.txt
+#
+
+function install_python_venv() {
+  version=${1:-36}
+  location=${2:-/usr/lib/virtualenv/py36}
+
+  yum install -y "python${version}"
+  mkdir -p "$location"
+
+  virtualenv -p "/usr/bin/python${version}" "$location"
+  hadoop_use_venv "$location"
+
+  source "${location}/bin/activate"
+}
+
+function hadoop_use_venv() {
+  location="$1"
+  # Activate the virtualenv from hadoop-env.sh so hadoop processes use it
+  echo "VIRTUAL_ENV_DISABLE_PROMPT=1 source ${location}/bin/activate ${location}" >> /usr/lib/hadoop2/etc/hadoop/hadoop-env.sh
+}
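As a usage sketch for the requirements-file flow described in the comment above: the S3 bucket path below is hypothetical, and install_python_venv leaves the new virtualenv activated, so the subsequent pip install lands inside it.

    # Hypothetical bootstrap snippet: create the default virtualenv, then
    # install pinned libraries from a requirements file kept in S3.
    source /usr/lib/bootstrap-functions/misc/python_venv.sh
    install_python_venv "36" "/usr/lib/virtualenv/py36"
    /usr/lib/hadoop2/bin/hadoop fs -get s3://my-bucket/requirements.txt /tmp/requirements.txt
    pip install -r /tmp/requirements.txt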