diff --git a/README.template b/README.template new file mode 100644 index 0000000..39dfe9a --- /dev/null +++ b/README.template @@ -0,0 +1,30 @@ +# bootstrap-functions +This repository holds common functions that can be used in Qubole node bootstraps + +## How to use + +Source the required script in your bootstrap script. For example, to mount an EFS volume with the bootstrap, you may do the following: + +``` +source /usr/lib/qubole/bootstrap-functions/misc/mount_nfs.sh + +mount_nfs_volume fs-7abd2444.efs.us-east-1.amazonaws.com:/ /mnt/efs +``` + +## Available functions +The following set of functions are available at present: + +## Contributing +Please raise a pull request for any modifications or additions you would like to make. There may be a delay between when you want to start using a method and when it might be available via Qubole's AMI. To work around this, it is recommended to put a placeholder `source` line in your bootstrap script. For example: + +``` +function mysparkfunction() { + # ... do some stuff +} + +source /usr/lib/qubole/bootstrap-functions/spark/mysparkfunction.sh + +mysparkfunction arg1 arg2 ... +``` + +This way, when the function makes it to the AMI, you will automatically use the copy in the bootstrap-functions library. diff --git a/common/utils.sh b/common/utils.sh index 8a6fe00..9755d5e 100755 --- a/common/utils.sh +++ b/common/utils.sh @@ -1,41 +1,92 @@ #!/usr/bin/env bash +# +# @file common/utils.sh +# @brief Provides common utility functions -#-------------------------------------------------------------------------------- -# Utility methods -#-------------------------------------------------------------------------------- - +# @description Function to populate nodeinfo +# # Please call this method at start of node bootstrap +# +# @example +# populate_nodeinfo +# +# @noargs populate_nodeinfo() { source /usr/lib/hustler/bin/qubole-bash-lib.sh } -# Returns 0 when run on a Hadoop2 cluster node. 
-# Returns 1 otherwise +# @description Function to check if the node belongs to a Hadoop2 cluster +# +# @example +# if is_hadoop2_cluster; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 If the cluster runs hadoop2 +# @exitcode 1 Otherwise is_hadoop2_cluster() { [[ `nodeinfo use_hadoop2` = "1" ]] } -# Returns 0 when HiveServer2 is configured to run on the cluster master. -# Returns 1 otherwise +# @description Function to check if a HiveServer2 is configured to run on a master node +# +# @example +# if is_hs2_enabled; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 When HiveServer2 is configured on a master node +# @exitcode 1 Otherwise is_hs2_enabled() { is_hadoop2_cluster && [[ `nodeinfo hive_use_hs2` = "1" ]] } -# Returns 0 when run on a HiveServer2 cluster node. -# Returns 1 otherwise +# @description Function to check if a node belongs to a HiveServer2 cluster +# +# @example +# if is_hs2_cluster; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 When node belongs to a HiveServer2 cluster +# @exitcode 1 Otherwise is_hs2_cluster() { is_hadoop2_cluster && [[ `nodeinfo is_hs2_cluster` = "1" ]] } -# Returns 0 when run on a cluster master node. -# Returns 1 otherwise +# @description Function to check if a node is a cluster master node +# +# @example +# if is_master_node; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 When node is a cluster master node +# @exitcode 1 Otherwise is_master_node() { [[ `nodeinfo is_master` = "1" ]] } -# Returns 0 when run on a cluster worker node. -# Returns 1 otherwise +# @description Function to check if a node is a cluster worker node +# +# @example +# if is_worker_node; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 When node is a cluster worker node +# @exitcode 1 Otherwise is_worker_node() { ! 
is_master_node } diff --git a/generate_docs.sh b/generate_docs.sh new file mode 100755 index 0000000..0f1795f --- /dev/null +++ b/generate_docs.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# +# Regenerates docs/<directory>.md with shdoc and rebuilds README.md from README.template + +# Work from the script's own directory; abort rather than touch files in the caller's directory +cd "$(dirname "$0")" || exit 1 + +# Cleanup older documentation +mkdir -p docs +rm -f docs/*.md + +# Collect the top-level directories to document, skipping docs, tests and examples +directories=() +for dx in */; do + [[ -d "${dx}" ]] || continue + case "${dx}" in + docs/|tests/|examples/) continue ;; + esac + directories+=("${dx%/}") +done + +# Generate new documentation +for d in "${directories[@]}"; do + find "${d}/" -type f -name "*.sh" -exec shdoc {} \; > "docs/${d}.md" +done + +# Overwrite README.md +cp -f README.md README.bak +cp -f README.template README.md +for d in "${directories[@]}"; do + sed -i "/The following set of functions are available at present:/a * [${d}](docs/${d}.md)" README.md +done diff --git a/hadoop/util.sh b/hadoop/util.sh index 105404b..4532b4f 100644 --- a/hadoop/util.sh +++ b/hadoop/util.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# @file hadoop/util.sh +# @brief Provides Hadoop2 utility functions source /usr/lib/hustler/bin/qubole-bash-lib.sh export PROFILE_FILE=${PROFILE_FILE:-/etc/profile} @@ -64,12 +67,15 @@ function _restart_worker_services_ctl() { # after the bootstrap is finished } -## -# Restart hadoop services on the cluster master +# @description Function to restart hadoop services on the cluster master # # This may be used if you're using a different version # of Java, for example # +# @example +# restart_master_services +# +# @noargs function restart_master_services() { if [[ ${al2} == "true" || ${dont_use_monit} == "true" ]]; then _restart_master_services_ctl @@ -79,12 +85,15 @@ function restart_master_services() { } -## -# Restart hadoop services on cluster workers +# @description Function to restart hadoop services on the cluster workers # # This only restarts the datanode service since the # nodemanager is started after the bootstrap is run # +# @example +# restart_worker_services +# +# @noargs function restart_worker_services() { if [[ ${al2} == "true" || ${dont_use_monit} == "true" ]]; then 
_restart_worker_services_ctl @@ -93,9 +102,12 @@ function restart_worker_services() { fi } -## -# Generic fucntion to restart hadoop services +# @description Generic function to restart hadoop services +# +# @example +# restart_hadoop_services # +# @noargs function restart_hadoop_services() { local is_master=$(nodeinfo is_master) if [[ ${is_master} == "1" ]]; then @@ -105,15 +117,18 @@ function restart_hadoop_services() { fi } -## -# Use Java 8 for hadoop daemons and jobs +# @description Use Java 8 for hadoop daemons and jobs # # By default, the hadoop daemons and jobs on Qubole # clusters run on Java 7. Use this function if you would like # to use Java 8. This is only required if your cluster: -# is in AWS, and -# is running Hive or Spark < 2.2 +# 1. is in AWS, and +# 2. is running Hive or Spark < 2.2 # +# @example +# use_java8 +# +# @noargs function use_java8() { export JAVA_HOME=/usr/lib/jvm/java-1.8.0 export PATH=$JAVA_HOME/bin:$PATH @@ -130,11 +145,13 @@ function use_java8() { fi } -## -# Wait until namenode is out of safe mode. -# Takes 2 optional params -# first : Number of attempts function will make to get namenode out of safemode. Default is 50 -# second : Number of seconds each attempt will sleep for waiting for namenode to come out of sleep mode. Default is 5sec +# @description Wait until namenode is out of safe mode +# +# @example +# wait_until_namenode_running 25 5 +# +# @arg $1 int Number of attempts function will make to get namenode out of safe mode. Defaults to 50 +# @arg $2 int Number of seconds each attempt will sleep for, waiting for namenode to come out of safe mode. 
Defaults to 5 function wait_until_namenode_running() { n=0 attempts=${1:-50} diff --git a/hive/glue-sync.sh b/hive/glue-sync.sh index 3d125ec..349cff8 100755 --- a/hive/glue-sync.sh +++ b/hive/glue-sync.sh @@ -1,13 +1,20 @@ #!/bin/bash +# +# @file hive/glue-sync.sh +# @brief Provides function to install Hive Glue Catalog Sync Agent source /usr/lib/hustler/bin/qubole-bash-lib.sh source /usr/lib/qubole/bootstrap-functions/hive/hiveserver2.sh -## -# Installs Hive Glue Catalog Sync Agent -# param1 - Region for AWS Athena. Defaults to us-east-1 +# @description Installs Hive Glue Catalog Sync Agent +# # Requires Hive 2.x +# Currently supported only on AWS +# +# @example +# install_glue_sync us-east-1 # +# @arg $1 string Region for AWS Athena. Defaults to `us-east-1` function install_glue_sync() { aws_region=${1:-us-east-1} diff --git a/hive/hiveserver2.sh b/hive/hiveserver2.sh index c50daa6..d3f747c 100755 --- a/hive/hiveserver2.sh +++ b/hive/hiveserver2.sh @@ -1,32 +1,61 @@ #!/usr/bin/env bash +# +# @file hive/hiveserver2.sh +# @brief Provides functions to start/stop/restart HiveServer2 source /usr/lib/qubole/bootstrap-functions/common/utils.sh -#-------------------------------------------------------------------------------- -# Methods to stop/start/restart HiveServer2 -#-------------------------------------------------------------------------------- - +# @description Function to check if HiveServer2 is configured +# +# @example +# if is_hs2_configured; then +# # do something here +# fi +# +# @noargs +# +# @exitcode 0 If HiveServer2 is configured +# @exitcode 1 Otherwise function is_hs2_configured() { (is_master_node && is_hs2_enabled) || (is_worker_node && is_hs2_cluster) } -# Stop HiveServer2 JVM - works on both Hadoop2 and HiveServer2 cluster +# @description Function to stop HiveServer2 JVM +# +# Works on both Hadoop2 and HiveServer2 clusters +# +# @example +# stop_hs2 +# +# @noargs function stop_hs2() { - if [[ is_hs2_configured ]]; then + if is_hs2_configured; then monit stop hs2 fi } -# 
Start HiveServer2 JVM - works on both Hadoop2 and HiveServer2 cluster +# @description Function to start HiveServer2 JVM +# +# Works on both Hadoop2 and HiveServer2 clusters +# +# @example +# start_hs2 +# +# @noargs function start_hs2() { - if [[ is_hs2_configured ]]; then + if is_hs2_configured; then monit start hs2 fi } -## -# Restart HiveServer2 JVM - works on both Hadoop2 and HiveServer2 cluster +# @description Function to restart HiveServer2 JVM +# +# Works on both Hadoop2 and HiveServer2 clusters +# +# @example +# restart_hs2 # +# @noargs function restart_hs2() { stop_hs2 sleep 5 diff --git a/hive/ranger-client.sh b/hive/ranger-client.sh index 24389c9..4f89015 100755 --- a/hive/ranger-client.sh +++ b/hive/ranger-client.sh @@ -1,15 +1,22 @@ #!/bin/bash +# +# @file hive/ranger-client.sh +# @brief Provides function to install Apache Ranger client for Hive source /usr/lib/qubole/bootstrap-functions/common/utils.sh source /usr/lib/qubole/bootstrap-functions/hive/hiveserver2.sh -## -# Install Apache Ranger client for Hive -# Parameters: -# -h: Ranger admin host. Defaults to `localhost` -# -p: Ranger admin port. Defaults to `6080` -# -r: Ranger repository name. Defaults to `hivedev` +# @description Install Apache Ranger client for Hive +# +# Currently supported only on AWS +# Requires HiveServer2 +# +# @example +# install_ranger -h example.host -p 6080 -r examplerepo # +# @arg -h string Hostname of Ranger admin. Defaults to `localhost` +# @arg -p int Port where Ranger admin is running. Defaults to `6080` +# @arg -r string Name of Ranger repository. 
Defaults to `hivedev` function install_ranger() { populate_nodeinfo if [[ is_hs2_configured ]]; then diff --git a/hive/thrift-metastore.sh b/hive/thrift-metastore.sh index feee3f1..04fef9c 100755 --- a/hive/thrift-metastore.sh +++ b/hive/thrift-metastore.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# @file hive/thrift-metastore.sh +# @brief Provides functions to start/stop/restart thrift metastore server source /usr/lib/qubole/bootstrap-functions/common/utils.sh @@ -8,9 +11,12 @@ else metastore_service=metastore fi -## -# Start thrift metastore server +# @description Function to start thrift metastore server +# +# @example +# start_thrift_metastore # +# @noargs function start_thrift_metastore() { if is_master_node; then monit start ${metastore_service} @@ -19,9 +25,12 @@ function start_thrift_metastore() { fi } -## -# Stop thrift metastore server +# @description Function to stop thrift metastore server # +# @example +# stop_thrift_metastore +# +# @noargs function stop_thrift_metastore() { if is_master_node; then monit stop ${metastore_service} @@ -30,9 +39,12 @@ function stop_thrift_metastore() { fi } -## -# Restart thrift metastore server +# @description Function to restart thrift metastore server +# +# @example +# restart_thrift_metastore # +# @noargs function restart_thrift_metastore() { stop_thrift_metastore && sleep 2 && start_thrift_metastore } diff --git a/misc/awscli.sh b/misc/awscli.sh index b6e05db..7db8cae 100755 --- a/misc/awscli.sh +++ b/misc/awscli.sh @@ -1,13 +1,22 @@ #!/bin/bash +# +# @file misc/awscli.sh +# @brief Provides function to configure AWS CLI -## -# Configure AWS CLI -# -p: Name of the profile. Defaults to `default` -# -r: AWS region. 
Defaults to `us-east-1` -# -c: Credentials file -# The credentials file must contain the AWS Access Key and -# the AWS Secret Key separated by a space, comma, tab or newline +# @description Configure AWS CLI +# +# A credentials file containing the AWS Access Key and the AWS Secret Key +# separated by a space, comma, tab or newline must be provided +# +# @example +# configure_awscli -p exampleprofile -r us-east-1 -c /path/to/credentials/file +# +# @arg -p string Name of the profile. Defaults to `default` +# @arg -r string AWS region. Defaults to `us-east-1` +# @arg -c string Path to credentials file # +# @exitcode 0 AWS CLI is configured +# @exitcode 1 AWS CLI or credentials file not found function configure_awscli() { PROFILE=default REGION=us-east-1 diff --git a/misc/mount_nfs.sh b/misc/mount_nfs.sh index 63640c3..dfc7f94 100644 --- a/misc/mount_nfs.sh +++ b/misc/mount_nfs.sh @@ -1,11 +1,11 @@ #!/bin/bash +# +# @file misc/mount_nfs.sh +# @brief Provides function to mount a NFS volume source /usr/lib/hustler/bin/qubole-bash-lib.sh -## -# Mounts an NFS volume on master and worker nodes -# param1 - path to NFS share -# param2 - mountpoint to use +# @description Mounts an NFS volume on master and worker nodes # # Instructions for AWS EFS mount: # 1. 
After creating the EFS file system, create a security group @@ -17,7 +17,12 @@ source /usr/lib/hustler/bin/qubole-bash-lib.sh # http://docs.qubole.com/en/latest/admin-guide/how-to-topics/persistent-security-group.html # # TODO: add instructions for Azure file share -# +# +# @example +# mount_nfs_volume "example.nfs.share:/" /mnt/efs +# +# @arg $1 string Path to NFS share +# @arg $2 string Mount point to use function mount_nfs_volume() { nfs_export=$1 mountpoint=$2 diff --git a/misc/python_venv.sh b/misc/python_venv.sh index cc5b3ba..56b30e3 100644 --- a/misc/python_venv.sh +++ b/misc/python_venv.sh @@ -1,9 +1,9 @@ #!/bin/bash -x +# +# @file misc/python_venv.sh +# @brief Provides function to install Python virtualenv -## -# Install and activate a Python virtualenv -# param1 - version of python to use, default 3.6 -# param2 - location to create virtualenv in, default /usr/lib/virtualenv/py36 +# @description Install and activate a Python virtualenv # # This function activates the new virtualenv, so install # any libraries you want after calling this with "pip install" @@ -14,6 +14,11 @@ # /usr/lib/hadoop2/bin/hadoop dfs -get {s3|wasb}://path/to/requirements/file /tmp/requirements.txt # pip install -r /tmp/requirements.txt # +# @example +# install_python_venv 3.6 /path/to/virtualenv/py36 +# +# @arg $1 float Version of Python to use. Defaults to 3.6 +# @arg $2 string Location to create virtualenv in. Defaults to /usr/lib/virtualenv/py36 function install_python_venv() { version=${$1:-36} location=${$2:-/usr/lib/virtualenv/py36} diff --git a/misc/util.sh b/misc/util.sh index 17c5652..7b8afbf 100755 --- a/misc/util.sh +++ b/misc/util.sh @@ -1,16 +1,20 @@ #!/bin/bash +# +# @file misc/util.sh +# @brief Provides miscellaneous utility functions -## -# Miscellaneous utility functions - -## -# Set the timezone -# param1 - Timezone to set. Mandatory parameter -# Eg: "US/Mountain", "America/Los_Angeles" etc. 
+# @description Set the timezone # # This function sets the timezone on the cluster node. +# The timezone to set is a mandatory parameter and must be present in /usr/share/zoneinfo +# Eg: "US/Mountain", "America/Los_Angeles" etc. +# # After setting the timezone, it is advised to restart engine daemons on the master and worker nodes # +# @example +# set_timezone "America/Los_Angeles" +# +# @arg $1 string Timezone to set function set_timezone() { timezone=$1 @@ -28,13 +32,13 @@ function set_timezone() { fi } -## -# Add public key to authorized_keys -# param1 - Public key to add to authorized_keys file. This parameter is mandatory. -# Eg: "ssh-rsa xyzxyzxyzxyz...xyzxyz user@example.com" -# param2 - User for which the public key is added. Defaults to "ec2-user". -# Eg: "ec2-user", "root" etc. +# @description Add a public key to authorized_keys +# +# @example +# add_to_authorized_keys "ssh-rsa xyzxyzxyzxyz...xyzxyz user@example.com" ec2-user # +# @arg $1 string Public key to add to authorized_keys file +# @arg $2 string User for which the public key is added. 
Defaults to `ec2-user` function add_to_authorized_keys() { public_key=$1 username=${2:-ec2-user} diff --git a/spark/util.sh b/spark/util.sh index 0d1ffb7..5921c4a 100755 --- a/spark/util.sh +++ b/spark/util.sh @@ -1,10 +1,19 @@ #!/bin/bash +# +# @file spark/util.sh +# @brief Provides functions to start/stop/restart Spark History Server source /usr/lib/hustler/bin/qubole-bash-lib.sh -## -# Start Spark History Server +# @description Function to start Spark History Server +# +# @example +# start_history_server +# +# @noargs # +# @exitcode 0 When Spark History Server is started +# @exitcode 1 Otherwise function start_history_server() { is_master=$(nodeinfo is_master) if [[ "$is_master" == "1" ]]; then @@ -14,9 +23,15 @@ function start_history_server() { fi } -## -# Stop Spark History Server +# @description Function to stop Spark History Server # +# @example +# stop_history_server +# +# @noargs +# +# @exitcode 0 When Spark History Server is stopped +# @exitcode 1 Otherwise function stop_history_server() { is_master=$(nodeinfo is_master) if [[ "$is_master" == "1" ]]; then @@ -26,9 +41,12 @@ function stop_history_server() { fi } -## -# Restart Spark History Server +# @description Function to restart Spark History Server +# +# @example +# restart_history_server # +# @noargs function restart_history_server() { stop_history_server && sleep 2 && start_history_server }