diff --git a/Makefile.am b/Makefile.am index a8a5fbc..0358d38 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,7 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \ scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \ scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \ scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \ - scripts/lbnl_ps.nhc + scripts/lbnl_ps.nhc scripts/csc_nvidia_smi.nhc MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing DISTCLEANFILES = diff --git a/README.md b/README.md index ad9c570..8780895 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ The NHC Yum repository is currently unavailable, but we hope to provide one in t The [source tarball for the latest release](https://github.com/mej/nhc/releases/download/1.4.2/lbnl-nhc-1.4.2.tar.xz) is also available via the [NHC Project on GitHub](https://github.com/mej/nhc/). If you prefer to install from source, or aren't using one of the distributions shown above, use the commands shown here: ``` +# yum install automake +# ./autogen.sh # ./configure --prefix=/usr --sysconfdir=/etc --libexecdir=/usr/libexec # make test # make install @@ -670,6 +672,17 @@ _**Example** (make sure `/tmp` has at least 1000 inodes)_: `check_fs_inodes /tm
+##### check_all_fs_inodes
+`check_all_fs_inodes fstype [min] [max]`
+
+Ensures that all filesystems of type _fstype_ have at least _min_ but no more than _max_ total inodes. Either may be blank. Calls check_fs_inodes for each filesystem.
+
+_**Example** (make sure xfs filesystems have at least 1000 inodes)_: `check_all_fs_inodes xfs 1k`
+
+
+
+ + ##### check_fs_ifree `check_fs_ifree mountpoint min` @@ -683,6 +696,17 @@ _**Example** (make sure `/local` has at least 100 inodes free)_: `check_fs_ifre
+##### check_all_fs_ifree +`check_all_fs_ifree fstype min` + +Ensures that all filesystems of type _fstype_ have at least _min_ free inodes. Calls check_fs_ifree for each filesystem. + +_**Example** (make sure xfs filesystems have at least 100 inodes free)_: `check_all_fs_ifree xfs 100` + + +
+ + ##### check_fs_iused `check_fs_iused mountpoint max` @@ -696,6 +720,17 @@ _**Example** (make sure `/tmp` has no more than 1 million used inodes)_: `check
+##### check_all_fs_iused +`check_all_fs_iused fstype max` + +Ensures that all filesystems of type _fstype_ have no more than _max_ used inodes. Calls check_fs_iused for each filesystem. + +_**Example** (make sure xfs filesystems have no more than 1 million used inodes)_: `check_all_fs_iused xfs 1M` + + +
+ + ##### check_fs_mount `check_fs_mount [-0] [-r] [-t fstype] [-s source] [-o options] [-O remount_options] [-e missing_action] [-E found_action] {-f|-F} mountpoint [...]` @@ -778,6 +813,17 @@ _**Example**_: `check_fs_used / 98%`
+##### check_all_fs_used +`check_all_fs_used fstype maxused` + +Checks that all filesystems of type _fstype_ have less than _maxused_ space consumed. Calls check_fs_used for each filesystem. + +_**Example**_: `check_all_fs_used xfs 98%` + + +
+
+
 ##### check_hw_cpuinfo
 `check_hw_cpuinfo [sockets] [cores] [threads]`
 
diff --git a/scripts/csc_nvidia_smi.nhc b/scripts/csc_nvidia_smi.nhc
new file mode 100644
index 0000000..df1f79c
--- /dev/null
+++ b/scripts/csc_nvidia_smi.nhc
@@ -0,0 +1,75 @@
+# NHC - nVidia GPU Checks
+#
+# Johan Guldmyr
+# 17 Dec 2015
+#
+
+NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
+NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"
+
+NVSMI_HEALTHMON_LINES=( )
+NVSMI_HEALTHMON_OUTPUT=""
+NVSMI_HEALTHMON_RC=""
+
+export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC
+
+# Run $NVIDIA_SMI_HEALTHMON once and cache its stdout (as one string and split
+# into lines) plus its exit status in the NVSMI_HEALTHMON_* globals.
+function nhc_nvsmi_gather_data() {
+    local IFS
+
+    # Command and arguments are left unquoted on purpose so they word-split.
+    NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
+    NVSMI_HEALTHMON_RC=$?
+    IFS=$'\n'
+    NVSMI_HEALTHMON_LINES=( $NVSMI_HEALTHMON_OUTPUT )
+}
+
+# Run the nvidia-smi utility and verify that all GPUs
+# are functioning properly. Returns 0 when nvidia-smi exits 0; otherwise
+# calls die with a message for the exit status and returns 1.
+function check_nvsmi_healthmon() {
+    if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
+        nhc_nvsmi_gather_data
+    fi
+
+    if [[ $NVSMI_HEALTHMON_RC -eq 0 ]]; then
+        dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
+        return 0
+    elif [[ $NVSMI_HEALTHMON_RC -eq 4 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Permission denied"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 8 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Power cables not attached"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 2 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Invalid argument or flag"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 9 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVIDIA driver not loaded"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 10 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Interrupt issue with a GPU"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 12 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVML shared library could not be found"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 14 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: InfoROM is corrupted"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 15 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: The GPU has fallen off the bus or has otherwise become inaccessible"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -eq 255 ]]; then
+        # 255 is nvidia-smi's generic failure status, not a signal; test it before the -gt 127 branch.
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Other error or internal driver error occurred"
+        return 1
+    elif [[ $NVSMI_HEALTHMON_RC -gt 127 ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f))"
+        return 1
+    else
+        log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Returned failure code $NVSMI_HEALTHMON_RC"
+        return 1
+    fi
+}
diff --git a/scripts/lbnl_fs.nhc b/scripts/lbnl_fs.nhc
index d7c9741..377bee4 100644
--- a/scripts/lbnl_fs.nhc
+++ b/scripts/lbnl_fs.nhc
@@ -443,6 +443,27 @@ function check_fs_used() {
     return 0
 }
 
+# Check that all filesystems of type ($1) have at most a specified amount ($2) of
+# used space. This may be either a percentage or a number of kB.
+# The function check_fs_used is called for each filesystem.
+function check_all_fs_used() {
+    local TYPE=$1
+    local MAX_USED=$2
+    local i
+
+    if [[ ${#DF_DEV[*]} -eq 0 ]]; then
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DF_DEV[*]}; i++)); do
+        if [[ "${DF_TYPE[$i]}" != "$TYPE" ]]; then
+            continue
+        fi
+        check_fs_used "${DF_MNTPT[$i]}" "$MAX_USED"
+    done
+    return 0
+}
+
 # Check to make sure a filesystem ($1) has between a minimum ($2) and
 # a maximum ($3) amount of inodes. Either may be blank. To check for
 # a specific inode count, pass the same value for both parameters.
@@ -484,6 +505,28 @@ function check_fs_inodes() {
     return 0
 }
 
+# Check to make sure all filesystems of type ($1) have between a minimum ($2) and
+# a maximum ($3) amount of inodes. Either may be blank. To check for
+# a specific inode count, pass the same value for both parameters.
+function check_all_fs_inodes() {
+    local TYPE=$1
+    local MIN_INODES=$2
+    local MAX_INODES=$3
+    local i
+
+    if [[ ${#DFI_DEV[*]} -eq 0 ]]; then  # guard on the same array the loop walks
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DFI_DEV[*]}; i++)); do
+        if [[ "${DFI_TYPE[$i]}" != "$TYPE" ]]; then
+            continue
+        fi
+        check_fs_inodes "${DFI_MNTPT[$i]}" "$MIN_INODES" "$MAX_INODES"  # $i indexes the DFI_* arrays
+    done
+    return 0
+}
+
 # Check that filesystem ($1) has at least a specified amount ($2) of
 # free inodes. This may be either a percentage or an exact number.
 function check_fs_ifree() {
@@ -525,6 +568,26 @@ function check_fs_ifree() {
     return 0
 }
 
+# Check that all filesystems of type ($1) have at least a specified amount ($2) of
+# free inodes. This may be either a percentage or an exact number.
+function check_all_fs_ifree() {
+    local TYPE=$1
+    local MIN_IFREE=$2
+    local i
+
+    if [[ ${#DFI_DEV[*]} -eq 0 ]]; then  # guard on the same array the loop walks
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DFI_DEV[*]}; i++)); do
+        if [[ "${DFI_TYPE[$i]}" != "$TYPE" ]]; then
+            continue
+        fi
+        check_fs_ifree "${DFI_MNTPT[$i]}" "$MIN_IFREE"  # $i indexes the DFI_* arrays
+    done
+    return 0
+}
+
 # Check that filesystem ($1) has at most a specified amount ($2) of
 # used inodes. This may be either a percentage or an exact number.
 function check_fs_iused() {
@@ -564,3 +627,23 @@ function check_fs_iused() {
     done
     return 0
 }
+
+# Check that all filesystems of type ($1) have at most a specified amount ($2) of
+# used inodes. This may be either a percentage or an exact number.
+function check_all_fs_iused() {
+    local TYPE=$1
+    local MAX_IUSED=$2
+    local i
+
+    if [[ ${#DFI_DEV[*]} -eq 0 ]]; then  # guard on the same array the loop walks
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DFI_DEV[*]}; i++)); do
+        if [[ "${DFI_TYPE[$i]}" != "$TYPE" ]]; then
+            continue
+        fi
+        check_fs_iused "${DFI_MNTPT[$i]}" "${MAX_IUSED}"
+    done
+    return 0
+}
diff --git a/scripts/lbnl_hw.nhc b/scripts/lbnl_hw.nhc
index e73d197..9dea7d6 100644
--- a/scripts/lbnl_hw.nhc
+++ b/scripts/lbnl_hw.nhc
@@ -117,6 +117,9 @@ function nhc_hw_gather_data() {
     # Check if user-leved mad driver loaded and IB diag tools will succeed to run
     if [[ -f /sys/class/infiniband_mad/abi_version ]]; then
         read HW_IB_UMAD_ABI_VER < /sys/class/infiniband_mad/abi_version
+    elif [[ ! -d /sys/class/infiniband_mad ]]; then
+        # No Infiniband device was found
+        HW_IB_UMAD_ABI_VER=-1
     else
         HW_IB_UMAD_ABI_VER=0
     fi
@@ -338,6 +341,9 @@ function check_hw_ib() {
     if [[ $HW_IB_UMAD_ABI_VER -eq 0 ]]; then
         die 1 "$FUNCNAME: Version mismatch between kernel OFED drivers and userspace OFED libraries."
         return 1
+    elif [[ $HW_IB_UMAD_ABI_VER -eq -1 ]]; then
+        die 1 "$FUNCNAME: No Infiniband device was found."
+        return 1
     fi
 
     for ((i=0; i < ${#HW_IB_STATE[*]}; i++)); do