diff --git a/Makefile.am b/Makefile.am
index a8a5fbc..0358d38 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,7 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \
scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \
scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \
scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \
- scripts/lbnl_ps.nhc
+ scripts/lbnl_ps.nhc scripts/csc_nvidia_smi.nhc
MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing
DISTCLEANFILES =
diff --git a/README.md b/README.md
index ad9c570..8780895 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,8 @@ The NHC Yum repository is currently unavailable, but we hope to provide one in t
The [source tarball for the latest release](https://github.com/mej/nhc/releases/download/1.4.2/lbnl-nhc-1.4.2.tar.xz) is also available via the [NHC Project on GitHub](https://github.com/mej/nhc/). If you prefer to install from source, or aren't using one of the distributions shown above, use the commands shown here:
```
+# yum install automake
+# ./autogen.sh
# ./configure --prefix=/usr --sysconfdir=/etc --libexecdir=/usr/libexec
# make test
# make install
@@ -670,6 +672,17 @@ _**Example** (make sure `/tmp` has at least 1000 inodes)_: `check_fs_inodes /tm
+##### check_all_fs_inodes
+`check_all_fs_inodes fstype [min] [max]`
+
+Ensures that all filesystems of type _fstype_ have at least _min_ but no more than _max_ total inodes. Either _min_ or _max_ may be blank. Calls check_fs_inodes for each filesystem.
+
+_**Example** (make sure xfs filesystems have at least 1000 inodes)_: `check_all_fs_inodes xfs 1k`
+
+
+
+
+
##### check_fs_ifree
`check_fs_ifree mountpoint min`
@@ -683,6 +696,17 @@ _**Example** (make sure `/local` has at least 100 inodes free)_: `check_fs_ifre
+##### check_all_fs_ifree
+`check_all_fs_ifree fstype min`
+
+Ensures that all filesystems of type _fstype_ have at least _min_ free inodes. Calls check_fs_ifree for each filesystem.
+
+_**Example** (make sure xfs filesystems have at least 100 inodes free)_: `check_all_fs_ifree xfs 100`
+
+
+
+
+
##### check_fs_iused
`check_fs_iused mountpoint max`
@@ -696,6 +720,17 @@ _**Example** (make sure `/tmp` has no more than 1 million used inodes)_: `check
+##### check_all_fs_iused
+`check_all_fs_iused fstype max`
+
+Ensures that all filesystems of type _fstype_ have no more than _max_ used inodes. Calls check_fs_iused for each filesystem.
+
+_**Example** (make sure xfs filesystems have no more than 1 million used inodes)_: `check_all_fs_iused xfs 1M`
+
+
+
+
+
##### check_fs_mount
`check_fs_mount [-0] [-r] [-t fstype] [-s source] [-o options] [-O remount_options] [-e missing_action] [-E found_action] {-f|-F} mountpoint [...]`
@@ -778,6 +813,17 @@ _**Example**_: `check_fs_used / 98%`
+##### check_all_fs_used
+`check_all_fs_used fstype maxused`
+
+Checks that all filesystems of type _fstype_ have less than _maxused_ space consumed. Calls check_fs_used for each filesystem.
+
+_**Example**_: `check_all_fs_used xfs 98%`
+
+
+
+
+
##### check_hw_cpuinfo
`check_hw_cpuinfo [sockets] [cores] [threads]`
diff --git a/scripts/csc_nvidia_smi.nhc b/scripts/csc_nvidia_smi.nhc
new file mode 100644
index 0000000..df1f79c
--- /dev/null
+++ b/scripts/csc_nvidia_smi.nhc
@@ -0,0 +1,67 @@
+# NHC - nVidia GPU Checks
+#
+# Johan Guldmyr
+# 17 Dec 2015
+#
+# Command and extra arguments used to invoke nvidia-smi; values that are already set are kept.
+NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
+NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"
+# Cached results of one run of the command; an empty NVSMI_HEALTHMON_RC means it has not been run yet.
+NVSMI_HEALTHMON_LINES=( )
+NVSMI_HEALTHMON_OUTPUT=""
+NVSMI_HEALTHMON_RC=""
+
+export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC
+# Run nvidia-smi once and store its stdout, exit status and output lines in the NVSMI_HEALTHMON_* globals.
+function nhc_nvsmi_gather_data() {
+ local IFS  # localize IFS so the assignment below does not outlive this function
+ # stderr is discarded; only stdout and the exit status are recorded.
+ NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
+ NVSMI_HEALTHMON_RC=$?
+ IFS=$'\n'  # split the captured output on newlines only
+ NVSMI_HEALTHMON_LINES=( $NVSMI_HEALTHMON_OUTPUT )
+}
+
+# Run the nvidia-smi utility and fail the check unless it exits with
+# status 0.
+#
+# Takes no arguments.  nvidia-smi is run through nhc_nvsmi_gather_data
+# unless NVSMI_HEALTHMON_RC already holds a cached exit status.
+# Returns 0 when that status is 0; otherwise calls die with a
+# description of the status and returns 1.
+function check_nvsmi_healthmon() {
+    local REASON=""
+
+    if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
+        nhc_nvsmi_gather_data
+    fi
+
+    # Documented nvidia-smi exit statuses; 255 is its generic error, not a signal.
+    case "$NVSMI_HEALTHMON_RC" in
+        0)
+            dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
+            return 0
+            ;;
+        2)   REASON="Invalid argument or flag" ;;
+        4)   REASON="Permission denied" ;;
+        8)   REASON="Power cables not attached" ;;
+        9)   REASON="NVIDIA driver not loaded" ;;
+        10)  REASON="Interrupt issue with a GPU" ;;
+        12)  REASON="NVML shared library could not be found" ;;
+        14)  REASON="InfoROM is corrupted" ;;
+        15)  REASON="The GPU has fallen off the bus or has otherwise become inaccessible" ;;
+        255) REASON="Other error or internal driver error occurred" ;;
+    esac
+
+    if [[ -n "$REASON" ]]; then
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: $REASON"
+    elif [[ $NVSMI_HEALTHMON_RC -gt 127 ]]; then
+        # A status of 128+N is how the shell reports death by signal N.
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f))"
+    else
+        # Unrecognized status: log the captured output to aid diagnosis.
+        log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
+        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Returned failure code $NVSMI_HEALTHMON_RC"
+    fi
+    return 1
+}
diff --git a/scripts/lbnl_fs.nhc b/scripts/lbnl_fs.nhc
index d7c9741..377bee4 100644
--- a/scripts/lbnl_fs.nhc
+++ b/scripts/lbnl_fs.nhc
@@ -443,6 +443,27 @@ function check_fs_used() {
return 0
}
+# Apply check_fs_used to every filesystem whose type is ($1), passing
+# along the used-space limit ($2), which may be either a percentage or
+# a number of kB.  Returns 0 once all matching entries are processed.
+function check_all_fs_used() {
+    local TYPE=$1
+    local MAX_USED=$2
+    local i=0
+
+    # Load the DF_* arrays if they are still empty.
+    [[ ${#DF_DEV[*]} -ne 0 ]] || nhc_fs_df_gather_data
+
+    # Walk every df entry and check only those of the requested type.
+    while (( i < ${#DF_DEV[*]} )); do
+        if [[ "${DF_TYPE[$i]}" == "$TYPE" ]]; then
+            check_fs_used "${DF_MNTPT[$i]}" "$MAX_USED"
+        fi
+        (( i++ ))
+    done
+    return 0
+}
+
# Check to make sure a filesystem ($1) has between a minimum ($2) and
# a maximum ($3) amount of inodes. Either may be blank. To check for
# a specific inode count, pass the same value for both parameters.
@@ -484,6 +505,28 @@ function check_fs_inodes() {
return 0
}
+# Check that every filesystem of type ($1) has between a minimum ($2) and
+# a maximum ($3) number of inodes by calling check_fs_inodes on each one.
+# Either limit may be blank; pass the same value twice for an exact count.
+function check_all_fs_inodes() {
+    local TYPE=$1
+    local MIN_INODES=$2
+    local MAX_INODES=$3
+    local i
+
+    # NOTE(review): guard tests DF_DEV but the loop walks DFI_* -- confirm both get filled.
+    if [[ ${#DF_DEV[*]} -eq 0 ]]; then
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DFI_DEV[*]}; i++)); do
+        [[ "${DFI_TYPE[$i]}" == "$TYPE" ]] || continue
+        # Index i belongs to the DFI_* arrays, so take the mount point from DFI_MNTPT.
+        check_fs_inodes "${DFI_MNTPT[$i]}" "$MIN_INODES" "$MAX_INODES"
+    done
+    return 0
+}
+
# Check that filesystem ($1) has at least a specified amount ($2) of
# free inodes. This may be either a percentage or an exact number.
function check_fs_ifree() {
@@ -525,6 +568,26 @@ function check_fs_ifree() {
return 0
}
+# Check that every filesystem of type ($1) has at least a specified amount
+# ($2) of free inodes, given as either a percentage or an exact number.
+function check_all_fs_ifree() {
+    local TYPE=$1
+    local MIN_IFREE=$2
+    local i
+
+    # NOTE(review): guard tests DF_DEV but the loop walks DFI_* -- confirm both get filled.
+    if [[ ${#DF_DEV[*]} -eq 0 ]]; then
+        nhc_fs_df_gather_data
+    fi
+
+    for ((i=0; i < ${#DFI_DEV[*]}; i++)); do
+        [[ "${DFI_TYPE[$i]}" == "$TYPE" ]] || continue
+        # Index i belongs to the DFI_* arrays, so take the mount point from DFI_MNTPT.
+        check_fs_ifree "${DFI_MNTPT[$i]}" "$MIN_IFREE"
+    done
+    return 0
+}
+
# Check that filesystem ($1) has at most a specified amount ($2) of
# used inodes. This may be either a percentage or an exact number.
function check_fs_iused() {
@@ -564,3 +627,23 @@ function check_fs_iused() {
done
return 0
}
+
+# Run check_fs_iused on every filesystem of type ($1), allowing at most
+# ($2) used inodes, given as either a percentage or an exact number.
+function check_all_fs_iused() {
+    local TYPE=$1
+    local MAX_IUSED=$2
+    local i=0
+
+    # Gather df data first if the DF_DEV array is still empty.
+    [[ ${#DF_DEV[*]} -ne 0 ]] || nhc_fs_df_gather_data
+
+    # Visit each DFI_* entry; entries of other types are skipped.
+    while (( i < ${#DFI_DEV[*]} )); do
+        if [[ "${DFI_TYPE[$i]}" == "$TYPE" ]]; then
+            check_fs_iused "${DFI_MNTPT[$i]}" "${MAX_IUSED}"
+        fi
+        (( i++ ))
+    done
+    return 0
+}
diff --git a/scripts/lbnl_hw.nhc b/scripts/lbnl_hw.nhc
index e73d197..9dea7d6 100644
--- a/scripts/lbnl_hw.nhc
+++ b/scripts/lbnl_hw.nhc
@@ -117,6 +117,9 @@ function nhc_hw_gather_data() {
# Check if user-leved mad driver loaded and IB diag tools will succeed to run
if [[ -f /sys/class/infiniband_mad/abi_version ]]; then
read HW_IB_UMAD_ABI_VER < /sys/class/infiniband_mad/abi_version
+ elif [[ ! -d /sys/class/infiniband_mad ]]; then
+ # No Infiniband device was found
+ HW_IB_UMAD_ABI_VER=-1
else
HW_IB_UMAD_ABI_VER=0
fi
@@ -338,6 +341,9 @@ function check_hw_ib() {
if [[ $HW_IB_UMAD_ABI_VER -eq 0 ]]; then
die 1 "$FUNCNAME: Version mismatch between kernel OFED drivers and userspace OFED libraries."
return 1
+ elif [[ $HW_IB_UMAD_ABI_VER -eq -1 ]]; then
+ die 1 "$FUNCNAME: No Infiniband device was found."
+ return 1
fi
for ((i=0; i < ${#HW_IB_STATE[*]}; i++)); do