-
Notifications
You must be signed in to change notification settings - Fork 96
/
showpower_nvidia
executable file
·88 lines (76 loc) · 2.15 KB
/
showpower_nvidia
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env bash
# Print the node power values from nodes with an Nvidia GPU.
# The ClusterShell (clush) tool is required.
# An SSH command executes the nvidia-smi tool on the nodes and extracts the power values.
# Command usage:
# Print the command synopsis and option summary on standard output.
# Reads $0 so the synopsis shows the name the script was invoked with.
usage() {
  printf '%s\n' \
    "Usage: $0 < -w node-list | -p partition(s) | -a | -h > -s" \
    'where:' \
    '-w node-list: Print this node-list' \
    '-p partition(s): Print this partition' \
    '-a: All nodes in the cluster' \
    '-s: Summarize total power only' \
    '-h: Print help information' \
    'Default node-list: All nodes with a GRES value of "gpu"'
}
# Global state used by the rest of the script
export nodelist=""	# Node-list to query (empty: a default is chosen later)
export summarize=0	# 1: print only the total power summary

# Scratch file that receives the per-node output.
# Assign and export separately so that a mktemp failure is not masked.
TMPFILE=$(mktemp) || { echo "ERROR: Unable to create a temporary file" >&2; exit 1; }
export TMPFILE
# Remove the scratch file whenever the script exits, so the early error
# exits do not leave it behind.
trap 'rm -f -- "$TMPFILE"' EXIT

# Parse the command line options.
# Arguments: the script's command line ("$@").
# Sets the globals nodelist and summarize.  OPTIND is deliberately NOT made
# local: code after the parser reads it to detect extraneous arguments.
# Exits 0 after -h; exits 1 on an unknown option or an unknown partition.
function parse_options()
{
  local opt
  while getopts "p:w:ash" opt; do
    case "$opt" in
      p )
        nodelist="$(sinfo -h -p "$OPTARG" -O NodeList:)"
        export nodelist
        if [[ -z "$nodelist" ]]; then
          echo "ERROR: Partition $OPTARG does not exist on this cluster" >&2
          usage >&2
          exit 1
        fi
        ;;
      a )
        nodelist="$(sinfo -h --all -O NodeList:)"
        export nodelist
        ;;
      w )
        export nodelist="$OPTARG"
        ;;
      s )
        export summarize=1
        ;;
      h )
        usage
        exit 0
        ;;
      * )
        # getopts has already reported the offending option on stderr;
        # a usage error must not look like success to the caller.
        usage >&2
        exit 1
        ;;
    esac
  done
}
parse_options "$@"
# Test for extraneous command line arguments.
# Arguments: the script's command line ("$@").
# Reads the global OPTIND left by getopts: every argument at or beyond that
# index was not consumed as an option.  Prints the error and the usage text
# on stderr and exits 1 when such arguments exist.
function check_extra_args()
{
  if (( $# > OPTIND - 1 )); then
    # "$*" is quoted so the arguments are echoed literally instead of being
    # glob-expanded (an argument such as '*' used to print file names).
    echo "ERROR: Too many command line arguments: $*" >&2
    usage >&2
    exit 1
  fi
}
check_extra_args "$@"
# Default nodelist: nodes with a GRES value of "gpu"
if [[ -z "$nodelist" ]]; then
  # NOTE(review): unlike the other sinfo calls in this script, this one uses
  # the default -O field width - confirm long node-lists are not truncated.
  nodelist=$(sinfo -O Nodelist,GRES | grep -i gpu | awk '{print $1}')
fi

# The sinfo pipelines may return several lines and padding blanks.  Collapse
# every run of whitespace into one comma so the node-list is a single word
# that can be passed quoted.  Quoting matters: an unquoted node[1-3] is a
# glob pattern, and extra words would be taken as additional arguments.
nodelist=$(printf '%s' "$nodelist" | tr -s '[:space:]' ',')
nodelist=${nodelist#,}
nodelist=${nodelist%,}
export nodelist

# Sanity check of nodelist.  The explicit empty test keeps "sinfo -n" from
# consuming the following option as its argument when no node was found.
if [[ -z "$nodelist" || -z "$(sinfo -h -N -n "$nodelist" -O NodeList:)" ]]; then
  echo "ERROR: Node-list $nodelist does not exist on this cluster" >&2
  usage >&2
  exit 1
fi
# The per-node commands are run through ClusterShell, so clush must be in PATH.
# "command -v" is a shell builtin: unlike "which" it needs no external tool
# and prints nothing when the command is absent.
if ! command -v clush >/dev/null 2>&1; then
  echo "The ClusterShell tool clush is missing" >&2
  exit 1
fi
# Collapse whitespace in the node-list into commas so it can be passed to
# clush as ONE quoted word (an unquoted node[1-3] would be glob-expanded and
# a multi-line list would be split into extra arguments).
targets=$(printf '%s' "$nodelist" | tr -s '[:space:]' ',')
targets=${targets#,}
targets=${targets%,}

# Command executed on every node: when nvidia-smi is installed, sum field 5
# of the nvidia-smi lines containing "W " (the first W on such a line is
# stripped before the sum) and print that sum.
remote_cmd='which nvidia-smi 2>/dev/null > /dev/null && nvidia-smi | grep "W " | sed /W/s/// | awk "{power+=\$5}END{print power}"'

# Store the sorted clush output in the scratch file; the reports below sum
# its second field.
clush -w "$targets" "$remote_cmd" | sort > "$TMPFILE"

if [[ ! -s "$TMPFILE" ]]; then
  echo "ERROR: The nvidia-smi output file from nodes $nodelist is empty" >&2
  # Do not leave the scratch file behind on the error path
  rm -f -- "$TMPFILE"
  exit 1
fi

if [[ "$summarize" == 0 ]]; then
  # Per-node listing followed by the grand total
  awk 'BEGIN{print "Node GPU power (Watt)"}{print $0; total+=$2}END{print "TOTAL: " total}' "$TMPFILE"
else
  # One-line summary: number of reporting nodes, total and average power
  awk '{total+=$2; nodes++}END{printf("TOTAL node GPU power for %d nodes= %d Watt, average= %d Watt\n", nodes, total, total/nodes)}' "$TMPFILE"
fi

rm -f -- "$TMPFILE"