-
Notifications
You must be signed in to change notification settings - Fork 96
/
showpower
executable file
·159 lines (150 loc) · 4.32 KB
/
showpower
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env bash
# Print the nodes power values from scontrol show node (CurrentWatts etc.)
# Usage: showpower < -w node-list | -p partition(s) | -a | -h > [ -S sorting-variable ] [ -s ]
#
# Note: The sinfo command cannot print power values, so we have to use scontrol.
# To enable Slurm power monitoring:
# Several AcctGatherEnergyType types are defined in the slurm.conf manual page.
# RAPL data gathering can be enabled in Slurm by:
# AcctGatherEnergyType=acct_gather_energy/rapl
# AcctGatherNodeFreq=30
# and do an "scontrol reconfig".
# Command usage:
#######################################
# Print the command synopsis and the option summary.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout; $0 is expanded into the synopsis line
# Returns:   exit status of cat
#######################################
usage() {
  # -s is an optional flag (see the getopts string "p:w:S:ash"), so it is
  # shown in brackets like -S rather than as a mandatory trailing token.
  cat <<EOF
Usage: $0 < -w node-list | -p partition(s) | -a | -h > [ -S sorting-variable ] [ -s ]
where:
-w node-list: Print this node-list
-p partition(s): Print this partition
-a: All nodes in the cluster
-s: Summarize total power only
-h: Print help information
-S: Sort output by this column (e.g. CurrentWatts)
EOF
}
# ---------------------------------------------------------------------------
# Command-line parsing and validation.
#
# Results are exported for the report stage that follows:
#   nodelist   node-list expression selected with -w, -p or -a
#   sortvar    column used for sorting (default NodeName)
#   summarize  1 = print only the summary, 0 = print every node
# All diagnostics go to stderr and every error path exits with status 1;
# only an explicit -h exits with status 0.
# ---------------------------------------------------------------------------
if (( $# == 0 )); then
  echo "ERROR: No arguments given" >&2
  usage >&2
  exit 1
fi

# Node-list expressions such as "node[001-100]" are also valid glob patterns.
# $nodelist is expanded unquoted below (as it always was, so that word
# splitting discards any whitespace surrounding the sinfo output); turning
# pathname expansion off keeps a matching file name in the current directory
# from silently replacing the node-list.
set -o noglob

# Default output sorting variable
export sortvar="NodeName"
# Print per-node lines unless -s is given
export summarize=0

while getopts "p:w:S:ash" options; do
  case "$options" in
    p)
      nodelist=$(sinfo -h -p "$OPTARG" -O NodeList:)
      export nodelist
      if [[ -z "$nodelist" ]]; then
        echo "ERROR: Partition $OPTARG does not exist on this cluster" >&2
        usage >&2
        exit 1
      fi
      ;;
    a)
      nodelist=$(sinfo -h --all -O NodeList:)
      export nodelist
      ;;
    w) export nodelist="$OPTARG" ;;
    S) export sortvar="$OPTARG" ;;
    s) export summarize=1 ;;
    h)
      usage
      exit 0
      ;;
    *)
      # getopts reports unknown options and missing option arguments as "?"
      # (after printing its own message); that is an error, not a help request.
      usage >&2
      exit 1
      ;;
  esac
done

# Test for extraneous command line arguments
if (( $# > OPTIND - 1 )); then
  echo "ERROR: Too many command line arguments: $*" >&2
  usage >&2
  exit 1
fi

# One of -w, -p or -a must have selected some nodes
if [[ -z "${nodelist:-}" ]]; then
  echo "ERROR: No nodes selected, use one of -w, -p or -a" >&2
  usage >&2
  exit 1
fi

# Sanity check of nodelist
# shellcheck disable=SC2086  # unquoted on purpose, see the noglob note above
if [[ -z "$(sinfo -h -N -n $nodelist -O NodeList:)" ]]; then
  echo "ERROR: Node-list $nodelist does not exist on this cluster" >&2
  usage >&2
  exit 1
fi
# ---------------------------------------------------------------------------
# Report stage: feed "scontrol -o show node" output into gawk.
#
# Each input record is expected to start with NodeName=<name>; every
# whitespace-separated key=value field of the record is stored in
# value[key][node].  The END rule prints one line per node (unless the
# exported variable summarize is > 0) followed by TOTAL and Average lines.
# The exported variable sortvar names the key whose values order the output.
#
# The program needs GNU awk (arrays of arrays, PROCINFO["sorted_in"]), so it
# is invoked as gawk rather than relying on awk being GNU awk.
# Errors are written to stderr and make the pipeline exit with status 1.
#
# NOTE(review): $nodelist is left unquoted exactly as before; quoting it would
# change how whitespace in the sinfo-generated list is handled - confirm
# before changing.
# ---------------------------------------------------------------------------
# shellcheck disable=SC2086
scontrol -o show node $nodelist | gawk '
BEGIN {
  # Read environment variables
  sortvar = ENVIRON["sortvar"]
  summarize = ENVIRON["summarize"]
  # Per-node lines are suppressed when only the summary is wanted
  printnodes = (summarize > 0) ? 0 : 1
  # Set when a record cannot be parsed, so that END skips the report
  failed = 0
}
{
  # Get the NodeName n from $1
  if (split($1, pair, "=") != 2) {
    print "Failed to read values from field: " $1 " in line:" > "/dev/stderr"
    print $0 > "/dev/stderr"
    failed = 1
    exit 1
  }
  if (pair[1] != "NodeName") {
    print "Failed to read NodeName from field: " $1 " in line:" > "/dev/stderr"
    print $0 > "/dev/stderr"
    failed = 1
    exit 1
  }
  # The current NodeName is assigned to n
  n = pair[2]
  for (i = 1; i <= NF; i++) {
    # Get variable=value pairs (fields that do not split into exactly
    # two parts on "=" are skipped)
    if (split($i, pair, "=") == 2)
      value[pair[1]][n] = pair[2]
  }
}
# Print the two-line column heading; text labels the first column.
# fmtheading is a local variable.
function printheading(text,    fmtheading)
{
  fmtheading = "%-8s %6s %8s %8s %8s %10.10s %10.10s %10.10s %10.10s\n"
  printf(fmtheading, text, "#CPUs", "CPU-", "Current", "Average", "Cap", "ExtSensors", "ExtSensors", "ExtSensors")
  printf(fmtheading, "", "", "load", "Watts", "Watts", "Watts", "Watts", "Joules", "Temp")
}
END {
  # An exit in the main rule still runs END: do not print a partial
  # report or a misleading sorting error after a parse failure.
  if (failed)
    exit 1
  # No input at all means scontrol produced nothing to report on
  if (NR == 0) {
    print "ERROR: No node information received from scontrol" > "/dev/stderr"
    exit 1
  }
  # Check that the sortvar index exists in the value array
  if (!(sortvar in value)) {
    print "ERROR: Unknown sorting variable=" sortvar > "/dev/stderr"
    PROCINFO["sorted_in"] = "@ind_str_asc"
    printf("Available sorting variables:\n") > "/dev/stderr"
    for (i in value)
      printf("%s ", i) > "/dev/stderr"
    print "" > "/dev/stderr"
    exit 1
  }
  if (printnodes > 0) printheading("Nodename")
  # Print selected variables for all nodes (ascending values)
  PROCINFO["sorted_in"] = "@val_num_asc"
  format = "%-8s %6d %8.1f %8d %8d %10s %10s %10s %10s\n"
  for (n in value[sortvar]) {
    nodecount++
    if (printnodes > 0) printf(format,
      value["NodeName"][n],
      value["CPUTot"][n],
      value["CPULoad"][n],
      value["CurrentWatts"][n],
      value["AveWatts"][n],
      value["CapWatts"][n],
      value["ExtSensorsWatts"][n],
      value["ExtSensorsJoules"][n],
      value["ExtSensorsTemp"][n])
    # Accumulate the totals for the summary
    CPUTot += value["CPUTot"][n]
    CPULoad += value["CPULoad"][n]
    CurrentWatts += value["CurrentWatts"][n]
    AveWatts += value["AveWatts"][n]
    CapWatts += value["CapWatts"][n]
    ExtSensorsWatts += value["ExtSensorsWatts"][n]
    ExtSensorsJoules += value["ExtSensorsJoules"][n]
    ExtSensorsTemp += value["ExtSensorsTemp"][n]
  }
  # The summary is printed for more than one node, or always with -s
  if (nodecount > 1 || printnodes == 0) {
    n = nodecount
    print ""
    printheading("Summary")
    printf(format, "TOTAL", CPUTot, CPULoad, CurrentWatts, AveWatts, CapWatts,
      ExtSensorsWatts, ExtSensorsJoules, ExtSensorsTemp)
    printf(format, "Average", CPUTot/n, CPULoad/n, CurrentWatts/n, AveWatts/n, CapWatts/n,
      ExtSensorsWatts/n, ExtSensorsJoules/n, ExtSensorsTemp/n)
  }
}'