Skip to content

Commit

Permalink
Improve fix-lnet.sh (#110)
Browse files Browse the repository at this point in the history
* Improve lnet fix

* Disable scale testing at 32 pods and above
  • Loading branch information
vinli-cn authored Mar 9, 2023
1 parent 99c4acd commit ca1f68d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 12 deletions.
10 changes: 8 additions & 2 deletions pkg/azurelustreplugin/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,16 @@ if [[ "${installClientPackages}" == "yes" ]]; then
sed -i "s/{default_interface}/${default_interface}/g;" ./fix-lnet.sh
cp ./fix-lnet.sh /etc/lustre

# legacy rules 73 & 74
test -e /etc/udev/rules.d/73-netadd.rules && rm -f /etc/udev/rules.d/73-netadd.rules
test -e /etc/udev/rules.d/74-netremove.rules && rm -f /etc/udev/rules.d/74-netremove.rules
echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/73-netadd.rules
echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/74-netremove.rules

# current rules 98 & 99
test -e /etc/udev/rules.d/98-netadd.rules && rm -f /etc/udev/rules.d/98-netadd.rules
test -e /etc/udev/rules.d/99-netremove.rules && rm -f /etc/udev/rules.d/99-netremove.rules

echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/98-netadd.rules
echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/99-netremove.rules

echo "$(date -u) Reloading udevadm"
udevadm control --reload
Expand Down
35 changes: 26 additions & 9 deletions pkg/azurelustreplugin/fix-lnet.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
#!/bin/bash

for run_id in 0 2 4; do
sleep $run_id
(
flock -w 60 -e ${FD}
if sudo lnetctl net show --net tcp | grep "status: down"; then
/usr/sbin/lnetctl net del --net tcp
/usr/sbin/lnetctl net add --net tcp --if {default_interface}
fi
) {FD}< /etc/lustre/.lock
# Repair the LNet "tcp" network when lnetctl reports it as down.
#
# The check-and-repair step runs under an exclusive flock on
# /etc/lustre/.lock so that concurrent invocations of this script do not
# delete/add the network at the same time.
# NOTE: {default_interface} is a template placeholder, not a shell expansion;
# it is expected to be substituted with the real interface name before this
# script is installed.
/usr/bin/logger "PID $$: Start fix-lnet"
count=1

# Try to fix lnet 5 times maximum: once immediately, then after 0.5s pauses.
for sleep_in_secs in 0 0.5 0.5 0.5 0.5; do
  sleep "$sleep_in_secs"

  # Everything the subshell writes to stdout becomes the value of break_flag,
  # so the only thing allowed on stdout is the final true/false marker:
  #   - grep runs with -q (a plain grep would print the matched
  #     "status: down" line into the captured value and the comparison
  #     against "true" below would never succeed);
  #   - lnetctl del/add output is sent to stderr.
  break_flag=$(
    (
      break_flag_inner=false
      # Wait up to 60s for the exclusive lock on the descriptor opened below.
      flock -w 60 -e "${FD}"
      if sudo lnetctl net show --net tcp | grep -q "status: down"; then
        /usr/sbin/lnetctl net del --net tcp >&2
        /usr/sbin/lnetctl net add --net tcp --if {default_interface} >&2
        break_flag_inner=true
      fi
      echo "$break_flag_inner"
    ) {FD}< /etc/lustre/.lock
  )

  if [[ "$break_flag" == true ]]; then
    # The network was down and has been re-added; no further attempts needed.
    break
  else
    /usr/bin/logger "PID $$: Skipped fix-lnet, count=$count"
    count=$((count + 1))
  fi
done
3 changes: 2 additions & 1 deletion test/long-haul/perf-scale-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
# limitations under the License.

# Run the scale test with static provisioning for the
# azurelustre.csi.azure.com driver.
# Scales 32 64 128 256 are disabled for the Lnet fix.
# NOTE: keep this comment above the command. A "#" line placed between
# backslash-continued lines terminates the command at that point, and the
# options that follow it would be executed as a separate (failing) command.
python "$(pwd)/../scale/run_test.py" --provisioning-type static \
  --scales 4 8 16 \
  --csi-name azurelustre.csi.azure.com \
  --mgs-ip-address "${LustreFSIP:-172.18.32.5}" \
  --fs-name lustrefs

0 comments on commit ca1f68d

Please sign in to comment.