Skip to content

Commit

Permalink
[bug] fix lnet (#99)
Browse files Browse the repository at this point in the history
* fix lnet

* update image

* add missing files

* fix typo

* fix test commands
  • Loading branch information
chyin6 authored Feb 24, 2023
1 parent 4fff861 commit 42a79ad
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/_output*/
/_output
azurelustreplugin
!pkg/azurelustreplugin

# Emacs save files
*~
Expand Down
53 changes: 29 additions & 24 deletions deploy/csi-azurelustre-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,27 +127,28 @@ spec:
volumeMounts:
- mountPath: /csi
name: socket-dir
- mountPath: /var/lib/kubelet/
mountPropagation: Bidirectional
name: mountpoint-dir
- mountPath: /etc/kubernetes/
name: azure-cred
- mountPath: /mnt
name: azurelustre-cache
- mountPath: /dev
name: host-dev
- mountPath: /host/var/log
name: host-var-log
# - mountPath: /usr
# name: host-usr
# - mountPath: /etc
# name: host-etc
# - mountPath: /lib
# name: host-lib
- mountPath: /var
mountPropagation: Bidirectional
name: host-var
- mountPath: /usr
name: host-usr
- mountPath: /etc
name: host-etc
# udevadm needs this
- mountPath: /run/udev
name: host-run-udev
- mountPath: /lib
name: host-lib
- mountPath: /lib64
name: host-lib64
- mountPath: /sbin
name: host-sbin
- mountPath: /etc/host-os-release
name: host-os-release
- mountPath: /bin
name: host-bin
resources:
limits:
cpu: 1
Expand All @@ -160,10 +161,6 @@ spec:
path: /var/lib/kubelet/plugins/azurelustre.csi.azure.com
type: DirectoryOrCreate
name: socket-dir
- hostPath:
path: /var/lib/kubelet/
type: DirectoryOrCreate
name: mountpoint-dir
- hostPath:
path: /var/lib/kubelet/plugins_registry/
type: DirectoryOrCreate
Expand All @@ -181,9 +178,9 @@ spec:
type: Directory
name: host-dev
- hostPath:
path: /var/log
path: /var
type: Directory
name: host-var-log
name: host-var
- hostPath:
path: /usr
type: Directory
Expand All @@ -192,15 +189,23 @@ spec:
path: /etc
type: Directory
name: host-etc
- hostPath:
path: /run/udev
type: Directory
name: host-run-udev
- hostPath:
path: /lib
type: Directory
name: host-lib
- hostPath:
path: /lib64
type: Directory
name: host-lib64
- hostPath:
path: /sbin
type: Directory
name: host-sbin
- hostPath:
path: /etc/os-release
type: File
name: host-os-release
path: /bin
type: Directory
name: host-bin
21 changes: 11 additions & 10 deletions pkg/azurelustreplugin/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,22 @@ FROM ubuntu:18.04

COPY "./_output/azurelustreplugin" "/app/azurelustreplugin"
COPY "./pkg/azurelustreplugin/entrypoint.sh" "/app/entrypoint.sh"
COPY "./pkg/azurelustreplugin/fix-lnet.sh" "/app/fix-lnet.sh"

RUN chmod +x "/app/entrypoint.sh"
RUN chmod +x "/app/fix-lnet.sh"

RUN apt-get update

# RUN apt-get update

# Install all Lustre packages dependencies
RUN apt-get install -y --no-install-recommends ca-certificates curl distro-info-data gpg gpgconf kmod libasn1-8-heimdal \
libassuan0 libcurl4 libexpat1 libgssapi-krb5-2 libgssapi3-heimdal libhcrypto4-heimdal libheimbase1-heimdal \
libheimntlm0-heimdal libhx509-5-heimdal libk5crypto3 libkeyutils1 libkmod2 libkrb5-26-heimdal libkrb5-3 libkrb5support0 \
libldap-2.4-2 libldap-common libmpdec2 libnghttp2-14 libpsl5 libpython3-stdlib libpython3.6-minimal libpython3.6-stdlib \
libreadline7 libroken18-heimdal librtmp1 libsasl2-2 libsasl2-modules-db libsqlite3-0 libssl1.1 libwind0-heimdal \
libyaml-0-2 linux-base linux-base-sgx mime-support openssl python3 python3-minimal python3.6 python3.6-minimal readline-common

RUN apt-get clean all
# RUN apt-get install -y --no-install-recommends ca-certificates curl distro-info-data gpg gpgconf kmod libasn1-8-heimdal \
# libassuan0 libcurl4 libexpat1 libgssapi-krb5-2 libgssapi3-heimdal libhcrypto4-heimdal libheimbase1-heimdal \
# libheimntlm0-heimdal libhx509-5-heimdal libk5crypto3 libkeyutils1 libkmod2 libkrb5-26-heimdal libkrb5-3 libkrb5support0 \
# libldap-2.4-2 libldap-common libmpdec2 libnghttp2-14 libpsl5 libpython3-stdlib libpython3.6-minimal libpython3.6-stdlib \
# libreadline7 libroken18-heimdal librtmp1 libsasl2-2 libsasl2-modules-db libsqlite3-0 libssl1.1 libwind0-heimdal \
# libyaml-0-2 linux-base linux-base-sgx mime-support openssl python3 python3-minimal python3.6 python3.6-minimal readline-common

# RUN apt-get clean all

WORKDIR "/app"

Expand Down
93 changes: 66 additions & 27 deletions pkg/azurelustreplugin/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,22 @@ echo "pkgVersion: ${pkgVersion}"
pkgName="amlfs-lustre-client-${pkgVersion}"
echo "pkgName: ${pkgName}"

if [[ ! -z $(grep -R 'bionic' /etc/host-os-release) ]]; then
if [[ ! -z $(grep -R 'bionic' /etc/os-release) ]]; then
osReleaseCodeName="bionic"
elif [[ ! -z $(grep -R 'jammy' /etc/host-os-release) ]]; then
cat << EOF | tee /etc/apt/sources.list.d/jammy.list
deb http://azure.archive.ubuntu.com/ubuntu/ jammy main restricted
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates main restricted
deb http://azure.archive.ubuntu.com/ubuntu/ jammy universe
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates universe
deb http://azure.archive.ubuntu.com/ubuntu/ jammy multiverse
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates multiverse
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security main restricted
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security universe
deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security multiverse
EOF

elif [[ ! -z $(grep -R 'jammy' /etc/os-release) ]]; then
# cat << EOF | tee /etc/apt/sources.list.d/jammy.list
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy main restricted
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates main restricted
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy universe
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates universe
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy multiverse
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates multiverse
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security main restricted
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security universe
# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security multiverse
# EOF
#
osReleaseCodeName="jammy"
else
echo "Unsupported Linux distro"
Expand All @@ -65,10 +65,12 @@ if [[ "${installClientPackages}" == "yes" ]]; then

echo "$(date -u) Installing Lustre client packages for OS=${osReleaseCodeName}, kernel=${kernelVersion} "

curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null
echo "deb [arch=amd64] https://packages.microsoft.com/repos/amlfs-${osReleaseCodeName}/ ${osReleaseCodeName} main" | tee /etc/apt/sources.list.d/amlfs.list
apt-get update

if [ ! -f /etc/apt/sources.list.d/amlfs.list ]; then
curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null
echo "deb [arch=amd64] https://packages.microsoft.com/repos/amlfs-${osReleaseCodeName}/ ${osReleaseCodeName} main" | tee /etc/apt/sources.list.d/amlfs.list
apt-get update
fi

echo "$(date -u) Installing Lustre client modules: ${pkgName}=${kernelVersion}"

# grub issue
Expand All @@ -78,18 +80,55 @@ if [[ "${installClientPackages}" == "yes" ]]; then

echo "$(date -u) Installed Lustre client packages."

echo "$(date -u) Enabling Lustre client kernel modules."
# Decide whether LNet still needs to be initialized. It is considered ready
# only when the lnet module is loaded AND the tcp network already reports
# interfaces; in every other case the setup below is performed.
init_lnet="true"

if lsmod | grep "^lnet"; then
  if lnetctl net show --net tcp | grep interfaces; then
    echo "$(date -u) LNet is loaded skip the load."
    init_lnet="false"
  fi
fi

if [[ "${init_lnet}" == "true" ]]; then
  echo "$(date -u) Loading the LNet."
  modprobe -v lnet
  lnetctl lnet configure

  echo "$(date -u) Determining the default network interface."
  # perl is installed as a dependency of the Lustre client packages.
  echo "$(date -u) Route table is:"
  ip route list
  # Take the device of the FIRST default route only. Printing every match
  # would glue several interface names together when more than one default
  # route exists. \S+ also accepts names containing '-', '_' or '.', and does
  # not require further fields after the device name.
  default_interface=$(ip route list | perl -n -e 'if (!$found && /^default via [0-9.]+ dev (\S+)/) { print $1; $found = 1 }')
  echo "$(date -u) Default network interface is ${default_interface}"

  if [[ -z "${default_interface}" ]]; then
    echo "$(date -u) Cannot determine the default network interface"
    exit 1
  fi

  lnetctl net add --net tcp --if "${default_interface}"

  echo "$(date -u) Adding the udev script."
  # -p: succeed when the directory already exists and create parents if needed.
  mkdir -p /etc/lustre
  # Lock file used by fix-lnet.sh to serialize concurrent udev invocations.
  touch /etc/lustre/.lock
  # rm -f is already a no-op for a missing file, so no existence test is needed.
  rm -f /etc/lustre/fix-lnet.sh
  # Bake the detected interface into the hook template, then install it.
  sed -i "s/{default_interface}/${default_interface}/g;" ./fix-lnet.sh
  cp ./fix-lnet.sh /etc/lustre

  # Re-run the hook whenever a network device is added or removed.
  rm -f /etc/udev/rules.d/73-netadd.rules
  rm -f /etc/udev/rules.d/74-netremove.rules
  echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/73-netadd.rules
  echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/74-netremove.rules

  echo "$(date -u) Reloading udevadm"
  udevadm control --reload
  echo "$(date -u) Done"
fi

modprobe -v ksocklnd
modprobe -v lnet
echo "$(date -u) Enabling Lustre client kernel modules."
modprobe -v mgc
modprobe -v lustre

# For some reason, this is a false positive before we restart the container
# The volume mount succeeds later even if this returns a failure
# We need to revisit this after moving the script to run on AKS node
lctl network up || true

echo "$(date -u) Enabled Lustre client kernel modules."

fi
Expand Down
9 changes: 9 additions & 0 deletions pkg/azurelustreplugin/fix-lnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# udev hook: re-creates the LNet "tcp" network when it is reported as down.
#
# The literal {default_interface} token below is a template placeholder; it is
# substituted with the real interface name (via sed) before this script is
# installed to /etc/lustre, so it must stay exactly as written.
#
# /etc/lustre/.lock is opened on a shell-allocated descriptor (stored in FD)
# and used as an exclusive lock so that concurrent udev events do not race.

(
  # Wait up to 60 seconds for the exclusive lock. If it cannot be obtained,
  # stop instead of reconfiguring LNet without holding the lock.
  flock -w 60 -e "${FD}" || exit 1

  # Call lnetctl by absolute path and without sudo: udev RUN programs already
  # execute as root, and sudo may be missing or not on udev's PATH.
  if /usr/sbin/lnetctl net show --net tcp | grep -q "status: down"; then
    /usr/sbin/lnetctl net del --net tcp
    /usr/sbin/lnetctl net add --net tcp --if {default_interface}
  fi
) {FD}< /etc/lustre/.lock

0 comments on commit 42a79ad

Please sign in to comment.