From 42a79adaa9a3787d90db14dbdada194a17c42697 Mon Sep 17 00:00:00 2001
From: Chuanjun Yin
Date: Fri, 24 Feb 2023 21:44:14 +0800
Subject: [PATCH] [bug] fix lnet (#99)

* fix lnet

* update image

* add missing files

* fix typo

* fix test commands
---
 .gitignore                          |  1 +
 deploy/csi-azurelustre-node.yaml    | 53 ++++++++--------
 pkg/azurelustreplugin/Dockerfile    | 21 +++----
 pkg/azurelustreplugin/entrypoint.sh | 93 ++++++++++++++++++++---------
 pkg/azurelustreplugin/fix-lnet.sh   |  9 +++
 5 files changed, 116 insertions(+), 61 deletions(-)
 create mode 100644 pkg/azurelustreplugin/fix-lnet.sh

diff --git a/.gitignore b/.gitignore
index 0441872b7..dd60ba770 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@
 /_output*/
 /_output
 azurelustreplugin
+!pkg/azurelustreplugin
 
 # Emacs save files
 *~
diff --git a/deploy/csi-azurelustre-node.yaml b/deploy/csi-azurelustre-node.yaml
index f3d779a8e..6e48e68d2 100644
--- a/deploy/csi-azurelustre-node.yaml
+++ b/deploy/csi-azurelustre-node.yaml
@@ -127,27 +127,28 @@ spec:
           volumeMounts:
             - mountPath: /csi
               name: socket-dir
-            - mountPath: /var/lib/kubelet/
-              mountPropagation: Bidirectional
-              name: mountpoint-dir
-            - mountPath: /etc/kubernetes/
-              name: azure-cred
             - mountPath: /mnt
               name: azurelustre-cache
             - mountPath: /dev
               name: host-dev
-            - mountPath: /host/var/log
-              name: host-var-log
-            # - mountPath: /usr
-            #   name: host-usr
-            # - mountPath: /etc
-            #   name: host-etc
-            # - mountPath: /lib
-            #   name: host-lib
+            - mountPath: /var
+              mountPropagation: Bidirectional
+              name: host-var
+            - mountPath: /usr
+              name: host-usr
+            - mountPath: /etc
+              name: host-etc
+            # udevadm needs this
+            - mountPath: /run/udev
+              name: host-run-udev
+            - mountPath: /lib
+              name: host-lib
+            - mountPath: /lib64
+              name: host-lib64
             - mountPath: /sbin
               name: host-sbin
-            - mountPath: /etc/host-os-release
-              name: host-os-release
+            - mountPath: /bin
+              name: host-bin
           resources:
             limits:
               cpu: 1
@@ -160,10 +161,6 @@ spec:
             path: /var/lib/kubelet/plugins/azurelustre.csi.azure.com
             type: DirectoryOrCreate
           name: socket-dir
-        - hostPath:
-            path: /var/lib/kubelet/
-            type: DirectoryOrCreate
-          name: mountpoint-dir
         - hostPath:
             path: /var/lib/kubelet/plugins_registry/
             type: DirectoryOrCreate
@@ -181,9 +178,9 @@ spec:
             type: Directory
           name: host-dev
         - hostPath:
-            path: /var/log
+            path: /var
             type: Directory
-          name: host-var-log
+          name: host-var
         - hostPath:
             path: /usr
             type: Directory
@@ -192,15 +189,23 @@ spec:
             path: /etc
             type: Directory
           name: host-etc
+        - hostPath:
+            path: /run/udev
+            type: Directory
+          name: host-run-udev
         - hostPath:
             path: /lib
             type: Directory
           name: host-lib
+        - hostPath:
+            path: /lib64
+            type: Directory
+          name: host-lib64
         - hostPath:
             path: /sbin
             type: Directory
           name: host-sbin
         - hostPath:
-            path: /etc/os-release
-            type: File
-          name: host-os-release
+            path: /bin
+            type: Directory
+          name: host-bin
diff --git a/pkg/azurelustreplugin/Dockerfile b/pkg/azurelustreplugin/Dockerfile
index 60c4ec55d..83eb8d9d0 100644
--- a/pkg/azurelustreplugin/Dockerfile
+++ b/pkg/azurelustreplugin/Dockerfile
@@ -16,21 +16,22 @@ FROM ubuntu:18.04
 
 COPY "./_output/azurelustreplugin" "/app/azurelustreplugin"
 COPY "./pkg/azurelustreplugin/entrypoint.sh" "/app/entrypoint.sh"
+COPY "./pkg/azurelustreplugin/fix-lnet.sh" "/app/fix-lnet.sh"
 
 RUN chmod +x "/app/entrypoint.sh"
+RUN chmod +x "/app/fix-lnet.sh"
 
 
-RUN apt-get update
-
+# RUN apt-get update
 # Install all Lustre packages dependencies
-RUN apt-get install -y --no-install-recommends ca-certificates curl distro-info-data gpg gpgconf kmod libasn1-8-heimdal \
-  libassuan0 libcurl4 libexpat1 libgssapi-krb5-2 libgssapi3-heimdal libhcrypto4-heimdal libheimbase1-heimdal \
-  libheimntlm0-heimdal libhx509-5-heimdal libk5crypto3 libkeyutils1 libkmod2 libkrb5-26-heimdal libkrb5-3 libkrb5support0 \
-  libldap-2.4-2 libldap-common libmpdec2 libnghttp2-14 libpsl5 libpython3-stdlib libpython3.6-minimal libpython3.6-stdlib \
-  libreadline7 libroken18-heimdal librtmp1 libsasl2-2 libsasl2-modules-db libsqlite3-0 libssl1.1 libwind0-heimdal \
-  libyaml-0-2 linux-base linux-base-sgx mime-support openssl python3 python3-minimal python3.6 python3.6-minimal readline-common
-
-RUN apt-get clean all
+# RUN apt-get install -y --no-install-recommends ca-certificates curl distro-info-data gpg gpgconf kmod libasn1-8-heimdal \
+#   libassuan0 libcurl4 libexpat1 libgssapi-krb5-2 libgssapi3-heimdal libhcrypto4-heimdal libheimbase1-heimdal \
+#   libheimntlm0-heimdal libhx509-5-heimdal libk5crypto3 libkeyutils1 libkmod2 libkrb5-26-heimdal libkrb5-3 libkrb5support0 \
+#   libldap-2.4-2 libldap-common libmpdec2 libnghttp2-14 libpsl5 libpython3-stdlib libpython3.6-minimal libpython3.6-stdlib \
+#   libreadline7 libroken18-heimdal librtmp1 libsasl2-2 libsasl2-modules-db libsqlite3-0 libssl1.1 libwind0-heimdal \
+#   libyaml-0-2 linux-base linux-base-sgx mime-support openssl python3 python3-minimal python3.6 python3.6-minimal readline-common
+
+# RUN apt-get clean all
 
 WORKDIR "/app"
 
diff --git a/pkg/azurelustreplugin/entrypoint.sh b/pkg/azurelustreplugin/entrypoint.sh
index dc00434bb..4d4205b48 100755
--- a/pkg/azurelustreplugin/entrypoint.sh
+++ b/pkg/azurelustreplugin/entrypoint.sh
@@ -36,22 +36,22 @@ echo "pkgVersion: ${pkgVersion}"
 pkgName="amlfs-lustre-client-${pkgVersion}"
 echo "pkgName: ${pkgName}"
 
-if [[ ! -z $(grep -R 'bionic' /etc/host-os-release) ]]; then
+if [[ ! -z $(grep -R 'bionic' /etc/os-release) ]]; then
   osReleaseCodeName="bionic"
-elif [[ ! -z $(grep -R 'jammy' /etc/host-os-release) ]]; then
-  cat << EOF | tee /etc/apt/sources.list.d/jammy.list
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy main restricted
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates main restricted
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy universe
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates universe
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy multiverse
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates multiverse
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security main restricted
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security universe
-deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security multiverse
-EOF
-
+elif [[ ! -z $(grep -R 'jammy' /etc/os-release) ]]; then
+# cat << EOF | tee /etc/apt/sources.list.d/jammy.list
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy main restricted
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates main restricted
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy universe
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates universe
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy multiverse
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-updates multiverse
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security main restricted
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security universe
+# deb http://azure.archive.ubuntu.com/ubuntu/ jammy-security multiverse
+# EOF
+#
   osReleaseCodeName="jammy"
 else
   echo "Unsupported Linux distro"
@@ -65,10 +65,12 @@ if [[ "${installClientPackages}" == "yes" ]]; then
 
   echo "$(date -u) Installing Lustre client packages for OS=${osReleaseCodeName}, kernel=${kernelVersion} "
 
-  curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null
-  echo "deb [arch=amd64] https://packages.microsoft.com/repos/amlfs-${osReleaseCodeName}/ ${osReleaseCodeName} main" | tee /etc/apt/sources.list.d/amlfs.list
-  apt-get update
-
+  if [ ! -f /etc/apt/sources.list.d/amlfs.list ]; then
+    curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null
+    echo "deb [arch=amd64] https://packages.microsoft.com/repos/amlfs-${osReleaseCodeName}/ ${osReleaseCodeName} main" | tee /etc/apt/sources.list.d/amlfs.list
+    apt-get update
+  fi
+
   echo "$(date -u) Installing Lustre client modules: ${pkgName}=${kernelVersion}"
 
   # grub issue
@@ -78,18 +80,55 @@ if [[ "${installClientPackages}" == "yes" ]]; then
 
   echo "$(date -u) Installed Lustre client packages."
 
-  echo "$(date -u) Enabling Lustre client kernel modules."
+  init_lnet="true"
+
+  if lsmod | grep "^lnet"; then
+    if lnetctl net show --net tcp | grep interfaces; then
+      echo "$(date -u) LNet is loaded skip the load."
+      init_lnet="false"
+    fi
+  fi
+
+  if [[ "${init_lnet}" == "true" ]]; then
+    echo "$(date -u) Loading the LNet."
+    modprobe -v lnet
+    lnetctl lnet configure
+
+    echo "$(date -u) Determining the default network interface."
+    # perl will be installed as dependency by Lustre client
+    echo "$(date -u) Route table is:"
+    ip route list
+    default_interface=$(ip route list | perl -n -e'/default via [0-9.]+ dev ([0-9a-zA-Z]+) / && print $1')
+    echo "$(date -u) Default network interface is ${default_interface}"
+
+    if [[ "${default_interface}" == "" ]]; then
+      echo "$(date -u) Cannot determine the default network interface"
+      exit 1
+    fi
+
+    lnetctl net add --net tcp --if "${default_interface}"
+
+    echo "$(date -u) Adding the udev script."
+    test -e /etc/lustre || mkdir /etc/lustre
+    touch /etc/lustre/.lock
+    test -e /etc/lustre/fix-lnet.sh && rm -f /etc/lustre/fix-lnet.sh
+    sed -i "s/{default_interface}/${default_interface}/g;" ./fix-lnet.sh
+    cp ./fix-lnet.sh /etc/lustre
+
+    test -e /etc/udev/rules.d/73-netadd.rules && rm -f /etc/udev/rules.d/73-netadd.rules
+    test -e /etc/udev/rules.d/74-netremove.rules && rm -f /etc/udev/rules.d/74-netremove.rules
+    echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/73-netadd.rules
+    echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/74-netremove.rules
+
+    echo "$(date -u) Reloading udevadm"
+    udevadm control --reload
+    echo "$(date -u) Done"
+  fi
 
-  modprobe -v ksocklnd
-  modprobe -v lnet
+  echo "$(date -u) Enabling Lustre client kernel modules."
   modprobe -v mgc
   modprobe -v lustre
 
-  # For some reason, this is a false positive before we restart the container
-  # The volume mount succeeds later even this returns a failure
-  # We need to revisit this after moving the script to run on AKS node
-  lctl network up || true
-
   echo "$(date -u) Enabled Lustre client kernel modules."
 fi
 
diff --git a/pkg/azurelustreplugin/fix-lnet.sh b/pkg/azurelustreplugin/fix-lnet.sh
new file mode 100644
index 000000000..c5a7be320
--- /dev/null
+++ b/pkg/azurelustreplugin/fix-lnet.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+(
+  flock -w 60 -e ${FD}  # exclusive lock on /etc/lustre/.lock, waiting at most 60s
+  if /usr/sbin/lnetctl net show --net tcp | grep "status: down"; then  # act only when the tcp net is reported down
+    /usr/sbin/lnetctl net del --net tcp
+    /usr/sbin/lnetctl net add --net tcp --if {default_interface}
+  fi
+) {FD}< /etc/lustre/.lock