From 432d4ccff2df2f33d4ca7da89a5b2c335b89cf18 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 20 Dec 2023 11:20:09 +0000 Subject: [PATCH] update moneo exporter to have node exporter --- .github/workflows/build-image.yml | 5 +- dockerfile/moneo-exporter-nvidia.dockerfile | 47 +++++++++++++------ .../moneo-exporter-nvidia_entrypoint.sh | 14 +++--- docs/Moneo-exporter.md | 28 ++++++++++- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 926c6de..7339f01 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -7,6 +7,9 @@ on: pull_request: branches: - main + release: + types: + - published workflow_dispatch: jobs: @@ -21,7 +24,7 @@ jobs: include: - name: moneo-exporter dockerfile: moneo-exporter-nvidia - tags: azmoneo/moneo-exporter:nvidia + tags: azmoneo/moneo-exporter:${{ github.ref_name }},azmoneo/moneo-exporter:latest steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockerfile/moneo-exporter-nvidia.dockerfile b/dockerfile/moneo-exporter-nvidia.dockerfile index 516fba2..b975a20 100644 --- a/dockerfile/moneo-exporter-nvidia.dockerfile +++ b/dockerfile/moneo-exporter-nvidia.dockerfile @@ -1,29 +1,45 @@ -FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04 +FROM nvcr.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04 LABEL maintainer="Moneo" ARG BRANCH_OR_TAG=main +ARG DCGM_VERSION=3.1.1 ENV PROFILING false +ENV GPU_SAMPLE_RATE 2 + +ENV DEBIAN_FRONTEND=noninteractive # Install dependencies -RUN apt-get update -y \ - && apt-get install -y \ - --no-install-recommends \ - git \ - curl \ - sudo \ - wget \ - libgomp1 \ - python3.8 \ +RUN apt-get update -y \ + && apt-get install -y \ + --no-install-recommends \ + numactl \ + git \ + curl \ + sudo \ + systemd \ + wget \ + libgomp1 \ + libcap2-bin \ + datacenter-gpu-manager \ + python3.10 \ python3-pip -# Link python3 to python3.8 -RUN cd /usr/bin/ \ - && rm python3 \ - && ln -s /usr/bin/python3.8 python3 +# 
Link python3 to python3.10 +RUN cd /usr/bin/ \ + && rm python3 \ + && ln -s /usr/bin/python3.10 python3 RUN python3 -m pip install --upgrade pip +# Install OFED +ENV OFED_VERSION=23.07-0.5.1.2 +RUN cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + # Clone Moneo repository RUN git config --global advice.detachedHead false RUN git clone --branch ${BRANCH_OR_TAG} https://github.com/Azure/Moneo.git @@ -35,4 +51,5 @@ RUN sudo bash install/nvidia.sh # Set EntryPoint COPY dockerfile/moneo-exporter-nvidia_entrypoint.sh . RUN chmod +x moneo-exporter-nvidia_entrypoint.sh -CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} +CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} ${GPU_SAMPLE_RATE} + diff --git a/dockerfile/moneo-exporter-nvidia_entrypoint.sh b/dockerfile/moneo-exporter-nvidia_entrypoint.sh index 9bd0dfa..9e63fe9 100755 --- a/dockerfile/moneo-exporter-nvidia_entrypoint.sh +++ b/dockerfile/moneo-exporter-nvidia_entrypoint.sh @@ -2,20 +2,22 @@ set -e enable_profiling=$1 +gpu_sample_rate=$2 # Start NVIDIA DCGM Daemon -echo "Starting NVIDIA DCGM Daemon" -nv-hostengine +# echo "Starting NVIDIA DCGM Daemon" +# nv-hostengine -# Start NVIDIA and Net Exporter -echo "Starting NVIDIA and Net Exporter" +# Start NVIDIA, Net and Node Exporter +echo "Starting NVIDIA, Net and Node Exporter" if [ $enable_profiling = true ]; then - python3 exporters/nvidia_exporter.py -m & + python3 exporters/nvidia_exporter.py -m -s $gpu_sample_rate & else - python3 exporters/nvidia_exporter.py & + python3 exporters/nvidia_exporter.py -s $gpu_sample_rate & fi python3 exporters/net_exporter.py 
--inifiband_sysfs=/hostsys/class/infiniband & +python3 exporters/node_exporter.py & wait -n exit $? diff --git a/docs/Moneo-exporter.md b/docs/Moneo-exporter.md index 0afc253..090f9ab 100644 --- a/docs/Moneo-exporter.md +++ b/docs/Moneo-exporter.md @@ -56,20 +56,23 @@ docker run --rm --runtime=nvidia --net=host -e PROFILING= + -e GPU_SAMPLE_RATE= --cap-add SYS_ADMIN -v /sys:/hostsys -itd moneo-exporter-nvidia:latest ``` -2. Check the port 8000 and 8001 is up, which is the moneo-exporter listening to: +2. Check that ports 8000, 8001, and 8002 are up; these are the ports moneo-exporter listens on: ```bash root@azureuser:~$ sudo netstat -tulpn | grep LISTEN | grep python3 tcp 0 0 0.0.0.0:8000 0.0.0.0:* LISTEN 94787/python3 tcp 0 0 0.0.0.0:8001 0.0.0.0:* LISTEN 94788/python3 +tcp 0 0 0.0.0.0:8002 0.0.0.0:* LISTEN 94789/python3 ``` 3. Get the prometheus metrics from Moneo-exporter. ```bash curl localhost:8000 curl localhost:8001 +curl localhost:8002 ``` You can see the following prometheus metrics just as below, which means moneo-exporter can work normally. ```bash @@ -99,4 +102,27 @@ ib_port_xmit_data{ib_port="mlx5_ib1:1",ib_sys_guid="********",job_id="None"} 0.0 ib_port_xmit_data{ib_port="mlx5_ib6:1",ib_sys_guid="********",job_id="None"} 0.0 ib_port_xmit_data{ib_port="mlx5_ib4:1",ib_sys_guid="********",job_id="None"} 0.0 ... +root@azureuser:~$ curl localhost:8002 +...
+# HELP python_gc_objects_collected_total Objects collected during gc +# TYPE python_gc_objects_collected_total counter +python_gc_objects_collected_total{generation="0"} 104.0 +python_gc_objects_collected_total{generation="1"} 304.0 +python_gc_objects_collected_total{generation="2"} 0.0 +# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC +# TYPE python_gc_objects_uncollectable_total counter +python_gc_objects_uncollectable_total{generation="0"} 0.0 +python_gc_objects_uncollectable_total{generation="1"} 0.0 +python_gc_objects_uncollectable_total{generation="2"} 0.0 +# HELP node_mem_available node_mem_available +# TYPE node_mem_available gauge +node_mem_available{job_id="None"} 1.841545956e+09 +# HELP node_mem_util node_mem_util +# TYPE node_mem_util gauge +node_mem_util{job_id="None"} 0.9 +# HELP node_xid_error node_xid_error +# TYPE node_xid_error gauge +# HELP node_link_flap node_link_flap +# TYPE node_link_flap gauge +... ``` \ No newline at end of file