Skip to content

Commit

Permalink
update moneo exporter to have node exporter
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu authored and Ubuntu committed Dec 20, 2023
1 parent 89abfbe commit 432d4cc
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 23 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ on:
pull_request:
branches:
- main
release:
types:
- published
workflow_dispatch:

jobs:
Expand All @@ -21,7 +24,7 @@ jobs:
include:
- name: moneo-exporter
dockerfile: moneo-exporter-nvidia
tags: azmoneo/moneo-exporter:nvidia
tags: azmoneo/moneo-exporter:${{ github.ref_name }},azmoneo/moneo-exporter:latest
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
47 changes: 32 additions & 15 deletions dockerfile/moneo-exporter-nvidia.dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,45 @@
FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04
FROM nvcr.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04

LABEL maintainer="Moneo"

ARG BRANCH_OR_TAG=main
ARG DCGM_VERSION=3.1.1
ENV PROFILING false
ENV GPU_SAMPLE_RATE 2

ENV DEBIAN_FRONTEND=noninteractive

# Install dependencies
RUN apt-get update -y \
&& apt-get install -y \
--no-install-recommends \
git \
curl \
sudo \
wget \
libgomp1 \
python3.8 \
RUN apt-get update -y \
&& apt-get install -y \
--no-install-recommends \
numactl \
git \
curl \
sudo \
systemd \
wget \
libgomp1 \
libcap2-bin \
datacenter-gpu-manager \
python3.10 \
python3-pip

# Link python3 to python3.8
RUN cd /usr/bin/ \
&& rm python3 \
&& ln -s /usr/bin/python3.8 python3
# Link python3 to python3.10
RUN cd /usr/bin/ \
&& rm python3 \
&& ln -s /usr/bin/python3.10 python3

RUN python3 -m pip install --upgrade pip

# Install OFED
ENV OFED_VERSION=23.07-0.5.1.2
RUN cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Clone Moneo repository
RUN git config --global advice.detachedHead false
RUN git clone --branch ${BRANCH_OR_TAG} https://github.com/Azure/Moneo.git
Expand All @@ -35,4 +51,5 @@ RUN sudo bash install/nvidia.sh
# Set EntryPoint
COPY dockerfile/moneo-exporter-nvidia_entrypoint.sh .
RUN chmod +x moneo-exporter-nvidia_entrypoint.sh
CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING}
CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} ${GPU_SAMPLE_RATE}

14 changes: 8 additions & 6 deletions dockerfile/moneo-exporter-nvidia_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@
set -e

enable_profiling=$1
gpu_sample_rate=$2
# Start NVIDIA DCGM Daemon
echo "Starting NVIDIA DCGM Daemon"
nv-hostengine
# echo "Starting NVIDIA DCGM Daemon"
# nv-hostengine

# Start NVIDIA and Net Exporter
echo "Starting NVIDIA and Net Exporter"
# Start NVIDIA, Net and Node Exporter
echo "Starting NVIDIA, Net and Node Exporter"

if [ $enable_profiling = true ]; then
python3 exporters/nvidia_exporter.py -m &
python3 exporters/nvidia_exporter.py -m -s $gpu_sample_rate &
else
python3 exporters/nvidia_exporter.py &
python3 exporters/nvidia_exporter.py -s $gpu_sample_rate &
fi

python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband &
python3 exporters/node_exporter.py &

wait -n
exit $?
28 changes: 27 additions & 1 deletion docs/Moneo-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,23 @@ docker run
--rm --runtime=nvidia
--net=host
-e PROFILING=<true/false>
-e GPU_SAMPLE_RATE=<gpu_sample_rate:(1,2,10)>
--cap-add SYS_ADMIN
-v /sys:/hostsys
-itd moneo-exporter-nvidia:latest
```
2. Check the port 8000 and 8001 is up, which is the moneo-exporter listening to:
2. Check that ports 8000, 8001, and 8002 are up; these are the ports the moneo-exporter listens on:
```bash
root@azureuser:~$ sudo netstat -tulpn | grep LISTEN | grep python3
tcp 0 0 0.0.0.0:8000 0.0.0.0:* LISTEN 94787/python3
tcp 0 0 0.0.0.0:8001 0.0.0.0:* LISTEN 94788/python3
tcp 0 0 0.0.0.0:8002 0.0.0.0:* LISTEN 94789/python3
```
3. Get the prometheus metrics from Moneo-exporter.
```bash
curl localhost:8000
curl localhost:8001
curl localhost:8002
```
If you see Prometheus metrics like the ones below, the moneo-exporter is working normally.
```bash
Expand Down Expand Up @@ -99,4 +102,27 @@ ib_port_xmit_data{ib_port="mlx5_ib1:1",ib_sys_guid="********",job_id="None"} 0.0
ib_port_xmit_data{ib_port="mlx5_ib6:1",ib_sys_guid="********",job_id="None"} 0.0
ib_port_xmit_data{ib_port="mlx5_ib4:1",ib_sys_guid="********",job_id="None"} 0.0
...
root@azureuser:~$ curl localhost:8002
...
# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 104.0
python_gc_objects_collected_total{generation="1"} 304.0
python_gc_objects_collected_total{generation="2"} 0.0
# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP node_mem_available node_mem_available
# TYPE node_mem_available gauge
node_mem_available{job_id="None"} 1.841545956e+09
# HELP node_mem_util node_mem_util
# TYPE node_mem_util gauge
node_mem_util{job_id="None"} 0.9
# HELP node_xid_error node_xid_error
# TYPE node_xid_error gauge
# HELP node_link_flap node_link_flap
# TYPE node_link_flap gauge
...
```

0 comments on commit 432d4cc

Please sign in to comment.