Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Moneo Exporter for MI300 #81

Merged
merged 2 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions dockerfile/moneo-exporter-amd.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Image for the Moneo AMD exporter container.
# Base: ROCm development image (Ubuntu 22.04, ROCm 6.1.1).
FROM rocm/dev-ubuntu-22.04:6.1.1

# Metadata
LABEL maintainer="Moneo"

# Environment variables.
# ROCM_VERSION mirrors the tag of the base image above; keep the two in sync.
# DEBIAN_FRONTEND=noninteractive keeps apt-get from prompting during the build.
ENV ROCM_VERSION=6.1.1 \
DEBIAN_FRONTEND=noninteractive

# Work directory setup: the whole build context is copied into /root/Moneo.
# NOTE(review): the later `COPY dockerfile/...` implies the build context is
# the repository root - confirm against the build command.
WORKDIR /root/Moneo
COPY . .

# Install OS packages, re-point /usr/bin/python3 at python3.10, then upgrade
# pip and install the Python packages (prometheus_client, psutil).
RUN apt-get update -y && apt-get install -y --no-install-recommends \
numactl git curl cmake ibverbs-utils sudo systemd wget libgomp1 libcap2-bin python3.10 python3-pip && \
cd /usr/bin && rm python3 && ln -s python3.10 python3 && \
python3 -m pip install --upgrade pip && \
python3 -m pip install prometheus_client psutil

# RDC installation: run the AMD install script from the worker directory
# (the script path is relative to this WORKDIR).
WORKDIR /root/Moneo/src/worker
RUN sudo bash install/amd.sh

# Set EntryPoint: place the entrypoint script in the current WORKDIR
# (/root/Moneo/src/worker), since it refers to shutdown.sh and exporters/ by
# relative path, and make it executable.
COPY dockerfile/moneo-exporter-amd_entrypoint.sh .
RUN chmod +x moneo-exporter-amd_entrypoint.sh

# Final CMD: run the entrypoint with bash from the WORKDIR set above.
CMD ["/bin/bash", "moneo-exporter-amd_entrypoint.sh"]
27 changes: 27 additions & 0 deletions dockerfile/moneo-exporter-amd_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
set -e

# Container entrypoint: starts the AMD RDC daemon and the three Moneo
# exporters as background jobs, then exits as soon as any one of them exits.

# Run shutdown.sh first to stop earlier RDC daemon / exporter instances.
bash shutdown.sh

# Start the AMD RDC daemon in the background, detached from this shell's
# stdin/stdout/stderr.
nohup /opt/rocm/bin/rdcd -u </dev/null >/dev/null 2>&1 &

echo "Starting AMD and Network Exporters"

# Launch one exporter as a background job and announce it.
#   $1   - label printed in the "<label> Exporter Started!" message
#   $2.. - exporter script path followed by any arguments for it
start_exporter() {
    local label=$1
    shift
    python3 "$@" &
    echo "${label} Exporter Started!"
}

start_exporter "AMD" exporters/amd_exporter.py
# The InfiniBand sysfs tree is read from /hostsys/class/infiniband.
# NOTE(review): the flag spelling is kept exactly as-is; it must match the
# argument name defined in net_exporter.py - verify there before renaming.
start_exporter "Network" exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband
start_exporter "Node" exporters/node_exporter.py

# Block until the first background job finishes and propagate its status.
wait -n
exit $?
19 changes: 11 additions & 8 deletions src/worker/exporters/amd_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@
sys.path.extend([
'/opt/rocm/libexec/rocm_smi', # ROCm >=5.2
'/opt/rocm/rocm_smi/bindings', # ROCm <5.2
'/opt/rocm/rdc/python_binding',
'/opt/rdc/python_binding',
])

from rsmiBindings import rocmsmi, rsmi_status_t
from rsmiBindings import *
from RdcReader import RdcReader
from rdc_bootstrap import * # noqa: F403

PRINT_JSON = True
rocmsmi = initRsmiBindings(silent=PRINT_JSON)

RDC_FIELDS = [
# PID
# rdc_field_t.RDC_FI_DEV_COMPUTE_PIDS,
Expand Down Expand Up @@ -51,8 +54,8 @@
# rdc_field_t.RDC_FI_PROF_NVLINK_TX_BYTES,
# rdc_field_t.RDC_FI_PROF_NVLINK_RX_BYTES,
# PCIe
rdc_field_t.RDC_FI_PCIE_TX,
rdc_field_t.RDC_FI_PCIE_RX,
# rdc_field_t.RDC_FI_PCIE_TX,
# rdc_field_t.RDC_FI_PCIE_RX,
]


Expand Down Expand Up @@ -82,18 +85,18 @@ def init_connection(self):
logging.info('Publishing fields: {}'.format(','.join(field_name_list)))

def init_gauges(self):
self.guages = {}
self.gauges = {}
for field_id in self.field_ids:
field_name = self.rdc_util.field_id_string(field_id).lower()
self.guages[field_id] = prometheus_client.Gauge(
self.gauges[field_id] = prometheus_client.Gauge(
'rdc_{}'.format(field_name),
'rdc_{}'.format(field_name),
['gpu_id', 'gpu_uuid'],
)

def handle_field(self, gpu_id, value):
if value.field_id.value in self.guages:
self.guages[value.field_id.value].labels(
if value.field_id.value in self.gauges:
self.gauges[value.field_id.value].labels(
gpu_id,
rdc_config['device_uuid'][gpu_id],
).set(value.value.l_int)
Expand Down
65 changes: 41 additions & 24 deletions src/worker/install/amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,46 @@ source ./$(dirname "${BASH_SOURCE[0]}")/common.sh
apt-get install -y automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev libc++-dev curl libcap-dev

# install grpc
export GRPC_LIB_DIR=/usr/local/lib
git clone -b v1.28.1 https://github.com/grpc/grpc /opt/grpc ||:
cd /opt/grpc
git submodule update --init
mkdir -p cmake/build
cd cmake/build
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON ../..
make -j
make install
echo ${GRPC_LIB_DIR} | tee /etc/ld.so.conf.d/grpc.conf
export GRPC_ROOT=/opt/grpc

# install RDC
export RDC_LIB_DIR=/opt/rocm/rdc/lib
git clone https://github.com/RadeonOpenCompute/rdc /opt/rdc ||:
mkdir -p /opt/rdc/build
cd /opt/rdc/build
cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="/usr/local" ..
make -j
make install
cat > /etc/ld.so.conf.d/x86_64-librdc_client.conf <<EOF
${GRPC_LIB_DIR}
${GRPC_LIB_DIR}64
${RDC_LIB_DIR}
${RDC_LIB_DIR}64
EOF
# Obtain the gRPC sources at a pinned release in $GRPC_ROOT, then build and
# install them under the same prefix.
GRPC_TAG=v1.61.0
if [ -d "$GRPC_ROOT" ] && [ -n "$(ls -A "$GRPC_ROOT")" ]; then
    # Reuse an existing checkout. A bare `git pull` is not usable here:
    # `git clone -b <tag>` leaves HEAD detached, so there is no branch to
    # merge into, and a checkout left by an earlier version of this script
    # would stay on its old release. Fetch the pinned tag explicitly and
    # switch to it instead.
    cd "$GRPC_ROOT"
    git fetch --depth=1 origin "$GRPC_TAG"
    git checkout --detach FETCH_HEAD
    git submodule update --init --recursive --depth=1
else
    git clone -b "$GRPC_TAG" https://github.com/grpc/grpc --depth=1 --shallow-submodules --recurse-submodules "$GRPC_ROOT"
    cd "$GRPC_ROOT"
fi

# Configure in ./build; shared libraries are installed to $GRPC_ROOT/lib
# (CMAKE_INSTALL_PREFIX + CMAKE_INSTALL_LIBDIR below).
cmake -B build \
    -DgRPC_INSTALL=ON \
    -DgRPC_BUILD_TESTS=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DCMAKE_INSTALL_PREFIX="$GRPC_ROOT" \
    -DCMAKE_INSTALL_LIBDIR=lib \
    -DCMAKE_BUILD_TYPE=Release
make -C build -j "$(nproc)"
make -C build install

# ld.so.conf.d entries must name the directory that holds the shared
# libraries, i.e. the lib dir rather than the install prefix itself.
echo "$GRPC_ROOT/lib" | sudo tee /etc/ld.so.conf.d/grpc.conf

# install rdc
# Obtain the RDC sources at the pinned ROCm release in $RDC_ROOT, build them
# against the gRPC installed in $GRPC_ROOT, and install into /opt/rocm.
export RDC_ROOT=/opt/rdc
RDC_REF=rocm-6.1.1
if [ -d "$RDC_ROOT" ] && [ -n "$(ls -A "$RDC_ROOT")" ]; then
    # Reuse an existing checkout, but make sure it is at the pinned ref: a
    # bare `git pull` follows whatever the checkout currently tracks (or
    # fails on a detached HEAD), so it cannot guarantee $RDC_REF is built.
    cd "$RDC_ROOT"
    git fetch --depth=1 origin "$RDC_REF"
    git checkout --detach FETCH_HEAD
else
    git clone --depth 1 --branch "$RDC_REF" https://github.com/RadeonOpenCompute/rdc "$RDC_ROOT"
    cd "$RDC_ROOT"
fi

# default installation location is /opt/rocm, specify with -DROCM_DIR or -DCMAKE_INSTALL_PREFIX
cmake -B build -DGRPC_ROOT="$GRPC_ROOT" -DROCM_DIR="/opt/rocm" -DCMAKE_INSTALL_PREFIX="/opt/rocm"
make -C build -j "$(nproc)"
make -C build install

# Update ldconfig: register the gRPC and RDC library directories (plus their
# "64" variants) in one conf file, then refresh the dynamic linker cache.
export RDC_LIB_DIR=/opt/rocm/lib/rdc
# Derived from GRPC_ROOT so it cannot drift from the prefix gRPC was
# installed to (GRPC_ROOT is /opt/grpc, so the value is /opt/grpc/lib).
export GRPC_LIB_DIR="$GRPC_ROOT/lib"
printf '%s\n' "${GRPC_LIB_DIR}" "${GRPC_LIB_DIR}64" | sudo tee /etc/ld.so.conf.d/x86_64-librdc_client.conf
printf '%s\n' "${RDC_LIB_DIR}" "${RDC_LIB_DIR}64" | sudo tee -a /etc/ld.so.conf.d/x86_64-librdc_client.conf
ldconfig
Loading