From 9bf5ce94fcb85338a431c3caa42848a9256d9189 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 18 Oct 2024 01:25:35 -0400 Subject: [PATCH 01/57] Add metric.py, prometheus configs, and modify pyproject.toml --- docker/prometheus/docker-compose.yml | 26 ++ docker/prometheus/prometheus.yml | 14 + pyproject.toml | 3 +- zeus/metric.py | 429 +++++++++++++++++++++++++++ 4 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 docker/prometheus/docker-compose.yml create mode 100644 docker/prometheus/prometheus.yml create mode 100644 zeus/metric.py diff --git a/docker/prometheus/docker-compose.yml b/docker/prometheus/docker-compose.yml new file mode 100644 index 00000000..9a23ad57 --- /dev/null +++ b/docker/prometheus/docker-compose.yml @@ -0,0 +1,26 @@ +version: '3.7' +services: + prometheus: + image: prom/prometheus + volumes: + - ".prometheus.yml:/etc/prometheus/prometheus.yml" + networks: + - localprom + ports: + - 9090:9090 + node-exporter: + image: prom/node-exporter + networks: + - localprom + ports: + - 9100:9100 + pushgateway: + image: prom/pushgateway + networks: + - localprom + ports: + - 9091:9091 +networks: + localprom: + driver: bridge + diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml new file mode 100644 index 00000000..8aa5a937 --- /dev/null +++ b/docker/prometheus/prometheus.yml @@ -0,0 +1,14 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + - job_name: 'pushgateway' + static_configs: + - targets: ['zeus-pushgateway-1:9091'] + - job_name: 'node' + static_configs: + - targets: ['localhost:9100'] + diff --git a/pyproject.toml b/pyproject.toml index 7a050e89..41c317b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,8 +48,9 @@ migration = ["alembic", "sqlalchemy", "pydantic<2", "python-dotenv"] lint = ["ruff", "black==22.6.0", "pyright", "pandas-stubs", "transformers"] test = ["fastapi[all]", "sqlalchemy", "pydantic<2", "pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1", "anyio==3.7.1", "aiosqlite==0.20.0", "numpy<2"] docs = ["mkdocs-material[imaging]==9.5.19", "mkdocstrings[python]==0.25.0", "mkdocs-gen-files==0.5.0", "mkdocs-literate-nav==0.6.1", "mkdocs-section-index==0.3.9", "mkdocs-redirects==1.2.1", "urllib3<2", "black"] +prometheus = ["prometheus-client"] # greenlet is for supporting apple mac silicon for sqlalchemy(https://docs.sqlalchemy.org/en/20/faq/installation.html) -dev = ["zeus-ml[pfo-server,bso,bso-server,migration,lint,test]", "greenlet"] +dev = ["zeus-ml[pfo-server,bso,bso-server,migration,prometheus,lint,test]", "greenlet"] [tool.setuptools.packages.find] where = ["."] diff --git a/zeus/metric.py b/zeus/metric.py new file mode 100644 index 00000000..8474b076 --- /dev/null +++ b/zeus/metric.py @@ -0,0 +1,429 @@ +from __future__ import annotations +import abc +import time +import warnings +import multiprocessing as mp +from zeus.monitor.power import PowerMonitor +from zeus.monitor.energy import ZeusMonitor +from zeus.device.cpu.common import CPU +from prometheus_client import CollectorRegistry, Histogram, Counter, Gauge, push_to_gateway + +class Metric(abc.ABC): + @abc.abstractmethod + def begin_window(self, name: str): + pass + + @abc.abstractmethod + def end_window(self, name: str): + pass + +class EnergyHistogram(Metric): + """EnergyHistogram class to monitor and record energy consumption metrics. + + This class tracks GPU, CPU, and DRAM energy usage, and records the data as Prometheus Histogram metrics. 
+ The energy data is pushed to a Prometheus Push Gateway for monitoring and analysis. + + Attributes: + energy_monitor: The ZeusMonitor instance that collects energy consumption data. + prometheus_url: The URL of the Prometheus Push Gateway where the metrics will be pushed. + job: The name of the job to associate with the Prometheus metrics. + registry: The Prometheus CollectorRegistry that manages all the Histogram metrics for this class. + bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). + gpu_histograms: A dictionary mapping GPU indices to their respective Histogram metrics. + cpu_histograms: A dictionary mapping CPU indices to their respective Histogram metrics. + dram_histograms: A dictionary mapping DRAM indices to their respective Histogram metrics. + """ + + def __init__( + self, + energy_monitor: ZeusMonitor, + prometheus_url: str, + job: str, + bucket_ranges=None + ) -> None: + """ + Initialize the EnergyHistogram class. + + Sets up the Prometheus Histogram metrics to track energy consumption for GPUs, CPUs, and DRAMs. + The data will be collected and pushed to the Prometheus Push Gateway at regular intervals. + + Args: + energy_monitor: The ZeusMonitor instance used to retrieve energy data for the system. + prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be sent. + job: The name of the Prometheus job associated with the energy metrics. + bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). + If not provided, default bucket ranges will be used for each component. + """ + self.energy_monitor = energy_monitor + self.prometheus_url = prometheus_url + self.job = job + + self.registry = CollectorRegistry() + + default_gpu_buckets = [50.0, 100.0, 200.0, 500.0, 1000.0] + default_cpu_buckets = [10.0, 20.0, 50.0, 100.0, 200.0] + default_dram_buckets = [5.0, 10.0, 20.0, 50.0, 150.0] + + self.bucket_ranges = { + 'gpu': default_gpu_buckets, + 'cpu': default_cpu_buckets, + 'dram': default_dram_buckets, + } + + self.bucket_ranges['gpu'] = ( + bucket_ranges.get('gpu') if bucket_ranges and 'gpu' in bucket_ranges + else default_gpu_buckets + ) + + self.bucket_ranges['cpu'] = ( + bucket_ranges.get('cpu') if bucket_ranges and 'cpu' in bucket_ranges + else default_cpu_buckets + ) + + self.bucket_ranges['dram'] = ( + bucket_ranges.get('dram') if bucket_ranges and 'dram' in bucket_ranges + else default_dram_buckets + ) + # If GPU availble, for each gpu_indices, create a Histogram metric with the label window, and index. + if energy_monitor.gpu_indices: + self.gpu_histograms = {} + for gpu_index in self.energy_monitor.gpu_indices: + self.gpu_histograms[gpu_index] = Histogram( + f'energy_monitor_gpu_{gpu_index}_energy_joules', + f'GPU {gpu_index} energy consumption', + ['window', 'index'], + buckets=self.bucket_ranges.get('gpu', []), + registry=self.registry + ) + else: + self.gpu_histogram = None + # If CPU available, for each cpu_indices, create a Histogram metric with the label window, and index. 
+ if energy_monitor.cpu_indices: + self.cpu_histograms = {} + for cpu_index in self.energy_monitor.cpu_indices: + self.cpu_histograms[cpu_index] = Histogram( + f'energy_monitor_cpu_{cpu_index}_energy_joules', + f'CPU {cpu_index} energy consumption', + ['window', 'index'], + buckets=self.bucket_ranges.get('cpu', []), + registry=self.registry + ) + # Only when CPUs are available, we check if DRAM is available using supportsGetDramEnergyConsumption in CPU class + # If DRAM available, we create histogram for each DRAM indices for each CPU indices + if CPU.supportsGetDramEnergyConsumption: + self.dram_histograms = {} + for dram_index in self.energy_monitor.cpu_indices: + self.dram_histograms[dram_index] = Histogram( + f'energy_monitor_dram_{dram_index}_energy_joules', + f'DRAM {dram_index} energy consumption', + ['window', 'index'], + buckets=self.bucket_ranges.get('dram', []), + registry=self.registry + ) + else: + self.dram_histogram = None + else: + self.cpu_histogram = None + + self.max_gpu_bucket = max(self.bucket_ranges.get('gpu')) + self.max_cpu_bucket = max(self.bucket_ranges.get('cpu')) + self.max_dram_bucket = max(self.bucket_ranges.get('dram')) + + def begin_window(self, name: str) -> None: + """Begin a new energy monitoring window.""" + self.energy_monitor.begin_window(f"__EnergyHistogram_{name}") + + def end_window(self, name: str) -> None: + """ + End the current energy monitoring window and record the energy data. + + Retrieves the energy consumption data (for GPUs, CPUs, and DRAMs) for the monitoring window + and updates the corresponding Histogram metrics. The data is then pushed to the Prometheus Push Gateway. + + Args: + name: The name of the monitoring window (used as a label for the Prometheus Histogram metrics). + + Pushes: + - GPU energy data to the Prometheus Push Gateway via the associated Histogram metric. + - CPU energy data to the Prometheus Push Gateway via the associated Histogram metric. + - DRAM energy data to the Prometheus Push Gateway via the associated Histogram metric. + """ + measurement = self.energy_monitor.end_window(f"__EnergyHistogram_{name}") + + if measurement.gpu_energy: + for gpu_index, gpu_energy in measurement.gpu_energy.items(): + if gpu_index in self.gpu_histograms: + self.gpu_histograms[gpu_index].labels(window=f"__EnergyHistogram_{name}", index=gpu_index).observe(gpu_energy) + if gpu_energy > self.max_gpu_bucket: + warnings.warn(f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}") + + if measurement.cpu_energy: + for cpu_index, cpu_energy in measurement.cpu_energy.items(): + if cpu_index in self.cpu_histograms: + self.cpu_histograms[cpu_index].labels(window=f"__EnergyHistogram_{name}", index=cpu_index).observe(cpu_energy) + if cpu_energy > self.max_cpu_bucket: + warnings.warn(f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}") + + if measurement.dram_energy: + for dram_index, dram_energy in measurement.dram_energy.items(): + if dram_index in self.dram_histograms: + self.dram_histograms[dram_index].labels(window=f"__EnergyHistogram_{name}", index=dram_index).observe(dram_energy) + if dram_energy > self.max_dram_bucket: + warnings.warn(f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}") + + push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) + +class EnergyCumulativeCounter(Metric): + """ + EnergyCumulativeCounter class to monitor and record cumulative energy consumption. 
+ + This class tracks GPU, CPU, and DRAM energy usage over time, and records the data as Prometheus Counter metrics. + The energy consumption metrics are periodically updated and pushed to a Prometheus Push Gateway for monitoring and analysis. + + The cumulative nature of the Counter ensures that energy values are always incremented over time, never reset, + which is ideal for tracking continuously increasing values like energy usage. + + Attributes: + energy_monitor: The ZeusMonitor instance that collects energy consumption data for the system. + update_period: The interval (in seconds) between consecutive energy data updates. + prometheus_url: The URL of the Prometheus Push Gateway where the Counter metrics will be pushed. + job: The name of the job associated with the energy monitoring in Prometheus. + queue: A multiprocessing queue used to send signals to start/stop energy monitoring. + proc: A multiprocessing process that runs the energy monitoring loop. + """ + + def __init__( + self, + energy_monitor: ZeusMonitor, + update_period: int, + prometheus_url: str, + job: str + )-> None: + """ + Initialize the EnergyCumulativeCounter. + + Args: + energy_monitor: The ZeusMonitor instance used to monitor energy consumption. + update_period: The time interval (in seconds) at which energy measurements are updated. + prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. + job: The name of the job to be associated with the Prometheus metrics. + """ + self.energy_monitor = energy_monitor + self.update_period = update_period + self.prometheus_url = prometheus_url + self.job = job + + def begin_window(self, name: str) -> None: + """ + Begin the energy monitoring window. + + Starts a new multiprocessing process that monitors energy usage periodically + and pushes the results to the Prometheus Push Gateway. + + Args: + name: A unique name for the monitoring window (used as a label in Prometheus metrics). + """ + self.queue = mp.Queue() + self.proc = mp.Process( + target=energy_monitoring_loop, + args=(name, self.queue, self.energy_monitor, self.update_period, self.prometheus_url, self.job) + ) + self.proc.start() + + def end_window(self, name: str)-> None: + """End the energy monitoring window.""" + self.queue.put("stop") + self.proc.join() + +def energy_monitoring_loop( + name: str, + pipe: mp.Queue, + energy_monitor: ZeusMonitor, + update_period: int, + prometheus_url: str, + job: str +) -> None: + """ + The polling function that runs in a separate process to monitor energy usage. + + It periodically collects energy consumption metrics from the energy monitor and + pushes the results to the Prometheus Push Gateway. + + Args: + name: The unique name of the monitoring window. + pipe: A multiprocessing queue used to receive signals (e.g., to stop the process). + energy_monitor: The ZeusMonitor instance used to retrieve energy data. + update_period: The interval (in seconds) between energy data polls. + prometheus_url: The URL of the Prometheus Push Gateway. + job: The job name used in Prometheus for Counter metrics. 
+ """ + registry = CollectorRegistry() + + if energy_monitor.gpu_indices: + gpu_counters = {} + for gpu_index in energy_monitor.gpu_indices: + gpu_counters[gpu_index] = Counter( + f'energy_monitor_gpu_{gpu_index}_energy_joules', + f'GPU {gpu_index} energy consumption', + ['window', 'index'], + registry=registry + ) + + if energy_monitor.cpu_indices: + cpu_counters = {} + for cpu_index in energy_monitor.cpu_indices: + cpu_counters[cpu_index] = Counter( + f'energy_monitor_cpu_{cpu_index}_energy_joules', + f'CPU {cpu_index} energy consumption', + ['window', 'index'], + registry=registry + ) + + if CPU.supportsGetDramEnergyConsumption: + dram_counters = {} + for dram_index in energy_monitor.cpu_indices: + dram_counters[dram_index] = Counter( + f'energy_monitor_dram_{dram_index}_energy_joules', + f'DRAM {dram_index} energy consumption', + ['window', 'index'], + registry=registry + ) + + while True: + if not pipe.empty(): + signal = pipe.get() + if signal == "stop": + break + + energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}") + time.sleep(update_period) + measurement = energy_monitor.end_window(f"__EnergyCumulativeCounter_{name}") + + if measurement.gpu_energy: + for gpu_index, energy in measurement.gpu_energy.items(): + if gpu_index in gpu_counters: + gpu_counters[gpu_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=gpu_index).inc(energy) + + if measurement.cpu_energy: + for cpu_index, energy in measurement.cpu_energy.items(): + if cpu_index in cpu_counters: + cpu_counters[cpu_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=cpu_index).inc(energy) + + if measurement.dram_energy: + for dram_index, energy in measurement.dram_energy.items(): + if dram_index in dram_counters: + dram_counters[dram_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=dram_index).inc(energy) + + push_to_gateway(prometheus_url, job=job, registry=registry) + +class PowerGauge(Metric): + """ + PowerGauge class to monitor and record power consumption. + + This class tracks GPU power usage in real time and records it as **Prometheus Gauge** metrics. + The Gauge metric type is suitable for tracking values that can go up and down over time, like power consumption. + + Power usage data is collected at regular intervals and pushed to a Prometheus Push Gateway for monitoring. + + Attributes: + power_monitor: The PowerMonitor instance that retrieves power consumption data for the GPUs. + update_period: The time interval (in seconds) between consecutive power measurements. + prometheus_url: The URL of the Prometheus Push Gateway where the Gauge metrics will be pushed. + job: The name of the job associated with the power metrics in Prometheus. + queue: A multiprocessing queue used to send signals to start/stop power monitoring. + proc: A multiprocessing process that runs the power monitoring loop. + """ + + def __init__( + self, + power_monitor: PowerMonitor, + update_period: int, + prometheus_url: str, + job: str + ) -> None: + """ + Initialize the PowerGauge metric. + + Args: + power_monitor: The PowerMonitor instance used to monitor power consumption. + update_period: The interval (in seconds) between power measurement updates. + prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. + job: The name of the job to be associated with the Prometheus metrics. 
+ """ + self.power_monitor = power_monitor + self.update_period = update_period + self.prometheus_url = prometheus_url + self.job = job + + def begin_window(self, name: str) -> None: + """ + Begin the power monitoring window. + + Starts a new multiprocessing process that runs the power monitoring loop. + The process collects real-time power consumption data and updates the corresponding + Gauge metrics in Prometheus. + + Args: + name: A unique name for the monitoring window, used as a label for the Prometheus Gauge metrics. + """ + self.queue = mp.Queue() + self.proc = mp.Process( + target=power_monitoring_loop, + args=(name, self.queue, self.power_monitor, self.update_period, self.prometheus_url, self.job) + ) + self.proc.start() + + def end_window(self, name: str) -> None: + """End the power monitoring window.""" + self.queue.put("stop") + self.proc.join() + +# For each GPU, it creates a Prometheus Gauge to record power consumption over time. +# Each gauge is associated with a specific GPU index, and Prometheus uses these to track power consumption. +def power_monitoring_loop( + name: str, + pipe: mp.Queue, + power_monitor: PowerMonitor, + update_period: int, + prometheus_url: str, + job: str +) -> None: + """ + The polling function for power monitoring that runs in a separate process. + + It periodically collects power consumption data for each GPU and pushes the results + to the Prometheus Push Gateway. + + Args: + name: The unique name for the monitoring window. + pipe: A multiprocessing queue to receive control signals (e.g., "stop"). + power_monitor: The PowerMonitor instance used to retrieve power usage data. + update_period: The interval (in seconds) between power data polls. + prometheus_url: The URL of the Prometheus Push Gateway where metrics are pushed. + job: The job name used in Prometheus for Gauge metrics. 
+ """ + gpu_gauges = {} + registry = CollectorRegistry() + + for gpu_index in power_monitor.gpu_indices: + gpu_gauges[gpu_index] = Gauge( + f'power_monitor_gpu_{gpu_index}_power_watts', + f'Records power consumption for GPU {gpu_index} over time', + ['gpu_index'], # Label to indicate GPU index + registry=registry + ) + + while True: + if not pipe.empty(): + signal = pipe.get() + if signal == "stop": + break + + power_measurement = power_monitor.get_power() + if power_measurement is not None: + for gpu_index, power_value in power_measurement: + gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(power_value) + + push_to_gateway(prometheus_url, job=job, registry=registry) + time.sleep(update_period) + From dd881e832af81dc9bc6277a24cf2a80999c5046f Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 18 Oct 2024 01:33:36 -0400 Subject: [PATCH 02/57] Reformat metric.py with black --- zeus/metric.py | 264 +++++++++++++++++++++++++++++-------------------- 1 file changed, 156 insertions(+), 108 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 8474b076..8c110eb6 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -6,9 +6,16 @@ from zeus.monitor.power import PowerMonitor from zeus.monitor.energy import ZeusMonitor from zeus.device.cpu.common import CPU -from prometheus_client import CollectorRegistry, Histogram, Counter, Gauge, push_to_gateway +from prometheus_client import ( + CollectorRegistry, + Histogram, + Counter, + Gauge, + push_to_gateway, +) -class Metric(abc.ABC): + +class Metric(abc.ABC): @abc.abstractmethod def begin_window(self, name: str): pass @@ -17,6 +24,7 @@ def begin_window(self, name: str): def end_window(self, name: str): pass + class EnergyHistogram(Metric): """EnergyHistogram class to monitor and record energy consumption metrics. @@ -35,11 +43,11 @@ class EnergyHistogram(Metric): """ def __init__( - self, - energy_monitor: ZeusMonitor, - prometheus_url: str, - job: str, - bucket_ranges=None + self, + energy_monitor: ZeusMonitor, + prometheus_url: str, + job: str, + bucket_ranges=None, ) -> None: """ Initialize the EnergyHistogram class. @@ -51,7 +59,7 @@ def __init__( energy_monitor: The ZeusMonitor instance used to retrieve energy data for the system. prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be sent. job: The name of the Prometheus job associated with the energy metrics. - bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). + bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). If not provided, default bucket ranges will be used for each component. 
""" self.energy_monitor = energy_monitor @@ -65,23 +73,26 @@ def __init__( default_dram_buckets = [5.0, 10.0, 20.0, 50.0, 150.0] self.bucket_ranges = { - 'gpu': default_gpu_buckets, - 'cpu': default_cpu_buckets, - 'dram': default_dram_buckets, + "gpu": default_gpu_buckets, + "cpu": default_cpu_buckets, + "dram": default_dram_buckets, } - self.bucket_ranges['gpu'] = ( - bucket_ranges.get('gpu') if bucket_ranges and 'gpu' in bucket_ranges + self.bucket_ranges["gpu"] = ( + bucket_ranges.get("gpu") + if bucket_ranges and "gpu" in bucket_ranges else default_gpu_buckets ) - self.bucket_ranges['cpu'] = ( - bucket_ranges.get('cpu') if bucket_ranges and 'cpu' in bucket_ranges + self.bucket_ranges["cpu"] = ( + bucket_ranges.get("cpu") + if bucket_ranges and "cpu" in bucket_ranges else default_cpu_buckets ) - self.bucket_ranges['dram'] = ( - bucket_ranges.get('dram') if bucket_ranges and 'dram' in bucket_ranges + self.bucket_ranges["dram"] = ( + bucket_ranges.get("dram") + if bucket_ranges and "dram" in bucket_ranges else default_dram_buckets ) # If GPU availble, for each gpu_indices, create a Histogram metric with the label window, and index. @@ -89,11 +100,11 @@ def __init__( self.gpu_histograms = {} for gpu_index in self.energy_monitor.gpu_indices: self.gpu_histograms[gpu_index] = Histogram( - f'energy_monitor_gpu_{gpu_index}_energy_joules', - f'GPU {gpu_index} energy consumption', - ['window', 'index'], - buckets=self.bucket_ranges.get('gpu', []), - registry=self.registry + f"energy_monitor_gpu_{gpu_index}_energy_joules", + f"GPU {gpu_index} energy consumption", + ["window", "index"], + buckets=self.bucket_ranges.get("gpu", []), + registry=self.registry, ) else: self.gpu_histogram = None @@ -102,11 +113,11 @@ def __init__( self.cpu_histograms = {} for cpu_index in self.energy_monitor.cpu_indices: self.cpu_histograms[cpu_index] = Histogram( - f'energy_monitor_cpu_{cpu_index}_energy_joules', - f'CPU {cpu_index} energy consumption', - ['window', 'index'], - buckets=self.bucket_ranges.get('cpu', []), - registry=self.registry + f"energy_monitor_cpu_{cpu_index}_energy_joules", + f"CPU {cpu_index} energy consumption", + ["window", "index"], + buckets=self.bucket_ranges.get("cpu", []), + registry=self.registry, ) # Only when CPUs are available, we check if DRAM is available using supportsGetDramEnergyConsumption in CPU class # If DRAM available, we create histogram for each DRAM indices for each CPU indices @@ -114,25 +125,25 @@ def __init__( self.dram_histograms = {} for dram_index in self.energy_monitor.cpu_indices: self.dram_histograms[dram_index] = Histogram( - f'energy_monitor_dram_{dram_index}_energy_joules', - f'DRAM {dram_index} energy consumption', - ['window', 'index'], - buckets=self.bucket_ranges.get('dram', []), - registry=self.registry + f"energy_monitor_dram_{dram_index}_energy_joules", + f"DRAM {dram_index} energy consumption", + ["window", "index"], + buckets=self.bucket_ranges.get("dram", []), + registry=self.registry, ) else: self.dram_histogram = None else: self.cpu_histogram = None - self.max_gpu_bucket = max(self.bucket_ranges.get('gpu')) - self.max_cpu_bucket = max(self.bucket_ranges.get('cpu')) - self.max_dram_bucket = max(self.bucket_ranges.get('dram')) - + self.max_gpu_bucket = max(self.bucket_ranges.get("gpu")) + self.max_cpu_bucket = max(self.bucket_ranges.get("cpu")) + self.max_dram_bucket = max(self.bucket_ranges.get("dram")) + def begin_window(self, name: str) -> None: """Begin a new energy monitoring window.""" self.energy_monitor.begin_window(f"__EnergyHistogram_{name}") 
- + def end_window(self, name: str) -> None: """ End the current energy monitoring window and record the energy data. @@ -153,26 +164,39 @@ def end_window(self, name: str) -> None: if measurement.gpu_energy: for gpu_index, gpu_energy in measurement.gpu_energy.items(): if gpu_index in self.gpu_histograms: - self.gpu_histograms[gpu_index].labels(window=f"__EnergyHistogram_{name}", index=gpu_index).observe(gpu_energy) + self.gpu_histograms[gpu_index].labels( + window=f"__EnergyHistogram_{name}", index=gpu_index + ).observe(gpu_energy) if gpu_energy > self.max_gpu_bucket: - warnings.warn(f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}") - + warnings.warn( + f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}" + ) + if measurement.cpu_energy: for cpu_index, cpu_energy in measurement.cpu_energy.items(): if cpu_index in self.cpu_histograms: - self.cpu_histograms[cpu_index].labels(window=f"__EnergyHistogram_{name}", index=cpu_index).observe(cpu_energy) + self.cpu_histograms[cpu_index].labels( + window=f"__EnergyHistogram_{name}", index=cpu_index + ).observe(cpu_energy) if cpu_energy > self.max_cpu_bucket: - warnings.warn(f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}") + warnings.warn( + f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}" + ) if measurement.dram_energy: for dram_index, dram_energy in measurement.dram_energy.items(): if dram_index in self.dram_histograms: - self.dram_histograms[dram_index].labels(window=f"__EnergyHistogram_{name}", index=dram_index).observe(dram_energy) + self.dram_histograms[dram_index].labels( + window=f"__EnergyHistogram_{name}", index=dram_index + ).observe(dram_energy) if dram_energy > self.max_dram_bucket: - warnings.warn(f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}") - + warnings.warn( + f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}" + ) + push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) + class EnergyCumulativeCounter(Metric): """ EnergyCumulativeCounter class to monitor and record cumulative energy consumption. @@ -180,7 +204,7 @@ class EnergyCumulativeCounter(Metric): This class tracks GPU, CPU, and DRAM energy usage over time, and records the data as Prometheus Counter metrics. The energy consumption metrics are periodically updated and pushed to a Prometheus Push Gateway for monitoring and analysis. - The cumulative nature of the Counter ensures that energy values are always incremented over time, never reset, + The cumulative nature of the Counter ensures that energy values are always incremented over time, never reset, which is ideal for tracking continuously increasing values like energy usage. Attributes: @@ -193,12 +217,12 @@ class EnergyCumulativeCounter(Metric): """ def __init__( - self, - energy_monitor: ZeusMonitor, - update_period: int, - prometheus_url: str, - job: str - )-> None: + self, + energy_monitor: ZeusMonitor, + update_period: int, + prometheus_url: str, + job: str, + ) -> None: """ Initialize the EnergyCumulativeCounter. @@ -208,10 +232,10 @@ def __init__( prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. job: The name of the job to be associated with the Prometheus metrics. 
""" - self.energy_monitor = energy_monitor - self.update_period = update_period - self.prometheus_url = prometheus_url - self.job = job + self.energy_monitor = energy_monitor + self.update_period = update_period + self.prometheus_url = prometheus_url + self.job = job def begin_window(self, name: str) -> None: """ @@ -226,22 +250,30 @@ def begin_window(self, name: str) -> None: self.queue = mp.Queue() self.proc = mp.Process( target=energy_monitoring_loop, - args=(name, self.queue, self.energy_monitor, self.update_period, self.prometheus_url, self.job) + args=( + name, + self.queue, + self.energy_monitor, + self.update_period, + self.prometheus_url, + self.job, + ), ) self.proc.start() - def end_window(self, name: str)-> None: + def end_window(self, name: str) -> None: """End the energy monitoring window.""" self.queue.put("stop") self.proc.join() + def energy_monitoring_loop( - name: str, - pipe: mp.Queue, - energy_monitor: ZeusMonitor, - update_period: int, - prometheus_url: str, - job: str + name: str, + pipe: mp.Queue, + energy_monitor: ZeusMonitor, + update_period: int, + prometheus_url: str, + job: str, ) -> None: """ The polling function that runs in a separate process to monitor energy usage. @@ -263,64 +295,71 @@ def energy_monitoring_loop( gpu_counters = {} for gpu_index in energy_monitor.gpu_indices: gpu_counters[gpu_index] = Counter( - f'energy_monitor_gpu_{gpu_index}_energy_joules', - f'GPU {gpu_index} energy consumption', - ['window', 'index'], - registry=registry + f"energy_monitor_gpu_{gpu_index}_energy_joules", + f"GPU {gpu_index} energy consumption", + ["window", "index"], + registry=registry, ) if energy_monitor.cpu_indices: cpu_counters = {} for cpu_index in energy_monitor.cpu_indices: cpu_counters[cpu_index] = Counter( - f'energy_monitor_cpu_{cpu_index}_energy_joules', - f'CPU {cpu_index} energy consumption', - ['window', 'index'], - registry=registry + f"energy_monitor_cpu_{cpu_index}_energy_joules", + f"CPU {cpu_index} energy consumption", + ["window", "index"], + registry=registry, ) if CPU.supportsGetDramEnergyConsumption: dram_counters = {} for dram_index in energy_monitor.cpu_indices: dram_counters[dram_index] = Counter( - f'energy_monitor_dram_{dram_index}_energy_joules', - f'DRAM {dram_index} energy consumption', - ['window', 'index'], - registry=registry + f"energy_monitor_dram_{dram_index}_energy_joules", + f"DRAM {dram_index} energy consumption", + ["window", "index"], + registry=registry, ) while True: if not pipe.empty(): signal = pipe.get() if signal == "stop": - break + break energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}") - time.sleep(update_period) + time.sleep(update_period) measurement = energy_monitor.end_window(f"__EnergyCumulativeCounter_{name}") if measurement.gpu_energy: for gpu_index, energy in measurement.gpu_energy.items(): if gpu_index in gpu_counters: - gpu_counters[gpu_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=gpu_index).inc(energy) + gpu_counters[gpu_index].labels( + window=f"__EnergyCumulativeCounter_{name}", index=gpu_index + ).inc(energy) if measurement.cpu_energy: for cpu_index, energy in measurement.cpu_energy.items(): if cpu_index in cpu_counters: - cpu_counters[cpu_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=cpu_index).inc(energy) + cpu_counters[cpu_index].labels( + window=f"__EnergyCumulativeCounter_{name}", index=cpu_index + ).inc(energy) if measurement.dram_energy: for dram_index, energy in measurement.dram_energy.items(): if dram_index in dram_counters: - 
dram_counters[dram_index].labels(window=f"__EnergyCumulativeCounter_{name}", index=dram_index).inc(energy) + dram_counters[dram_index].labels( + window=f"__EnergyCumulativeCounter_{name}", index=dram_index + ).inc(energy) push_to_gateway(prometheus_url, job=job, registry=registry) + class PowerGauge(Metric): """ PowerGauge class to monitor and record power consumption. - This class tracks GPU power usage in real time and records it as **Prometheus Gauge** metrics. + This class tracks GPU power usage in real time and records it as **Prometheus Gauge** metrics. The Gauge metric type is suitable for tracking values that can go up and down over time, like power consumption. Power usage data is collected at regular intervals and pushed to a Prometheus Push Gateway for monitoring. @@ -335,11 +374,11 @@ class PowerGauge(Metric): """ def __init__( - self, - power_monitor: PowerMonitor, - update_period: int, - prometheus_url: str, - job: str + self, + power_monitor: PowerMonitor, + update_period: int, + prometheus_url: str, + job: str, ) -> None: """ Initialize the PowerGauge metric. @@ -351,16 +390,16 @@ def __init__( job: The name of the job to be associated with the Prometheus metrics. """ self.power_monitor = power_monitor - self.update_period = update_period + self.update_period = update_period self.prometheus_url = prometheus_url - self.job = job - + self.job = job + def begin_window(self, name: str) -> None: """ Begin the power monitoring window. - Starts a new multiprocessing process that runs the power monitoring loop. - The process collects real-time power consumption data and updates the corresponding + Starts a new multiprocessing process that runs the power monitoring loop. + The process collects real-time power consumption data and updates the corresponding Gauge metrics in Prometheus. Args: @@ -369,24 +408,32 @@ def begin_window(self, name: str) -> None: self.queue = mp.Queue() self.proc = mp.Process( target=power_monitoring_loop, - args=(name, self.queue, self.power_monitor, self.update_period, self.prometheus_url, self.job) + args=( + name, + self.queue, + self.power_monitor, + self.update_period, + self.prometheus_url, + self.job, + ), ) - self.proc.start() + self.proc.start() def end_window(self, name: str) -> None: """End the power monitoring window.""" self.queue.put("stop") self.proc.join() -# For each GPU, it creates a Prometheus Gauge to record power consumption over time. + +# For each GPU, it creates a Prometheus Gauge to record power consumption over time. # Each gauge is associated with a specific GPU index, and Prometheus uses these to track power consumption. def power_monitoring_loop( - name: str, - pipe: mp.Queue, - power_monitor: PowerMonitor, - update_period: int, - prometheus_url: str, - job: str + name: str, + pipe: mp.Queue, + power_monitor: PowerMonitor, + update_period: int, + prometheus_url: str, + job: str, ) -> None: """ The polling function for power monitoring that runs in a separate process. 
@@ -407,23 +454,24 @@ def power_monitoring_loop( for gpu_index in power_monitor.gpu_indices: gpu_gauges[gpu_index] = Gauge( - f'power_monitor_gpu_{gpu_index}_power_watts', - f'Records power consumption for GPU {gpu_index} over time', - ['gpu_index'], # Label to indicate GPU index - registry=registry + f"power_monitor_gpu_{gpu_index}_power_watts", + f"Records power consumption for GPU {gpu_index} over time", + ["gpu_index"], # Label to indicate GPU index + registry=registry, ) while True: if not pipe.empty(): signal = pipe.get() if signal == "stop": - break + break power_measurement = power_monitor.get_power() if power_measurement is not None: for gpu_index, power_value in power_measurement: - gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(power_value) + gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( + power_value + ) push_to_gateway(prometheus_url, job=job, registry=registry) time.sleep(update_period) - From d21cfd1db61241edbf3e173b4872cb83c3e92fe3 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 9 Nov 2024 13:58:09 -0500 Subject: [PATCH 03/57] Add metric monitoring section to documentation --- docs/measure/index.md | 82 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/docs/measure/index.md b/docs/measure/index.md index 92512440..e7cd78fd 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -114,6 +114,88 @@ if __name__ == "__main__": avg_energy = sum(map(lambda m: m.total_energy, steps)) / len(steps) print(f"One step takes {avg_time} s and {avg_energy} J for the CPU.") ``` +## Metric Monitoring + +Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. + +[`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is useful for observing how frequently energy usage reaches certain levels. + +You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. + +```python hl_lines="2 5-7" +from zeus.monitor import ZeusMonitor +from zeus.metric import EnergyHistogram + +if __name__ == "__main__": + # Initialize EnergyHistogram with custom bucket ranges + histogram_metric = EnergyHistogram( + energy_monitor=ZeusMonitor, + prometheus_url='http://localhost:9091', + job='energy_histogram_job', + bucket_ranges={ + "gpu": [10.0, 25.0, 50.0, 100.0], + "cpu": [5.0, 10.0, 25.0, 50.0], + "dram": [1.0, 2.5, 5.0, 10.0] + } + ) + + histogram_metric.begin_window("histogram_test") + # Perform tasks + histogram_metric.end_window("histogram_test") +``` +You can use the `begin_window` and `end_window` methods to define a measurement window, similar to other ZeusMonitor operations. Energy consumption data will be recorded for the entire duration of the window. + +!!! Tip + If no custom `bucket ranges` are provided, Zeus uses default ranges for GPU, CPU, and DRAM. + + If you later decide to specify custom bucket ranges only for the GPU while leaving CPU and DRAM to use defaults, you could write: + bucket_ranges={ + "gpu": [10.0, 25.0, 50.0, 100.0] + } + +[`EnergyCumulativeCounter`][zeus.metric.EnergyCumulativeCounter] monitors cumulative energy consumption. It tracks energy usage over time, without resetting the values, and is updated periodically. 
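+Unlike `EnergyHistogram`, which records energy once when a measurement window ends, `EnergyCumulativeCounter` spawns a background process that polls energy every `update_period` seconds, increments the Prometheus Counters, and pushes them to the Push Gateway, as shown in the example below.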
+ +```python hl_lines="2 5-7" +from zeus.monitor import ZeusMonitor +from zeus.metric import EnergyCumulativeCounter + +if __name__ == "__main__": + + cumulative_counter_metric = EnergyCumulativeCounter( + energy_monitor=ZeusMonitor, + update_period=2, # Updates energy data every 2 seconds + prometheus_url='http://localhost:9091', + job='energy_counter_job' + ) + + cumulative_counter_metric.begin_window("counter_test_window") + # Let the counter run + time.sleep(10) # Keep measuring for 10 seconds + cumulative_counter_metric.end_window("counter_test_window") +``` +The `update_period` parameter defines how often the energy measurements are updated and pushed to Prometheus. + +[`PowerGauge`][zeus.metric.PowerGauge] tracks real-time power consumption using Prometheus Gauges which monitors fluctuating values such as power usage. + +```python hl_lines="2 5-7" +from zeus.monitor.power import PowerMonitor +from zeus.metric import PowerGauge + +if __name__ == "__main__": + + power_gauge_metric = PowerGauge( + power_monitor=PowerMonitor, + update_period=2, # Updates power consumption every 2 seconds + prometheus_url='http://localhost:9091', + job='power_gauge_job' + ) + + power_gauge_metric.begin_window("gauge_test_window") + # Monitor power consumption for 10 seconds + time.sleep(10) + power_gauge_metric.end_window("gauge_test_window") +``` +The `update_period` parameter defines how often the power datas are updated and pushed to Prometheus. ## CLI power and energy monitor From 4681796ed517e01678fa7261a51f8119343e5dc8 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 9 Nov 2024 14:04:04 -0500 Subject: [PATCH 04/57] Add unit tests for EnergyHistogram, EnergyCumulativeCounter, and PowerGauge --- tests/test_metric.py | 204 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 tests/test_metric.py diff --git a/tests/test_metric.py b/tests/test_metric.py new file mode 100644 index 00000000..9d11cd46 --- /dev/null +++ b/tests/test_metric.py @@ -0,0 +1,204 @@ +from __future__ import annotations +from unittest.mock import patch + +import pytest +from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge + +class MockMeasurement: + """ + Mock object representing energy measurements for testing. + Contains energy values for GPU, CPU, and DRAM. + """ + def __init__(self, gpu_energy=None, cpu_energy=None, dram_energy=None): + self.gpu_energy = gpu_energy or {} + self.cpu_energy = cpu_energy or {} + self.dram_energy = dram_energy or {} + +class MockZeusMonitor: + """ + Mock object to simulate an ZeusMonitor, which provides energy measurements + for GPU, CPU, and DRAM for use in unit testing. The measurement values are fixed + to aid in validating the functionality of histogram metrics. + """ + def __init__(self): + self.gpu_indices = [0, 1, 2, 3] # 4 GPUs in the mock + self.cpu_indices = [0, 1] # 2 CPUs in the mock + self.dram_indices = [0, 1] + + def begin_window(self, name): + """ + Simulates the start of a measurement window. + """ + + def end_window(self, name: str) -> MockMeasurement: + """ + Simulates the end of a measurement window, returning fixed energy measurements + for testing purposes. 
+ """ + return MockMeasurement( + gpu_energy={index: 30.0 for index in self.gpu_indices}, # Fixed value for all GPUs + cpu_energy={index: 15.0 for index in self.cpu_indices}, # Fixed value for all CPUs + dram_energy={index: 7.5 for index in self.dram_indices} # Fixed value for all DRAMs + ) + +class MockPowerMonitor: + """ + Mock object to simulate a PowerMonitor, which provides power measurements for GPUs. + The power values are randomized for testing purposes. + """ + def __init__(self): + self.gpu_indices = [0, 1, 2, 3] # 4 GPUs + + def begin_window(self, name): + """ + Simulates the start of a power measurement window. + """ + def get_power(self): + """ + Returns simulated power measurements for each GPU. + """ + return [(index, 300.0) for index in self.gpu_indices] + + def end_window(self, name): + """ + Simulates the start of a power measurement window. + """ + print(f"MockPowerMonitor: end window {name}") + +@pytest.fixture +def mock_energy_monitor(): + """ + Returns a mocked energy monitor instance for testing. + """ + return MockZeusMonitor() + +@pytest.fixture +def mock_power_monitor(): + """ + Returns a mocked power monitor instance for testing. + """ + return MockPowerMonitor() + +# Test Cases + +def test_energy_histogram(mock_energy_monitor): + """ + Unit test for the EnergyHistogram class. This test validates that the `observe()` + method on the Prometheus Histogram is correctly called with the fixed GPU, CPU, and + DRAM energy values (30.0, 15.0, and 7.5, respectively). + """ + # Define custom bucket ranges for GPU, CPU, and DRAM energy histograms + custom_bucket_ranges = { + "gpu": [10.0, 25.0, 50.0], + "cpu": [5.0, 10.0, 25.0], + "dram": [1.0, 2.5, 10.0] + } + # Instantiate the EnergyHistogram class with the mock energy monitor and custom bucket ranges + histogram_metric = EnergyHistogram( + energy_monitor=mock_energy_monitor, + prometheus_url='http://localhost:9091', + job='test_energy_histogram', + bucket_ranges=custom_bucket_ranges + ) + # Test GPU energy observations + for gpu_index in histogram_metric.gpu_histograms.keys(): + with patch.object(histogram_metric.gpu_histograms[gpu_index], 'observe') as mock_observe_gpu: + + histogram_metric.begin_window("test_window") + histogram_metric.end_window("test_window") + + for call_args in mock_observe_gpu.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 30.0 + # Test CPU energy observations + for cpu_index in histogram_metric.cpu_histograms.keys(): + with patch.object(histogram_metric.cpu_histograms[cpu_index], 'observe') as mock_observe_cpu: + # Check that `observe()` was called with the correct CPU energy value + histogram_metric.begin_window("test_window") + histogram_metric.end_window("test_window") + + for call_args in mock_observe_cpu.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 15.0 + # Test DRAM energy observations + for dram_index in histogram_metric.dram_histograms.keys(): + with patch.object(histogram_metric.dram_histograms[dram_index], 'observe') as mock_observe_dram: + # Check that `observe()` was called with the correct DRAM energy value + histogram_metric.begin_window("test_window") + histogram_metric.end_window("test_window") + + for call_args in mock_observe_dram.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 7.5 + +def test_energy_cumulative_counter(mock_energy_monitor): + """ + Unit test for the EnergyCumulativeCounter class. 
This test ensures that the + cumulative energy counter starts and stops correctly, and that the energy + monitoring process is alive during the window, and that the 'inc' method + of the Prometheus Counter is called with the expected incremental energy values for + GPU, CPU, and DRAM. + """ + cumulative_counter_metric = EnergyCumulativeCounter( + energy_monitor=mock_energy_monitor, + update_period=2, + prometheus_url='http://localhost:9091', + job='test_energy_counter' + ) + + # Start the window and check the process + cumulative_counter_metric.begin_window("counter_test") + assert cumulative_counter_metric.proc is not None + assert cumulative_counter_metric.proc.is_alive() # Check if the process is running + + for gpu_index in cumulative_counter_metric.gpu_counters.keys(): + with patch.object(cumulative_counter_metric.gpu_counters[gpu_index], 'inc') as mock_set: + for call_args in mock_set.return_value.labels.return_value.set.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 30.0 + + for cpu_index in cumulative_counter_metric.cpu_counters.keys(): + with patch.object(cumulative_counter_metric.cpu_counters[cpu_index], 'inc') as mock_set: + for call_args in mock_set.return_value.labels.return_value.set.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 15.0 + + for dram_index in cumulative_counter_metric.dram_counters.keys(): + with patch.object(cumulative_counter_metric.dram_counters[dram_index], 'inc') as mock_set: + for call_args in mock_set.return_value.labels.return_value.set.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 7.5 + + # End the window and ensure the process has stopped + cumulative_counter_metric.end_window("counter_test") + cumulative_counter_metric.proc.join() # Ensure the process has finished + assert not cumulative_counter_metric.proc.is_alive() # Process should be done + +def test_power_gauge(mock_power_monitor): + """ + Unit test for the PowerGauge class. This test checks that the power gauge + measurement process starts and stops correctly, and that the mock power monitor + provides valid power measurements during the window, and that the 'set' method + of the Prometheus Gauge is called with the expected power values for GPU. 
+ """ + + power_gauge_metric = PowerGauge( + power_monitor=mock_power_monitor, + update_period=2, + prometheus_url='http://localhost:9091', + job='test_power_gauge' + ) + + power_gauge_metric.begin_window("gauge_test") + assert power_gauge_metric.proc is not None + assert power_gauge_metric.proc.is_alive() + + for gpu_index in power_gauge_metric.gpu_gauges.keys(): + with patch.object(power_gauge_metric.gpu_gauges[gpu_index], 'set') as mock_set: + for call_args in mock_set.return_value.labels.return_value.set.call_args_list: + observed_value = call_args[0][0] + assert observed_value == 300.0 + + power_gauge_metric.end_window("gauge_test") + power_gauge_metric.proc.join() # Ensure the process has finished + assert not power_gauge_metric.proc.is_alive() # Process should be done From e8bfe7bee4485b15fa0c77e72e56cfe71c231a04 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Mon, 11 Nov 2024 13:44:27 -0500 Subject: [PATCH 05/57] Add train_single.py for testing energy monitoring metrics --- examples/prometheus/requirements.txt | 2 + examples/prometheus/train_single.py | 409 +++++++++++++++++++++++++++ 2 files changed, 411 insertions(+) create mode 100644 examples/prometheus/requirements.txt create mode 100644 examples/prometheus/train_single.py diff --git a/examples/prometheus/requirements.txt b/examples/prometheus/requirements.txt new file mode 100644 index 00000000..ac988bdf --- /dev/null +++ b/examples/prometheus/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision diff --git a/examples/prometheus/train_single.py b/examples/prometheus/train_single.py new file mode 100644 index 00000000..28098edb --- /dev/null +++ b/examples/prometheus/train_single.py @@ -0,0 +1,409 @@ +import argparse +import os +import random +import time +from enum import Enum + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +from torch.optim.lr_scheduler import StepLR +import torch.utils.data +from torch.utils.data import DataLoader +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +from zeus.metric import EnergyHistogram +from zeus.metric import EnergyCumulativeCounter +from zeus.metric import PowerGauge + +# ZEUS +from zeus.monitor import ZeusMonitor +from zeus.monitor import PowerMonitor +from zeus.optimizer.power_limit import MaxSlowdownConstraint, GlobalPowerLimitOptimizer +from zeus.utils.env import get_env + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + # List choices of models + model_names = sorted( + name + for name in models.__dict__ + if name.islower() + and not name.startswith("__") + and callable(models.__dict__[name]) + ) + + parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") + parser.add_argument("data", metavar="DIR", help="Path to the ImageNet directory") + parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet18", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet18)", + ) + parser.add_argument( + "-j", + "--workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 4)", + ) + parser.add_argument( + "--epochs", + default=90, + type=int, + metavar="N", + help="number of total epochs to run", + ) + parser.add_argument( + "-b", + "--batch_size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 256)", + 
) + parser.add_argument( + "--lr", + "--learning_rate", + default=0.1, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", + ) + parser.add_argument( + "--momentum", default=0.9, type=float, metavar="M", help="momentum" + ) + parser.add_argument( + "--wd", + "--weight_decay", + default=1e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", + ) + parser.add_argument( + "-p", + "--print_freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", + ) + parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. " + ) + parser.add_argument( + "--gpu", default=0, type=int, metavar="N", help="GPU id to use (default: 0)" + ) + + return parser.parse_args() + + +def main(): + """Main function that prepares values and spawns/calls the worker function.""" + args = parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + optimizer = torch.optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + traindir = os.path.join(args.data, "train") + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ) + + train_loader = DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + ) + val_loader = DataLoader( + val_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + + ################################## The important part ##################################### + # ZeusMonitor is used to profile the time and energy consumption of the GPU. 
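+    # The metric objects created below wrap ZeusMonitor/PowerMonitor and push their
+    # readings to the Prometheus Push Gateway at the given URL (localhost:9091 here,
+    # e.g. the Push Gateway started by docker/prometheus/docker-compose.yml).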
+ energy_monitor = ZeusMonitor(gpu_indices=[args.gpu]) + power_monitor = PowerMonitor(gpu_indices=[args.gpu]) + energy_histogram = EnergyHistogram(energy_monitor=energy_monitor, prometheus_url='http://localhost:9091', job='training_energy_histogram') + power_gauge = PowerGauge(power_monitor=power_monitor, update_period=2, prometheus_url='http://localhost:9091', job='training_power_gauge') + energy_counter = EnergyCumulativeCounter(energy_monitor=energy_monitor, update_period=2, prometheus_url='http://localhost:9091', job='training_energy_counter') + + for epoch in range(args.epochs): + # plo.on_epoch_begin() + + energy_histogram.begin_window(f"epoch_{epoch}") + # energy_counter.begin_window(f"epoch_{epoch}") + # power_gauge.begin_window(f"epoch_{epoch}") + + # train(train_loader, model, criterion, optimizer, epoch, args, plo) + energy_histogram.end_window(f"epoch_{epoch}") + # energy_counter.end_window(f"epoch_{epoch}") + # power_gauge.end_window(f"epoch_{epoch}") + + # plo.on_epoch_end() + + acc1 = validate(val_loader, model, criterion, args) + print(f"Top-1 accuracy: {acc1}") + + scheduler.step() + + energy_counter.begin_window("Counter_window") + power_gauge.begin_window("Gauge Window") + + energy_counter.end_window("Counter_window") + power_gauge.end_window("Gauge Window") + ################################## The important part ##################################### + + +def train( + train_loader, model, criterion, optimizer, epoch, args, power_limit_optimizer +): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + power_limit_optimizer.on_step_begin() # Mark the beginning of one training step. + + # Load data to GPU + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # measure data loading time + data_time.update(time.time() - end) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + power_limit_optimizer.on_step_end() # Mark the end of one training step. 
+ + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + batch_time = AverageMeter("Time", ":6.3f", Summary.NONE) + losses = AverageMeter("Loss", ":.4e", Summary.NONE) + top1 = AverageMeter("Acc@1", ":6.2f", Summary.AVERAGE) + top5 = AverageMeter("Acc@5", ":6.2f", Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix="Test: ", + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + # Load data to GPU + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + progress.display_summary() + + return top1.avg + + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + # DATA PARALLEL + def all_reduce(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = "" + if self.summary_type is Summary.NONE: + fmtstr = "" + elif self.summary_type is Summary.AVERAGE: + fmtstr = "{name} {avg:.3f}" + elif self.summary_type is Summary.SUM: + fmtstr = "{name} {sum:.3f}" + elif self.summary_type is Summary.COUNT: + fmtstr = "{name} {count:.3f}" + else: + raise ValueError("invalid summary type %r" % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(" ".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + 
batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() \ No newline at end of file From 1b9e5412ce96e2878a92d262dce6a9fe13327359 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:17:51 -0500 Subject: [PATCH 06/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index e7cd78fd..f902be0d 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -122,7 +122,7 @@ Zeus allows you to monitor energy and power consumption through different metric You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. -```python hl_lines="2 5-7" +```python hl_lines="2 5-15" from zeus.monitor import ZeusMonitor from zeus.metric import EnergyHistogram From 26e925ea2377c55f2876e18b2b15323d64299624 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Thu, 28 Nov 2024 21:38:06 -0500 Subject: [PATCH 07/57] Refactor metric initialization and multiprocessing logic in metric.py - Changed metric instantiation to accept CPU and GPU indices directly instead of class objects. - Improved multiprocessing logic to address and fix pickle-related errors. - Added consistent handling for sync_execution across begin_window and end_window calls for all metrics. - Centralized bucket range validation and default handling for EnergyHistogram. - Improved error handling and logging for multiprocessing processes. - Standardized Prometheus metric labels (e.g., window and index) across Histogram, Counter, and Gauge. - Updated docstrings for consistency and clarity across all Metric subclasses. --- zeus/metric.py | 375 +++++++++++++++++++++++++++---------------------- 1 file changed, 210 insertions(+), 165 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 8c110eb6..930858c4 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -1,53 +1,59 @@ from __future__ import annotations + import abc import time import warnings import multiprocessing as mp + +from prometheus_client import CollectorRegistry, Histogram, Counter, Gauge, push_to_gateway + from zeus.monitor.power import PowerMonitor from zeus.monitor.energy import ZeusMonitor -from zeus.device.cpu.common import CPU -from prometheus_client import ( - CollectorRegistry, - Histogram, - Counter, - Gauge, - push_to_gateway, -) +from zeus.device.cpu import get_cpus class Metric(abc.ABC): + """ + Abstract base class for all metric types in Zeus. + + Defines a common interface for metrics, ensuring consistent behavior + for `begin_window` and `end_window` operations. + """ @abc.abstractmethod def begin_window(self, name: str): + """Start a new measurement window.""" pass @abc.abstractmethod def end_window(self, name: str): + """End the current measurement window and report metrics.""" pass class EnergyHistogram(Metric): - """EnergyHistogram class to monitor and record energy consumption metrics. + """Measures the energy consumption a code range and exports a histogram metrics. - This class tracks GPU, CPU, and DRAM energy usage, and records the data as Prometheus Histogram metrics. 
- The energy data is pushed to a Prometheus Push Gateway for monitoring and analysis. + Tracks energy consumption for GPUs, CPUs, and DRAM as Prometheus Histogram metrics. Attributes: - energy_monitor: The ZeusMonitor instance that collects energy consumption data. - prometheus_url: The URL of the Prometheus Push Gateway where the metrics will be pushed. - job: The name of the job to associate with the Prometheus metrics. - registry: The Prometheus CollectorRegistry that manages all the Histogram metrics for this class. - bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). - gpu_histograms: A dictionary mapping GPU indices to their respective Histogram metrics. - cpu_histograms: A dictionary mapping CPU indices to their respective Histogram metrics. - dram_histograms: A dictionary mapping DRAM indices to their respective Histogram metrics. + cpu_indices: List of CPU indices to monitor. + gpu_indices: List of GPU indices to monitor. + prometheus_url: Prometheus Push Gateway URL. + job: Prometheus job name. + gpu_bucket_range: Histogram buckets for GPU energy. + cpu_bucket_range: Histogram buckets for CPU energy. + dram_bucket_range: Histogram buckets for DRAM energy. """ def __init__( self, - energy_monitor: ZeusMonitor, + cpu_indices: list, + gpu_indices: list, prometheus_url: str, job: str, - bucket_ranges=None, + gpu_bucket_range: list[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], + cpu_bucket_range: list[float] = [10.0, 20.0, 50.0, 100.0, 200.0], + dram_bucket_range: list[float] = [5.0, 10.0, 20.0, 50.0, 150.0], ) -> None: """ Initialize the EnergyHistogram class. @@ -56,93 +62,85 @@ def __init__( The data will be collected and pushed to the Prometheus Push Gateway at regular intervals. Args: - energy_monitor: The ZeusMonitor instance used to retrieve energy data for the system. - prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be sent. - job: The name of the Prometheus job associated with the energy metrics. - bucket_ranges: Optional custom bucket ranges for the Histogram metrics (GPU, CPU, and DRAM). - If not provided, default bucket ranges will be used for each component. + cpu_indices (list): List of CPU indices to monitor. + gpu_indices (list): List of GPU indices to monitor. + prometheus_url (str): URL of the Prometheus Push Gateway where metrics will be pushed. + job (str): Name of the Prometheus job to associate with the energy metrics. + gpu_bucket_range (list[float], optional): Bucket ranges for GPU energy histograms. + Defaults to [50.0, 100.0, 200.0, 500.0, 1000.0]. + cpu_bucket_range (list[float], optional): Bucket ranges for CPU energy histograms. + Defaults to [10.0, 20.0, 50.0, 100.0, 200.0]. + dram_bucket_range (list[float], optional): Bucket ranges for DRAM energy histograms. + Defaults to [5.0, 10.0, 20.0, 50.0, 150.0]. + Raises: + ValueError: If any of the bucket ranges (GPU, CPU, DRAM) is an empty list. """ - self.energy_monitor = energy_monitor + if not gpu_bucket_range: + raise ValueError("GPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults.") + if not cpu_bucket_range: + raise ValueError("CPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults.") + if not dram_bucket_range: + raise ValueError("DRAM bucket range cannot be empty. 
Please provide a valid range or omit the argument to use defaults.") + + self.gpu_bucket_range = gpu_bucket_range + self.cpu_bucket_range = cpu_bucket_range + self.dram_bucket_range = dram_bucket_range + self.cpu_indices = cpu_indices + self.gpu_indices = gpu_indices self.prometheus_url = prometheus_url self.job = job self.registry = CollectorRegistry() - default_gpu_buckets = [50.0, 100.0, 200.0, 500.0, 1000.0] - default_cpu_buckets = [10.0, 20.0, 50.0, 100.0, 200.0] - default_dram_buckets = [5.0, 10.0, 20.0, 50.0, 150.0] - - self.bucket_ranges = { - "gpu": default_gpu_buckets, - "cpu": default_cpu_buckets, - "dram": default_dram_buckets, - } - - self.bucket_ranges["gpu"] = ( - bucket_ranges.get("gpu") - if bucket_ranges and "gpu" in bucket_ranges - else default_gpu_buckets - ) - - self.bucket_ranges["cpu"] = ( - bucket_ranges.get("cpu") - if bucket_ranges and "cpu" in bucket_ranges - else default_cpu_buckets - ) - - self.bucket_ranges["dram"] = ( - bucket_ranges.get("dram") - if bucket_ranges and "dram" in bucket_ranges - else default_dram_buckets - ) - # If GPU availble, for each gpu_indices, create a Histogram metric with the label window, and index. - if energy_monitor.gpu_indices: - self.gpu_histograms = {} - for gpu_index in self.energy_monitor.gpu_indices: + # Initialize GPU histograms + self.gpu_histograms = {} + if self.gpu_indices: + for gpu_index in gpu_indices: self.gpu_histograms[gpu_index] = Histogram( f"energy_monitor_gpu_{gpu_index}_energy_joules", f"GPU {gpu_index} energy consumption", ["window", "index"], - buckets=self.bucket_ranges.get("gpu", []), + buckets=self.gpu_bucket_range, registry=self.registry, ) - else: - self.gpu_histogram = None - # If CPU available, for each cpu_indices, create a Histogram metric with the label window, and index. - if energy_monitor.cpu_indices: - self.cpu_histograms = {} - for cpu_index in self.energy_monitor.cpu_indices: + # Initialize CPU histograms + self.cpu_histograms = {} + self.dram_histograms = {} + if self.cpu_indices: + for cpu_index in self.cpu_indices: self.cpu_histograms[cpu_index] = Histogram( f"energy_monitor_cpu_{cpu_index}_energy_joules", f"CPU {cpu_index} energy consumption", ["window", "index"], - buckets=self.bucket_ranges.get("cpu", []), + buckets=self.cpu_bucket_range, registry=self.registry, ) - # Only when CPUs are available, we check if DRAM is available using supportsGetDramEnergyConsumption in CPU class - # If DRAM available, we create histogram for each DRAM indices for each CPU indices - if CPU.supportsGetDramEnergyConsumption: - self.dram_histograms = {} - for dram_index in self.energy_monitor.cpu_indices: - self.dram_histograms[dram_index] = Histogram( - f"energy_monitor_dram_{dram_index}_energy_joules", - f"DRAM {dram_index} energy consumption", + # Initialize CPU and DRAM histograms + # Only when CPUs are available, we check if DRAM is available. 
+ for i, cpu in enumerate(get_cpus().cpus): + if cpu.supportsGetDramEnergyConsumption(): + self.dram_histograms[i] = Histogram( + f"energy_monitor_dram_{i}_energy_joules", + f"DRAM {i} energy consumption", ["window", "index"], - buckets=self.bucket_ranges.get("dram", []), + buckets=self.dram_bucket_range, registry=self.registry, ) - else: - self.dram_histogram = None - else: - self.cpu_histogram = None - self.max_gpu_bucket = max(self.bucket_ranges.get("gpu")) - self.max_cpu_bucket = max(self.bucket_ranges.get("cpu")) - self.max_dram_bucket = max(self.bucket_ranges.get("dram")) + self.max_gpu_bucket = max(self.gpu_bucket_range) + self.max_cpu_bucket = max(self.cpu_bucket_range) + self.max_dram_bucket = max(self.dram_bucket_range) + + self.energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) def begin_window(self, name: str) -> None: - """Begin a new energy monitoring window.""" - self.energy_monitor.begin_window(f"__EnergyHistogram_{name}") + """ + Begin the energy monitoring window. + + Args: + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + """ + self.energy_monitor.begin_window(f"__EnergyHistogram_{name}", sync_execution = True) def end_window(self, name: str) -> None: """ @@ -152,20 +150,21 @@ def end_window(self, name: str) -> None: and updates the corresponding Histogram metrics. The data is then pushed to the Prometheus Push Gateway. Args: - name: The name of the monitoring window (used as a label for the Prometheus Histogram metrics). + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. Pushes: - GPU energy data to the Prometheus Push Gateway via the associated Histogram metric. - CPU energy data to the Prometheus Push Gateway via the associated Histogram metric. - DRAM energy data to the Prometheus Push Gateway via the associated Histogram metric. """ - measurement = self.energy_monitor.end_window(f"__EnergyHistogram_{name}") + measurement = self.energy_monitor.end_window(f"__EnergyHistogram_{name}", sync_execution = True) if measurement.gpu_energy: for gpu_index, gpu_energy in measurement.gpu_energy.items(): if gpu_index in self.gpu_histograms: self.gpu_histograms[gpu_index].labels( - window=f"__EnergyHistogram_{name}", index=gpu_index + window=name, + index=gpu_index ).observe(gpu_energy) if gpu_energy > self.max_gpu_bucket: warnings.warn( @@ -176,7 +175,8 @@ def end_window(self, name: str) -> None: for cpu_index, cpu_energy in measurement.cpu_energy.items(): if cpu_index in self.cpu_histograms: self.cpu_histograms[cpu_index].labels( - window=f"__EnergyHistogram_{name}", index=cpu_index + window=name, + index=cpu_index ).observe(cpu_energy) if cpu_energy > self.max_cpu_bucket: warnings.warn( @@ -187,7 +187,8 @@ def end_window(self, name: str) -> None: for dram_index, dram_energy in measurement.dram_energy.items(): if dram_index in self.dram_histograms: self.dram_histograms[dram_index].labels( - window=f"__EnergyHistogram_{name}", index=dram_index + window=name, + index=dram_index ).observe(dram_energy) if dram_energy > self.max_dram_bucket: warnings.warn( @@ -196,7 +197,6 @@ def end_window(self, name: str) -> None: push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) - class EnergyCumulativeCounter(Metric): """ EnergyCumulativeCounter class to monitor and record cumulative energy consumption. 
@@ -212,13 +212,17 @@ class EnergyCumulativeCounter(Metric): update_period: The interval (in seconds) between consecutive energy data updates. prometheus_url: The URL of the Prometheus Push Gateway where the Counter metrics will be pushed. job: The name of the job associated with the energy monitoring in Prometheus. + gpu_counters: A dictionary storing the Prometheus Counter metrics for each GPU. + cpu_counters: A dictionary storing the Prometheus Counter metrics for each CPU. + dram_counters: A dictionary storing the Prometheus Counter metrics for DRAM. queue: A multiprocessing queue used to send signals to start/stop energy monitoring. proc: A multiprocessing process that runs the energy monitoring loop. """ def __init__( self, - energy_monitor: ZeusMonitor, + cpu_indices: list, + gpu_indices: list, update_period: int, prometheus_url: str, job: str, @@ -227,15 +231,22 @@ def __init__( Initialize the EnergyCumulativeCounter. Args: - energy_monitor: The ZeusMonitor instance used to monitor energy consumption. + cpu_indices (list): List of CPU indices to monitor. + gpu_indices (list): List of GPU indices to monitor. update_period: The time interval (in seconds) at which energy measurements are updated. prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. job: The name of the job to be associated with the Prometheus metrics. """ - self.energy_monitor = energy_monitor + self.cpu_indices = cpu_indices + self.gpu_indices = gpu_indices self.update_period = update_period self.prometheus_url = prometheus_url self.job = job + self.gpu_counters = {} + self.cpu_counters = {} + self.dram_counters = {} + self.queue = None + self.proc = None def begin_window(self, name: str) -> None: """ @@ -245,51 +256,70 @@ def begin_window(self, name: str) -> None: and pushes the results to the Prometheus Push Gateway. Args: - name: A unique name for the monitoring window (used as a label in Prometheus metrics). + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - self.queue = mp.Queue() - self.proc = mp.Process( + context = mp.get_context("spawn") + self.queue = context.Queue() + self.proc = context.Process( target=energy_monitoring_loop, args=( name, self.queue, - self.energy_monitor, + self.cpu_indices, + self.gpu_indices, self.update_period, self.prometheus_url, self.job, ), ) self.proc.start() + if not self.proc.is_alive(): + raise RuntimeError(f"Failed to start monitoring process for {name}.") def end_window(self, name: str) -> None: - """End the energy monitoring window.""" - self.queue.put("stop") - self.proc.join() + """ + End the energy monitoring window. + + Args: + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + """ + if not hasattr(self, 'queue') or self.queue is None: + raise RuntimeError( + "EnergyCumulativeCounter's 'queue' is not initialized. " + "Make sure 'begin_window' is called before 'end_window'." + ) + self.queue.put("stop") + self.proc.join(timeout=20) + if self.proc.is_alive(): + warnings.warn(f"Forcefully terminating monitoring process for {name}.") + self.proc.terminate() def energy_monitoring_loop( name: str, pipe: mp.Queue, - energy_monitor: ZeusMonitor, + cpu_indices : list, + gpu_indices : list, update_period: int, prometheus_url: str, job: str, ) -> None: """ - The polling function that runs in a separate process to monitor energy usage. 
- - It periodically collects energy consumption metrics from the energy monitor and - pushes the results to the Prometheus Push Gateway. + This function runs in a separate process to collect and update energy consumption metrics + (for GPUs, CPUs, and DRAM) at regular intervals. It utilizes the Zeus energy monitoring + framework and pushes the collected data to the Prometheus Push Gateway for real-time tracking. Args: - name: The unique name of the monitoring window. - pipe: A multiprocessing queue used to receive signals (e.g., to stop the process). - energy_monitor: The ZeusMonitor instance used to retrieve energy data. - update_period: The interval (in seconds) between energy data polls. - prometheus_url: The URL of the Prometheus Push Gateway. - job: The job name used in Prometheus for Counter metrics. + name (str): The user-defined name of the monitoring window (used as a label for Prometheus metrics). + pipe (mp.Queue): A multiprocessing queue for inter-process communication, used to signal when to stop the process. + cpu_indices (list): List of CPU indices to monitor. + gpu_indices (list): List of GPU indices to monitor. + update_period (int): The interval (in seconds) between consecutive energy data updates. + prometheus_url (str): The URL of the Prometheus Push Gateway where the metrics will be pushed. + job (str): The name of the Prometheus job associated with these metrics. """ registry = CollectorRegistry() + energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) if energy_monitor.gpu_indices: gpu_counters = {} @@ -310,46 +340,46 @@ def energy_monitoring_loop( ["window", "index"], registry=registry, ) - - if CPU.supportsGetDramEnergyConsumption: - dram_counters = {} - for dram_index in energy_monitor.cpu_indices: - dram_counters[dram_index] = Counter( - f"energy_monitor_dram_{dram_index}_energy_joules", - f"DRAM {dram_index} energy consumption", + dram_counters = {} + for i, cpu in enumerate(get_cpus().cpus): + if cpu.supportsGetDramEnergyConsumption(): + dram_counters[i] = Counter( + f"energy_monitor_dram_{i}_energy_joules", + f"DRAM {i} energy consumption", ["window", "index"], registry=registry, ) while True: if not pipe.empty(): - signal = pipe.get() - if signal == "stop": - break + break - energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}") + energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}", sync_execution = False) time.sleep(update_period) - measurement = energy_monitor.end_window(f"__EnergyCumulativeCounter_{name}") + measurement = energy_monitor.end_window(f"__EnergyCumulativeCounter_{name}", sync_execution = False) if measurement.gpu_energy: for gpu_index, energy in measurement.gpu_energy.items(): if gpu_index in gpu_counters: gpu_counters[gpu_index].labels( - window=f"__EnergyCumulativeCounter_{name}", index=gpu_index + window=name, + index=gpu_index ).inc(energy) if measurement.cpu_energy: for cpu_index, energy in measurement.cpu_energy.items(): if cpu_index in cpu_counters: cpu_counters[cpu_index].labels( - window=f"__EnergyCumulativeCounter_{name}", index=cpu_index + window=name, + index=cpu_index ).inc(energy) if measurement.dram_energy: for dram_index, energy in measurement.dram_energy.items(): if dram_index in dram_counters: dram_counters[dram_index].labels( - window=f"__EnergyCumulativeCounter_{name}", index=dram_index + window=name, + index=dram_index ).inc(energy) push_to_gateway(prometheus_url, job=job, registry=registry) @@ -365,17 +395,18 @@ class PowerGauge(Metric): Power usage data is collected at 
regular intervals and pushed to a Prometheus Push Gateway for monitoring. Attributes: - power_monitor: The PowerMonitor instance that retrieves power consumption data for the GPUs. - update_period: The time interval (in seconds) between consecutive power measurements. - prometheus_url: The URL of the Prometheus Push Gateway where the Gauge metrics will be pushed. - job: The name of the job associated with the power metrics in Prometheus. - queue: A multiprocessing queue used to send signals to start/stop power monitoring. - proc: A multiprocessing process that runs the power monitoring loop. + gpu_indices: List of GPU indices to monitor for power consumption. + update_period: Time interval (in seconds) between consecutive power measurements. + prometheus_url: URL of the Prometheus Push Gateway where Gauge metrics are pushed. + job: Name of the Prometheus job associated with the power metrics. + gpu_gauges (dict[int, Gauge]): Dictionary mapping GPU indices to Prometheus Gauge metrics for real-time power consumption tracking. + queue: Queue for controlling the monitoring process. + proc: Process running the power monitoring loop. """ def __init__( self, - power_monitor: PowerMonitor, + gpu_indices: list, update_period: int, prometheus_url: str, job: str, @@ -384,15 +415,16 @@ def __init__( Initialize the PowerGauge metric. Args: - power_monitor: The PowerMonitor instance used to monitor power consumption. - update_period: The interval (in seconds) between power measurement updates. - prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. - job: The name of the job to be associated with the Prometheus metrics. + gpu_indices (list[int]): List of GPU indices to monitor for power consumption. + update_period (int): Interval (in seconds) between consecutive power measurements. + prometheus_url (str): URL of the Prometheus Push Gateway where Gauge metrics are pushed. + job (str): Name of the Prometheus job to associate with the power metrics. """ - self.power_monitor = power_monitor + self.gpu_indices = gpu_indices self.update_period = update_period self.prometheus_url = prometheus_url self.job = job + self.gpu_gauges = {} def begin_window(self, name: str) -> None: """ @@ -403,34 +435,42 @@ def begin_window(self, name: str) -> None: Gauge metrics in Prometheus. Args: - name: A unique name for the monitoring window, used as a label for the Prometheus Gauge metrics. + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - self.queue = mp.Queue() - self.proc = mp.Process( + context = mp.get_context("spawn") + self.queue = context.Queue() + self.proc = context.Process( target=power_monitoring_loop, args=( name, self.queue, - self.power_monitor, + self.gpu_indices, self.update_period, self.prometheus_url, self.job, ), ) self.proc.start() + time.sleep(5) def end_window(self, name: str) -> None: - """End the power monitoring window.""" - self.queue.put("stop") - self.proc.join() + """ + End the power monitoring window. + Args: + name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + """ + + self.queue.put("stop") + self.proc.join(timeout=20) + if self.proc.is_alive(): + warnings.warn(f"Forcefully terminating monitoring process for {name}.") + self.proc.terminate() -# For each GPU, it creates a Prometheus Gauge to record power consumption over time. 
-# Each gauge is associated with a specific GPU index, and Prometheus uses these to track power consumption. def power_monitoring_loop( name: str, pipe: mp.Queue, - power_monitor: PowerMonitor, + gpu_indices: list[int], update_period: int, prometheus_url: str, job: str, @@ -442,36 +482,41 @@ def power_monitoring_loop( to the Prometheus Push Gateway. Args: - name: The unique name for the monitoring window. - pipe: A multiprocessing queue to receive control signals (e.g., "stop"). - power_monitor: The PowerMonitor instance used to retrieve power usage data. - update_period: The interval (in seconds) between power data polls. - prometheus_url: The URL of the Prometheus Push Gateway where metrics are pushed. - job: The job name used in Prometheus for Gauge metrics. + name (str): Unique name for the monitoring window (used as a label in Prometheus metrics). + pipe (multiprocessing.Queue): Queue to receive control signals (e.g., "stop"). + gpu_indices (list[int]): List of GPU indices to monitor for power consumption. + update_period (int): Interval (in seconds) between consecutive power data polls. + prometheus_url (str): URL of the Prometheus Push Gateway where metrics are pushed. + job (str): Name of the Prometheus job to associate with the metrics. """ gpu_gauges = {} + power_monitor = PowerMonitor(gpu_indices=gpu_indices) registry = CollectorRegistry() - for gpu_index in power_monitor.gpu_indices: + for gpu_index in gpu_indices: gpu_gauges[gpu_index] = Gauge( f"power_monitor_gpu_{gpu_index}_power_watts", f"Records power consumption for GPU {gpu_index} over time", - ["gpu_index"], # Label to indicate GPU index + ["gpu_index"], registry=registry, ) while True: if not pipe.empty(): - signal = pipe.get() - if signal == "stop": - break + break power_measurement = power_monitor.get_power() - if power_measurement is not None: - for gpu_index, power_value in power_measurement: - gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( - power_value - ) - push_to_gateway(prometheus_url, job=job, registry=registry) + try: + for gpu_index, power_value in power_measurement.items(): + gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(power_value) + except Exception as e: + print(f"Error during processing power measurement: {e}") + + try: + push_to_gateway(prometheus_url, job=job, registry=registry) + except Exception as e: + print(f"Error pushing metrics: {e}") + time.sleep(update_period) + From 3569b687cdb046d4683c8f3544d0c19b33df7f09 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Thu, 28 Nov 2024 21:49:55 -0500 Subject: [PATCH 08/57] Update prometheus.yml Adjust target names to standardize pushgateway references, ensuring consistency with the Docker Compose configuration. 
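Within the Docker Compose network, Prometheus reaches the other containers through their Compose service names, so the scrape targets become `pushgateway:9091` and `node-exporter:9100` rather than host- or container-specific addresses.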
--- docker/prometheus/prometheus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 8aa5a937..26ff0f5a 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -7,8 +7,8 @@ scrape_configs: - targets: ['localhost:9090'] - job_name: 'pushgateway' static_configs: - - targets: ['zeus-pushgateway-1:9091'] + - targets: ['pushgateway:9091'] - job_name: 'node' static_configs: - - targets: ['localhost:9100'] + - targets: ['node-exporter:9100'] From 29e615bdf86d9348c664618843abdb367de4d557 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Thu, 28 Nov 2024 22:51:02 -0500 Subject: [PATCH 09/57] Improve example training script to include Zeus metrics --- docker/prometheus/docker-compose.yml | 3 +- examples/prometheus/train_single.py | 81 ++++++---- tests/test_metric.py | 223 +++++++++++++++++++-------- 3 files changed, 212 insertions(+), 95 deletions(-) diff --git a/docker/prometheus/docker-compose.yml b/docker/prometheus/docker-compose.yml index 9a23ad57..b1c0448c 100644 --- a/docker/prometheus/docker-compose.yml +++ b/docker/prometheus/docker-compose.yml @@ -3,7 +3,7 @@ services: prometheus: image: prom/prometheus volumes: - - ".prometheus.yml:/etc/prometheus/prometheus.yml" + - "./prometheus.yml:/etc/prometheus/prometheus.yml" networks: - localprom ports: @@ -23,4 +23,3 @@ services: networks: localprom: driver: bridge - diff --git a/examples/prometheus/train_single.py b/examples/prometheus/train_single.py index 28098edb..85a674a8 100644 --- a/examples/prometheus/train_single.py +++ b/examples/prometheus/train_single.py @@ -14,18 +14,18 @@ import torch.utils.data from torch.utils.data import DataLoader import torch.utils.data.distributed +from torch.utils.data import Subset import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models -from zeus.metric import EnergyHistogram -from zeus.metric import EnergyCumulativeCounter -from zeus.metric import PowerGauge +from multiprocessing import set_start_method # ZEUS from zeus.monitor import ZeusMonitor from zeus.monitor import PowerMonitor from zeus.optimizer.power_limit import MaxSlowdownConstraint, GlobalPowerLimitOptimizer from zeus.utils.env import get_env +from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge def parse_args() -> argparse.Namespace: @@ -144,8 +144,7 @@ def main(): traindir, transforms.Compose( [ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), + transforms.Resize((224, 224)), transforms.ToTensor(), normalize, ] @@ -155,8 +154,7 @@ def main(): valdir, transforms.Compose( [ - transforms.Resize(256), - transforms.CenterCrop(224), + transforms.Resize((224, 224)), transforms.ToTensor(), normalize, ] @@ -178,38 +176,63 @@ def main(): pin_memory=True, ) + train_dataset = Subset(train_dataset, range(5)) + val_dataset = Subset(val_dataset, range(2)) + ################################## The important part ##################################### # ZeusMonitor is used to profile the time and energy consumption of the GPU. 
- energy_monitor = ZeusMonitor(gpu_indices=[args.gpu]) - power_monitor = PowerMonitor(gpu_indices=[args.gpu]) - energy_histogram = EnergyHistogram(energy_monitor=energy_monitor, prometheus_url='http://localhost:9091', job='training_energy_histogram') - power_gauge = PowerGauge(power_monitor=power_monitor, update_period=2, prometheus_url='http://localhost:9091', job='training_power_gauge') - energy_counter = EnergyCumulativeCounter(energy_monitor=energy_monitor, update_period=2, prometheus_url='http://localhost:9091', job='training_energy_counter') - for epoch in range(args.epochs): - # plo.on_epoch_begin() + # EnergyHistogram: Records the energy consumption during specific phases of the program execution + # and pushes it to the Prometheus Push Gateway as histogram metrics. + energy_histogram = EnergyHistogram( + cpu_indices=[0], + gpu_indices=[0], + prometheus_url='http://localhost:9091', + job='training_energy_histogram' + ) + + # PowerGauge: Monitors real-time power usage of the GPUs and pushes the data to the Prometheus + # Push Gateway as gauge metrics, updated at regular intervals. + power_gauge = PowerGauge( + gpu_indices=[0], + update_period=2, + prometheus_url='http://localhost:9091', + job='training_power_gauge' + ) - energy_histogram.begin_window(f"epoch_{epoch}") - # energy_counter.begin_window(f"epoch_{epoch}") - # power_gauge.begin_window(f"epoch_{epoch}") + # EnergyCumulativeCounter: Tracks cumulative energy consumption over time for CPUs and GPUs + # and pushes the results to the Prometheus Push Gateway as counter metrics. + energy_counter = EnergyCumulativeCounter( + cpu_indices=[0], + gpu_indices=[0], + update_period=2, + prometheus_url='http://localhost:9091', + job='training_energy_counter' + ) - # train(train_loader, model, criterion, optimizer, epoch, args, plo) - energy_histogram.end_window(f"epoch_{epoch}") - # energy_counter.end_window(f"epoch_{epoch}") - # power_gauge.end_window(f"epoch_{epoch}") + # Start monitoring real-time power usage. + power_gauge.begin_window("epoch_power") - # plo.on_epoch_end() + # Start tracking cumulative energy consumption. + energy_counter.begin_window("epoch_energy") + # Loop through training epochs while measuring energy and power metrics. + for epoch in range(args.epochs): + # Validate the model and compute accuracy. acc1 = validate(val_loader, model, criterion, args) + + # Begin and end energy monitoring for the current epoch. + energy_histogram.begin_window(f"epoch_{epoch}_energy") + energy_histogram.end_window(f"epoch_{epoch}_energy") + print(f"Top-1 accuracy: {acc1}") - scheduler.step() - - energy_counter.begin_window("Counter_window") - power_gauge.begin_window("Gauge Window") + # Allow metrics to capture remaining data before shutting down monitoring. + time.sleep(10) - energy_counter.end_window("Counter_window") - power_gauge.end_window("Gauge Window") + # End the cumulative energy and power monitoring windows. 
+ energy_counter.end_window("epoch_energy") + power_gauge.end_window("epoch_power") ################################## The important part ##################################### @@ -406,4 +429,4 @@ def accuracy(output, target, topk=(1,)): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_metric.py b/tests/test_metric.py index 9d11cd46..33b76189 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -1,8 +1,12 @@ from __future__ import annotations -from unittest.mock import patch + +from unittest.mock import patch, MagicMock import pytest -from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge + +from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge, energy_monitoring_loop + +import multiprocessing as mp class MockMeasurement: """ @@ -14,6 +18,16 @@ def __init__(self, gpu_energy=None, cpu_energy=None, dram_energy=None): self.cpu_energy = cpu_energy or {} self.dram_energy = dram_energy or {} +@pytest.fixture +def mock_gpu_indices(): + """Mock GPU indices for testing.""" + return [0, 1, 2, 3] # 4 GPUs + +@pytest.fixture +def mock_cpu_indices(): + """Mock CPU indices for testing.""" + return [0, 1] # 2 CPUs + class MockZeusMonitor: """ Mock object to simulate an ZeusMonitor, which provides energy measurements @@ -21,9 +35,9 @@ class MockZeusMonitor: to aid in validating the functionality of histogram metrics. """ def __init__(self): - self.gpu_indices = [0, 1, 2, 3] # 4 GPUs in the mock - self.cpu_indices = [0, 1] # 2 CPUs in the mock - self.dram_indices = [0, 1] + self.gpu_indices = mock_gpu_indices + self.cpu_indices = mock_cpu_indices + self.dram_indices = mock_cpu_indices def begin_window(self, name): """ @@ -36,9 +50,9 @@ def end_window(self, name: str) -> MockMeasurement: for testing purposes. """ return MockMeasurement( - gpu_energy={index: 30.0 for index in self.gpu_indices}, # Fixed value for all GPUs - cpu_energy={index: 15.0 for index in self.cpu_indices}, # Fixed value for all CPUs - dram_energy={index: 7.5 for index in self.dram_indices} # Fixed value for all DRAMs + gpu_energy={index: 30.0 for index in self.gpu_indices}, + cpu_energy={index: 15.0 for index in self.cpu_indices}, + dram_energy={index: 7.5 for index in self.dram_indices} ) class MockPowerMonitor: @@ -47,7 +61,7 @@ class MockPowerMonitor: The power values are randomized for testing purposes. """ def __init__(self): - self.gpu_indices = [0, 1, 2, 3] # 4 GPUs + self.gpu_indices = mock_gpu_indices def begin_window(self, name): """ @@ -65,23 +79,23 @@ def end_window(self, name): """ print(f"MockPowerMonitor: end window {name}") -@pytest.fixture -def mock_energy_monitor(): - """ - Returns a mocked energy monitor instance for testing. - """ - return MockZeusMonitor() +# @pytest.fixture +# def mock_energy_monitor(): +# """ +# Returns a mocked energy monitor instance for testing. +# """ +# return MockZeusMonitor() -@pytest.fixture -def mock_power_monitor(): - """ - Returns a mocked power monitor instance for testing. - """ - return MockPowerMonitor() +# @pytest.fixture +# def mock_power_monitor(): +# """ +# Returns a mocked power monitor instance for testing. +# """ +# return MockPowerMonitor() # Test Cases - -def test_energy_histogram(mock_energy_monitor): +@patch("zeus.metric.ZeusMonitor", autospec=True) +def test_energy_histogram(MockZeusMonitor, mock_gpu_indices, mock_cpu_indices): """ Unit test for the EnergyHistogram class. 
This test validates that the `observe()` method on the Prometheus Histogram is correctly called with the fixed GPU, CPU, and @@ -93,20 +107,33 @@ def test_energy_histogram(mock_energy_monitor): "cpu": [5.0, 10.0, 25.0], "dram": [1.0, 2.5, 10.0] } + # Instantiate the EnergyHistogram class with the mock energy monitor and custom bucket ranges + # with patch("zeus.metric.ZeusMonitor", autospec=True) as MockZeusMonitor: + mock_monitor_instance = MockZeusMonitor.return_value + mock_monitor_instance.gpu_indices = mock_gpu_indices + mock_monitor_instance.cpu_indices = mock_cpu_indices + mock_monitor_instance.end_window.return_value = MockMeasurement( + gpu_energy={index: 30.0 for index in mock_gpu_indices}, + cpu_energy={index: 15.0 for index in mock_cpu_indices}, + dram_energy={index: 7.5 for index in mock_cpu_indices}, + ) + histogram_metric = EnergyHistogram( - energy_monitor=mock_energy_monitor, - prometheus_url='http://localhost:9091', - job='test_energy_histogram', + cpu_indices=mock_cpu_indices, + gpu_indices=mock_gpu_indices, + prometheus_url="http://localhost:9091", + job="test_energy_histogram", bucket_ranges=custom_bucket_ranges ) + # Test GPU energy observations for gpu_index in histogram_metric.gpu_histograms.keys(): with patch.object(histogram_metric.gpu_histograms[gpu_index], 'observe') as mock_observe_gpu: histogram_metric.begin_window("test_window") histogram_metric.end_window("test_window") - + for call_args in mock_observe_gpu.call_args_list: observed_value = call_args[0][0] assert observed_value == 30.0 @@ -130,51 +157,118 @@ def test_energy_histogram(mock_energy_monitor): for call_args in mock_observe_dram.call_args_list: observed_value = call_args[0][0] assert observed_value == 7.5 +from unittest.mock import patch, MagicMock +import multiprocessing as mp +from zeus.metric import EnergyCumulativeCounter, energy_monitoring_loop -def test_energy_cumulative_counter(mock_energy_monitor): - """ - Unit test for the EnergyCumulativeCounter class. This test ensures that the - cumulative energy counter starts and stops correctly, and that the energy - monitoring process is alive during the window, and that the 'inc' method - of the Prometheus Counter is called with the expected incremental energy values for - GPU, CPU, and DRAM. 
- """ + +@patch("prometheus_client.Counter") # Mock Prometheus Counter +def test_energy_cumulative_counter_with_multiprocessing(MockCounter): + # Mock indices + mock_gpu_indices = [0, 1, 2, 3] + mock_cpu_indices = [0, 1] + + # Mock Prometheus Counters + mock_gpu_counters = {gpu: MagicMock() for gpu in mock_gpu_indices} + mock_cpu_counters = {cpu: MagicMock() for cpu in mock_cpu_indices} + mock_dram_counters = {dram: MagicMock() for dram in mock_cpu_indices} + + # Mock Counter side effect + def mock_counter_side_effect(name, description, labels, registry): + index = int(name.split("_")[-2]) # Extract index from name + if "gpu" in name: + return mock_gpu_counters[index] + elif "cpu" in name: + return mock_cpu_counters[index] + elif "dram" in name: + return mock_dram_counters[index] + + MockCounter.side_effect = mock_counter_side_effect + + # Mock ZeusMonitor behavior + def start_energy_monitoring_loop(queue, cpu_indices, gpu_indices): + with patch("zeus.metric.ZeusMonitor") as MockZeusMonitor: + mock_monitor_instance = MockZeusMonitor.return_value + mock_monitor_instance.begin_window = MagicMock() + mock_monitor_instance.end_window.return_value = MagicMock( + gpu_energy={index: 30.0 for index in gpu_indices}, + cpu_energy={index: 15.0 for index in cpu_indices}, + dram_energy={index: 7.5 for index in cpu_indices}, + ) + + # Call the energy monitoring loop + energy_monitoring_loop( + name="counter_test", + pipe=queue, + cpu_indices=cpu_indices, + gpu_indices=gpu_indices, + update_period=1, + prometheus_url="http://localhost:9091", + job="test_energy_counter", + ) + + # Initialize EnergyCumulativeCounter cumulative_counter_metric = EnergyCumulativeCounter( - energy_monitor=mock_energy_monitor, - update_period=2, - prometheus_url='http://localhost:9091', - job='test_energy_counter' + cpu_indices=mock_cpu_indices, + gpu_indices=mock_gpu_indices, + update_period=1, # Shorter period for faster test + prometheus_url="http://localhost:9091", + job="test_energy_counter", ) - # Start the window and check the process - cumulative_counter_metric.begin_window("counter_test") - assert cumulative_counter_metric.proc is not None - assert cumulative_counter_metric.proc.is_alive() # Check if the process is running + # Use a real multiprocessing Queue + queue = mp.Queue() - for gpu_index in cumulative_counter_metric.gpu_counters.keys(): - with patch.object(cumulative_counter_metric.gpu_counters[gpu_index], 'inc') as mock_set: - for call_args in mock_set.return_value.labels.return_value.set.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 30.0 + # Start the subprocess + process = mp.Process( + target=start_energy_monitoring_loop, args=(queue, mock_cpu_indices, mock_gpu_indices) + ) + process.start() - for cpu_index in cumulative_counter_metric.cpu_counters.keys(): - with patch.object(cumulative_counter_metric.cpu_counters[cpu_index], 'inc') as mock_set: - for call_args in mock_set.return_value.labels.return_value.set.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 15.0 + # Allow the loop to run for a few iterations + import time + time.sleep(3) # Wait for some time to let the loop run + + # Send the stop signal to end the loop + queue.put("stop") + process.join() # Wait for the process to finish + + # Validate GPU counters + for gpu_index, mock_counter in mock_gpu_counters.items(): + try: + mock_counter.labels.assert_called_with( + window="__EnergyCumulativeCounter_counter_test", index=gpu_index + ) + 
mock_counter.labels.return_value.inc.assert_called_with(30.0) + except AssertionError: + print(f"Assertion failed for GPU counter {gpu_index}") + raise + + # Validate CPU counters + for cpu_index, mock_counter in mock_cpu_counters.items(): + try: + mock_counter.labels.assert_called_with( + window="__EnergyCumulativeCounter_counter_test", index=cpu_index + ) + mock_counter.labels.return_value.inc.assert_called_with(15.0) + except AssertionError: + print(f"Assertion failed for CPU counter {cpu_index}") + raise + + # Validate DRAM counters + for dram_index, mock_counter in mock_dram_counters.items(): + try: + mock_counter.labels.assert_called_with( + window="__EnergyCumulativeCounter_counter_test", index=dram_index + ) + mock_counter.labels.return_value.inc.assert_called_with(7.5) + except AssertionError: + print(f"Assertion failed for DRAM counter {dram_index}") + raise - for dram_index in cumulative_counter_metric.dram_counters.keys(): - with patch.object(cumulative_counter_metric.dram_counters[dram_index], 'inc') as mock_set: - for call_args in mock_set.return_value.labels.return_value.set.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 7.5 - # End the window and ensure the process has stopped - cumulative_counter_metric.end_window("counter_test") - cumulative_counter_metric.proc.join() # Ensure the process has finished - assert not cumulative_counter_metric.proc.is_alive() # Process should be done -def test_power_gauge(mock_power_monitor): +def test_power_gauge(): """ Unit test for the PowerGauge class. This test checks that the power gauge measurement process starts and stops correctly, and that the mock power monitor @@ -183,7 +277,8 @@ def test_power_gauge(mock_power_monitor): """ power_gauge_metric = PowerGauge( - power_monitor=mock_power_monitor, + # power_monitor=mock_power_monitor, + gpu_indices=mock_gpu_indices, update_period=2, prometheus_url='http://localhost:9091', job='test_power_gauge' From 2ae388fc0c6fa7adf72f83258370df0f5845a5e0 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 29 Nov 2024 09:49:48 -0500 Subject: [PATCH 10/57] Remove unintended file tests/test_metric.py from repository --- tests/test_metric.py | 299 ------------------------------------------- 1 file changed, 299 deletions(-) delete mode 100644 tests/test_metric.py diff --git a/tests/test_metric.py b/tests/test_metric.py deleted file mode 100644 index 33b76189..00000000 --- a/tests/test_metric.py +++ /dev/null @@ -1,299 +0,0 @@ -from __future__ import annotations - -from unittest.mock import patch, MagicMock - -import pytest - -from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge, energy_monitoring_loop - -import multiprocessing as mp - -class MockMeasurement: - """ - Mock object representing energy measurements for testing. - Contains energy values for GPU, CPU, and DRAM. - """ - def __init__(self, gpu_energy=None, cpu_energy=None, dram_energy=None): - self.gpu_energy = gpu_energy or {} - self.cpu_energy = cpu_energy or {} - self.dram_energy = dram_energy or {} - -@pytest.fixture -def mock_gpu_indices(): - """Mock GPU indices for testing.""" - return [0, 1, 2, 3] # 4 GPUs - -@pytest.fixture -def mock_cpu_indices(): - """Mock CPU indices for testing.""" - return [0, 1] # 2 CPUs - -class MockZeusMonitor: - """ - Mock object to simulate an ZeusMonitor, which provides energy measurements - for GPU, CPU, and DRAM for use in unit testing. The measurement values are fixed - to aid in validating the functionality of histogram metrics. 
- """ - def __init__(self): - self.gpu_indices = mock_gpu_indices - self.cpu_indices = mock_cpu_indices - self.dram_indices = mock_cpu_indices - - def begin_window(self, name): - """ - Simulates the start of a measurement window. - """ - - def end_window(self, name: str) -> MockMeasurement: - """ - Simulates the end of a measurement window, returning fixed energy measurements - for testing purposes. - """ - return MockMeasurement( - gpu_energy={index: 30.0 for index in self.gpu_indices}, - cpu_energy={index: 15.0 for index in self.cpu_indices}, - dram_energy={index: 7.5 for index in self.dram_indices} - ) - -class MockPowerMonitor: - """ - Mock object to simulate a PowerMonitor, which provides power measurements for GPUs. - The power values are randomized for testing purposes. - """ - def __init__(self): - self.gpu_indices = mock_gpu_indices - - def begin_window(self, name): - """ - Simulates the start of a power measurement window. - """ - def get_power(self): - """ - Returns simulated power measurements for each GPU. - """ - return [(index, 300.0) for index in self.gpu_indices] - - def end_window(self, name): - """ - Simulates the start of a power measurement window. - """ - print(f"MockPowerMonitor: end window {name}") - -# @pytest.fixture -# def mock_energy_monitor(): -# """ -# Returns a mocked energy monitor instance for testing. -# """ -# return MockZeusMonitor() - -# @pytest.fixture -# def mock_power_monitor(): -# """ -# Returns a mocked power monitor instance for testing. -# """ -# return MockPowerMonitor() - -# Test Cases -@patch("zeus.metric.ZeusMonitor", autospec=True) -def test_energy_histogram(MockZeusMonitor, mock_gpu_indices, mock_cpu_indices): - """ - Unit test for the EnergyHistogram class. This test validates that the `observe()` - method on the Prometheus Histogram is correctly called with the fixed GPU, CPU, and - DRAM energy values (30.0, 15.0, and 7.5, respectively). 
- """ - # Define custom bucket ranges for GPU, CPU, and DRAM energy histograms - custom_bucket_ranges = { - "gpu": [10.0, 25.0, 50.0], - "cpu": [5.0, 10.0, 25.0], - "dram": [1.0, 2.5, 10.0] - } - - # Instantiate the EnergyHistogram class with the mock energy monitor and custom bucket ranges - # with patch("zeus.metric.ZeusMonitor", autospec=True) as MockZeusMonitor: - mock_monitor_instance = MockZeusMonitor.return_value - mock_monitor_instance.gpu_indices = mock_gpu_indices - mock_monitor_instance.cpu_indices = mock_cpu_indices - mock_monitor_instance.end_window.return_value = MockMeasurement( - gpu_energy={index: 30.0 for index in mock_gpu_indices}, - cpu_energy={index: 15.0 for index in mock_cpu_indices}, - dram_energy={index: 7.5 for index in mock_cpu_indices}, - ) - - histogram_metric = EnergyHistogram( - cpu_indices=mock_cpu_indices, - gpu_indices=mock_gpu_indices, - prometheus_url="http://localhost:9091", - job="test_energy_histogram", - bucket_ranges=custom_bucket_ranges - ) - - # Test GPU energy observations - for gpu_index in histogram_metric.gpu_histograms.keys(): - with patch.object(histogram_metric.gpu_histograms[gpu_index], 'observe') as mock_observe_gpu: - - histogram_metric.begin_window("test_window") - histogram_metric.end_window("test_window") - - for call_args in mock_observe_gpu.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 30.0 - # Test CPU energy observations - for cpu_index in histogram_metric.cpu_histograms.keys(): - with patch.object(histogram_metric.cpu_histograms[cpu_index], 'observe') as mock_observe_cpu: - # Check that `observe()` was called with the correct CPU energy value - histogram_metric.begin_window("test_window") - histogram_metric.end_window("test_window") - - for call_args in mock_observe_cpu.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 15.0 - # Test DRAM energy observations - for dram_index in histogram_metric.dram_histograms.keys(): - with patch.object(histogram_metric.dram_histograms[dram_index], 'observe') as mock_observe_dram: - # Check that `observe()` was called with the correct DRAM energy value - histogram_metric.begin_window("test_window") - histogram_metric.end_window("test_window") - - for call_args in mock_observe_dram.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 7.5 -from unittest.mock import patch, MagicMock -import multiprocessing as mp -from zeus.metric import EnergyCumulativeCounter, energy_monitoring_loop - - -@patch("prometheus_client.Counter") # Mock Prometheus Counter -def test_energy_cumulative_counter_with_multiprocessing(MockCounter): - # Mock indices - mock_gpu_indices = [0, 1, 2, 3] - mock_cpu_indices = [0, 1] - - # Mock Prometheus Counters - mock_gpu_counters = {gpu: MagicMock() for gpu in mock_gpu_indices} - mock_cpu_counters = {cpu: MagicMock() for cpu in mock_cpu_indices} - mock_dram_counters = {dram: MagicMock() for dram in mock_cpu_indices} - - # Mock Counter side effect - def mock_counter_side_effect(name, description, labels, registry): - index = int(name.split("_")[-2]) # Extract index from name - if "gpu" in name: - return mock_gpu_counters[index] - elif "cpu" in name: - return mock_cpu_counters[index] - elif "dram" in name: - return mock_dram_counters[index] - - MockCounter.side_effect = mock_counter_side_effect - - # Mock ZeusMonitor behavior - def start_energy_monitoring_loop(queue, cpu_indices, gpu_indices): - with patch("zeus.metric.ZeusMonitor") as MockZeusMonitor: - mock_monitor_instance = 
MockZeusMonitor.return_value - mock_monitor_instance.begin_window = MagicMock() - mock_monitor_instance.end_window.return_value = MagicMock( - gpu_energy={index: 30.0 for index in gpu_indices}, - cpu_energy={index: 15.0 for index in cpu_indices}, - dram_energy={index: 7.5 for index in cpu_indices}, - ) - - # Call the energy monitoring loop - energy_monitoring_loop( - name="counter_test", - pipe=queue, - cpu_indices=cpu_indices, - gpu_indices=gpu_indices, - update_period=1, - prometheus_url="http://localhost:9091", - job="test_energy_counter", - ) - - # Initialize EnergyCumulativeCounter - cumulative_counter_metric = EnergyCumulativeCounter( - cpu_indices=mock_cpu_indices, - gpu_indices=mock_gpu_indices, - update_period=1, # Shorter period for faster test - prometheus_url="http://localhost:9091", - job="test_energy_counter", - ) - - # Use a real multiprocessing Queue - queue = mp.Queue() - - # Start the subprocess - process = mp.Process( - target=start_energy_monitoring_loop, args=(queue, mock_cpu_indices, mock_gpu_indices) - ) - process.start() - - # Allow the loop to run for a few iterations - import time - time.sleep(3) # Wait for some time to let the loop run - - # Send the stop signal to end the loop - queue.put("stop") - process.join() # Wait for the process to finish - - # Validate GPU counters - for gpu_index, mock_counter in mock_gpu_counters.items(): - try: - mock_counter.labels.assert_called_with( - window="__EnergyCumulativeCounter_counter_test", index=gpu_index - ) - mock_counter.labels.return_value.inc.assert_called_with(30.0) - except AssertionError: - print(f"Assertion failed for GPU counter {gpu_index}") - raise - - # Validate CPU counters - for cpu_index, mock_counter in mock_cpu_counters.items(): - try: - mock_counter.labels.assert_called_with( - window="__EnergyCumulativeCounter_counter_test", index=cpu_index - ) - mock_counter.labels.return_value.inc.assert_called_with(15.0) - except AssertionError: - print(f"Assertion failed for CPU counter {cpu_index}") - raise - - # Validate DRAM counters - for dram_index, mock_counter in mock_dram_counters.items(): - try: - mock_counter.labels.assert_called_with( - window="__EnergyCumulativeCounter_counter_test", index=dram_index - ) - mock_counter.labels.return_value.inc.assert_called_with(7.5) - except AssertionError: - print(f"Assertion failed for DRAM counter {dram_index}") - raise - - - -def test_power_gauge(): - """ - Unit test for the PowerGauge class. This test checks that the power gauge - measurement process starts and stops correctly, and that the mock power monitor - provides valid power measurements during the window, and that the 'set' method - of the Prometheus Gauge is called with the expected power values for GPU. 
- """ - - power_gauge_metric = PowerGauge( - # power_monitor=mock_power_monitor, - gpu_indices=mock_gpu_indices, - update_period=2, - prometheus_url='http://localhost:9091', - job='test_power_gauge' - ) - - power_gauge_metric.begin_window("gauge_test") - assert power_gauge_metric.proc is not None - assert power_gauge_metric.proc.is_alive() - - for gpu_index in power_gauge_metric.gpu_gauges.keys(): - with patch.object(power_gauge_metric.gpu_gauges[gpu_index], 'set') as mock_set: - for call_args in mock_set.return_value.labels.return_value.set.call_args_list: - observed_value = call_args[0][0] - assert observed_value == 300.0 - - power_gauge_metric.end_window("gauge_test") - power_gauge_metric.proc.join() # Ensure the process has finished - assert not power_gauge_metric.proc.is_alive() # Process should be done From 6a9daa5cb48c343873c732da32ce9ef33316415e Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 29 Nov 2024 12:43:18 -0500 Subject: [PATCH 11/57] Update the doc on Metrics Monitoring and Assumptions --- docs/measure/index.md | 204 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 166 insertions(+), 38 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index f902be0d..0c6796e5 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -88,7 +88,7 @@ To only measure the energy consumption of the CPU used by the current Python pro You can pass in `cpu_indices=[]` or `gpu_indices=[]` to [`ZeusMonitor`][zeus.monitor.ZeusMonitor] to disable either CPU or GPU measurements. -```python hl_lines="2 5-7" +```python hl_lines="2 5-15" from zeus.monitor import ZeusMonitor from zeus.device.cpu import get_current_cpu_index @@ -114,86 +114,214 @@ if __name__ == "__main__": avg_energy = sum(map(lambda m: m.total_energy, steps)) / len(steps) print(f"One step takes {avg_time} s and {avg_energy} J for the CPU.") ``` +# Prometheus Assumptions + +To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. + +--- + +## Assumptions + +1. **Prometheus Push Gateway Deployment** + A Prometheus Push Gateway must be deployed and accessible at the URL provided in your Zeus configuration. This ensures that metrics collected by Zeus can be pushed to Prometheus. + +2. **Prometheus Configuration** + Prometheus is configured to scrape data from the Push Gateway. This involves adding the Push Gateway URL to the Prometheus `prometheus.yml` configuration file. + +3. **Network Accessibility** + If the Push Gateway and Prometheus are hosted on a remote server, ensure that firewall settings allow traffic on the required ports: + - **Push Gateway**: Port `9091` + - **Prometheus**: Port `9090` + +4. **Optional Visualization Tools** + Tools like Grafana can be integrated with Prometheus to create detailed visualizations of the metrics collected. + +--- + +## Setup Guide + +### Step 1: Install and Start the Prometheus Push Gateway +Choose either Option 1 (Binary) or Option 2 (Docker). + +#### Option 1: Download Binary +1. Visit the [Prometheus Push Gateway Download Page](https://prometheus.io/download/#pushgateway). +2. Download the appropriate binary for your operating system. +3. Extract the binary: +```sh + tar -xvzf prometheus-pushgateway*.tar.gz + cd prometheus-pushgateway-* +``` +4. Start the Push Gateway: +```sh +./prometheus-pushgateway --web.listen-address=:9091 +``` +5. 
Verify the Push Gateway is running by visiting http://localhost:9091 in your browser. + +### Option 2: Using Docker +1. Pull the official Prometheus Push Gateway Docker image: +```sh +docker pull prom/pushgateway +``` +2. Run the Push Gateway in a container: +```sh +docker run -d -p 9091:9091 prom/pushgateway +``` +3. Verify it is running by visiting http://localhost:9091 in your browser. + +### Step 2: Install and Configure Prometheus + +1. Visit the Prometheus [Prometheus Download Page](https://prometheus.io/download/#prometheus). +2. Download the appropriate binary for your operating system. +3. Extract the binary: +```sh +tar -xvzf prometheus*.tar.gz +cd prometheus-* +``` +4. Update the Prometheus configuration file `prometheus.yml` to scrape metrics from the Push Gateway: +```sh +scrape_configs: + - job_name: 'pushgateway' + honor_labels: true + static_configs: + - targets: ['localhost:9091'] # Replace with your Push Gateway URL +``` +5. Start Prometheus: +```sh +./prometheus --config.file=prometheus.yml +``` +6. Visit http://localhost:9090 in your browser, or use curl http://localhost:9090/api/v1/targets +7. Verify Prometheus is running by visiting http://localhost:9090 in your browser. + ## Metric Monitoring Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. -[`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is useful for observing how frequently energy usage reaches certain levels. +[`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. ```python hl_lines="2 5-15" -from zeus.monitor import ZeusMonitor + from zeus.metric import EnergyHistogram if __name__ == "__main__": - # Initialize EnergyHistogram with custom bucket ranges - histogram_metric = EnergyHistogram( - energy_monitor=ZeusMonitor, - prometheus_url='http://localhost:9091', - job='energy_histogram_job', - bucket_ranges={ - "gpu": [10.0, 25.0, 50.0, 100.0], - "cpu": [5.0, 10.0, 25.0, 50.0], - "dram": [1.0, 2.5, 5.0, 10.0] - } + # Initialize EnergyHistogram + energy_histogram = EnergyHistogram( + cpu_indices=[0], + gpu_indices=[0], + prometheus_url='http://localhost:9091', + job='training_energy_histogram' ) - histogram_metric.begin_window("histogram_test") - # Perform tasks - histogram_metric.end_window("histogram_test") + for epoch in range(100): + # Start monitoring energy for the entire epoch + energy_histogram.begin_window(f"epoch_{epoch}_energy") + + # Step-level monitoring + for step_idx, (x, y) in enumerate(train_loader): + energy_histogram.begin_window(f"epoch_{epoch}_step_{step_idx}_energy") + train_one_step(x, y) + energy_histogram.end_window(f"epoch_{epoch}_step_{step_idx}_energy") + + # Perform epoch-level operations (e.g., aggregation) + train_one_epoch(train_loader, model, optimizer, criterion, epoch, args) + acc1 = validate(val_loader, model, criterion, args) + + # End monitoring energy for the epoch + energy_histogram.end_window(f"epoch_{epoch}_energy") + + print(f"Epoch {epoch} completed. 
Validation Accuracy: {acc1}%") + ``` You can use the `begin_window` and `end_window` methods to define a measurement window, similar to other ZeusMonitor operations. Energy consumption data will be recorded for the entire duration of the window. !!! Tip - If no custom `bucket ranges` are provided, Zeus uses default ranges for GPU, CPU, and DRAM. + You can customize the bucket ranges for GPUs, CPUs, and DRAM during initialization to tailor the granularity of energy monitoring. For example: + +```python hl_lines="2 5-15" +energy_histogram = EnergyHistogram( + cpu_indices=[0], + gpu_indices=[0], + prometheus_url='http://localhost:9091', + job='training_energy_histogram', + gpu_bucket_range = [10.0, 25.0, 50.0, 100.0], + cpu_bucket_range = [5.0, 15.0, 30.0, 50.0], + dram_bucket_range = [2.0, 8.0, 20.0, 40.0], +) +``` + +If no custom `bucket ranges` are specified, Zeus uses these default ranges: +- **GPU**: `[50.0, 100.0, 200.0, 500.0, 1000.0]` +- **CPU**: `[10.0, 20.0, 50.0, 100.0, 200.0]` +- **DRAM**: `[5.0, 10.0, 20.0, 50.0, 150.0]` + +!!! Warning + Empty bucket ranges (e.g., []) are not allowed and will raise an error. Ensure you provide a valid range for each device or use the defaults. - If you later decide to specify custom bucket ranges only for the GPU while leaving CPU and DRAM to use defaults, you could write: - bucket_ranges={ - "gpu": [10.0, 25.0, 50.0, 100.0] - } [`EnergyCumulativeCounter`][zeus.metric.EnergyCumulativeCounter] monitors cumulative energy consumption. It tracks energy usage over time, without resetting the values, and is updated periodically. -```python hl_lines="2 5-7" -from zeus.monitor import ZeusMonitor +```python hl_lines="2 5-15" + from zeus.metric import EnergyCumulativeCounter if __name__ == "__main__": cumulative_counter_metric = EnergyCumulativeCounter( - energy_monitor=ZeusMonitor, - update_period=2, # Updates energy data every 2 seconds + cpu_indices=[0], + gpu_indices=[0], + update_period=2, prometheus_url='http://localhost:9091', job='energy_counter_job' ) + train_loader = range(10) + val_loader = range(5) + + cumulative_counter_metric.begin_window("training_energy_monitoring") - cumulative_counter_metric.begin_window("counter_test_window") - # Let the counter run - time.sleep(10) # Keep measuring for 10 seconds - cumulative_counter_metric.end_window("counter_test_window") + for epoch in range(100): + print(f"\n--- Epoch {epoch} ---") + train_one_epoch(train_loader, model, optimizer, criterion, epoch, args) + acc1 = validate(val_loader, model, criterion, args) + print(f"Epoch {epoch} completed. Validation Accuracy: {acc1:.2f}%.") + + # Simulate additional operations outside of training + print("\nSimulating additional operations...") + time.sleep(10) + + cumulative_counter_metric.end_window("training_energy_monitoring") ``` -The `update_period` parameter defines how often the energy measurements are updated and pushed to Prometheus. +In this example, `cumulative_counter_metric` monitors energy usage throughout the entire training process rather than on a per-epoch basis. The `update_period` parameter defines how often the energy measurements are updated and pushed to Prometheus. [`PowerGauge`][zeus.metric.PowerGauge] tracks real-time power consumption using Prometheus Gauges which monitors fluctuating values such as power usage. 
-```python hl_lines="2 5-7" -from zeus.monitor.power import PowerMonitor +```python hl_lines="2 5-15" from zeus.metric import PowerGauge if __name__ == "__main__": power_gauge_metric = PowerGauge( - power_monitor=PowerMonitor, - update_period=2, # Updates power consumption every 2 seconds + gpu_indices=[0], + update_period=2, prometheus_url='http://localhost:9091', job='power_gauge_job' ) + train_loader = range(10) + val_loader = range(5) + + power_gauge_metric.begin_window("training_power_monitoring") + + for epoch in range(100): + print(f"\n--- Epoch {epoch} ---") + train_one_epoch(train_loader, model, optimizer, criterion, epoch, args) + acc1 = validate(val_loader, model, criterion, args) + print(f"Epoch {epoch} completed. Validation Accuracy: {acc1:.2f}%.") + + # Simulate additional operations outside of training + print("\nSimulating additional operations...") + time.sleep(10) - power_gauge_metric.begin_window("gauge_test_window") - # Monitor power consumption for 10 seconds - time.sleep(10) - power_gauge_metric.end_window("gauge_test_window") + power_gauge_metric.end_window("training_power_monitoring") ``` The `update_period` parameter defines how often the power datas are updated and pushed to Prometheus. From 69c42da55439b50935a217a64e7df6095d4334f5 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Fri, 29 Nov 2024 12:48:44 -0500 Subject: [PATCH 12/57] Update index.md --- docs/measure/index.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 0c6796e5..fbf443a3 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -114,7 +114,7 @@ if __name__ == "__main__": avg_energy = sum(map(lambda m: m.total_energy, steps)) / len(steps) print(f"One step takes {avg_time} s and {avg_energy} J for the CPU.") ``` -# Prometheus Assumptions +## Metric Monitoring To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. @@ -169,7 +169,6 @@ docker run -d -p 9091:9091 prom/pushgateway 3. Verify it is running by visiting http://localhost:9091 in your browser. ### Step 2: Install and Configure Prometheus - 1. Visit the Prometheus [Prometheus Download Page](https://prometheus.io/download/#prometheus). 2. Download the appropriate binary for your operating system. 3. Extract the binary: @@ -192,8 +191,6 @@ scrape_configs: 6. Visit http://localhost:9090 in your browser, or use curl http://localhost:9090/api/v1/targets 7. Verify Prometheus is running by visiting http://localhost:9090 in your browser. -## Metric Monitoring - Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. [`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. 
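Once the histogram metrics above have been pushed, the recorded energy distribution can be inspected directly in Prometheus. The query below is a minimal sketch: it assumes the metric and label names shown earlier in this document (`energy_monitor_gpu_{gpu_index}_energy_joules` with a `window` label, and the `_bucket`/`le` series that the Prometheus client library exports for every Histogram); adjust the metric name, window name, and time range to match your own setup.

```promql
# Approximate the 90th-percentile GPU 0 energy per measurement window over the last 5 minutes.
# energy_monitor_gpu_0_energy_joules_bucket is the cumulative bucket series exported for the Histogram.
histogram_quantile(
  0.9,
  sum(rate(energy_monitor_gpu_0_energy_joules_bucket{window="epoch_1_energy"}[5m])) by (le)
)
```

Because a Histogram only exports cumulative bucket counts, taking `rate()` over a time range is the usual way to turn them into a per-interval distribution before applying `histogram_quantile`.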
From 4704a67da8fefc4850529045f3e0e08523fdcea0 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Fri, 29 Nov 2024 12:50:30 -0500 Subject: [PATCH 13/57] Update index.md --- docs/measure/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/measure/index.md b/docs/measure/index.md index fbf443a3..515cb2cb 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -193,6 +193,7 @@ scrape_configs: Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. +## Monitoring with Histogram, Counter, Gauge Metric [`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. From 5666ba5bc704c8652e9cc2fbe9267058c0d7b7f1 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 29 Nov 2024 13:13:32 -0500 Subject: [PATCH 14/57] Add README for example training file with Zeus energy metrics integration --- examples/prometheus/README.md | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 examples/prometheus/README.md diff --git a/examples/prometheus/README.md b/examples/prometheus/README.md new file mode 100644 index 00000000..37e8a65a --- /dev/null +++ b/examples/prometheus/README.md @@ -0,0 +1,51 @@ +# Integrating the power limit optimizer with ImageNet training + +This example will demonstrate how to integrate Zeus with `torchvision` and the ImageNet dataset. + +[`train_single.py`](train_single.py) and [`train_dp.py`](train_dp.py) were adapted and simplified from [PyTorch's example training code for ImageNet](https://github.com/pytorch/examples/blob/main/imagenet/main.py). +The former script is for simple single GPU training, whereas the latter is for data parallel training with PyTorch DDP and [`torchrun`](https://pytorch.org/docs/stable/elastic/run.html). + +## Dependencies + +All packages (including torchvision) are pre-installed if you're using our [Docker image](https://ml.energy/zeus/getting_started/environment/). +You just need to download and extract the ImageNet data and mount it to the Docker container with the `-v` option (first step below). + +1. Download the ILSVRC2012 dataset from [the ImageNet homepage](http://www.image-net.org/). + Then, extract archives using [this script](https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh) provided by PyTorch. +1. Install `zeus` and build the power monitor, following [Installing and Building](https://ml.energy/zeus/getting_started/installing_and_building/). +1. Install `torchvision`: + ```sh + pip install torchvision==0.15.2 + ``` + +## EnergyHistogram, PowerGauge, and EnergyCumulativeCounter +- [`EnergyHistogram`][zeus.metric.EnergyHistogram]: Records energy consumption data for GPUs, CPUs, and DRAM and pushes the data to Prometheus as histogram metrics. This is useful for tracking energy usage distribution over time. +- [`PowerGauge`][zeus.metric.PowerGauge]: Monitors real-time GPU power usage and pushes the data to Prometheus as gauge metrics, which are updated at regular intervals. 
+- [`EnergyCumulativeCounter`][zeus.metric.EnergyCumulativeCounter]: Tracks cumulative energy consumption over time for CPUs and GPUs and pushes the results to Prometheus as counter metrics. + +## `ZeusMonitor` and `GlobalPowerLimitOptimizer` + +- [`ZeusMonitor`](http://ml.energy/zeus/reference/monitor/#zeus.monitor.ZeusMonitor): Measures the GPU time and energy consumption of arbitrary code blocks. +- [`GlobalPowerLimitOptimizer`](https://ml.energy/zeus/reference/optimizer/power_limit/#zeus.optimizer.power_limit.GlobalPowerLimitOptimizer): Online-profiles each power limit with `ZeusMonitor` and finds the cost-optimal power limit. + +## Example command + +You can specify the maximum training time slowdown factor (1.0 means no slowdown) by setting `ZEUS_MAX_SLOWDOWN`. The default is set to 1.1 in this example script, meaning the lowest power limit that keeps training time inflation within 10% will be automatically found. +`GlobalPowerLimitOptimizer` supports other optimal power limit selection strategies. See [here](https://ml.energy/zeus/reference/optimizer/power_limit). + +```bash +# Single-GPU +python train_single.py \ + [DATA_DIR] \ + --gpu 0 `# Specify the GPU id to use` + +# Multi-GPU Data Parallel +torchrun \ + --nnodes 1 \ + --nproc_per_node gpu `# Number of processes per node, should be equal to the number of GPUs.` \ + `# When set to 'gpu', it means use all the GPUs available.` \ + train_dp.py \ + [DATA_DIR] +``` + + From 863f2575788e94d9ac6d312004a237c22a0f7e0c Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 29 Nov 2024 15:20:54 -0500 Subject: [PATCH 15/57] Add Metric Name Construction section on index.md --- docs/measure/index.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 515cb2cb..1236a94f 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -116,8 +116,29 @@ if __name__ == "__main__": ``` ## Metric Monitoring -To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. +To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. +### **Metric Name Construction** +Zeus organizes metrics using **static metric names** and **dynamic labels** to ensure flexibility and ease of querying in Prometheus. Below, we document how metric names are constructed, how the `job` and `window` parameters affect the metrics, and how users can query them effectively. + +Currently, metric names (e.g., `energy_monitor_gpu_{gpu_index}_energy_joules`) are static and cannot be overridden. However, users can customize the name of the window to define the context of the metrics. + +- **Metric Name**: `energy_monitor_gpu_{gpu_index}_energy_joules` +- **Labels**: + - `window`: The user-defined window name provided during `begin_window()` and `end_window()` (e.g. energy_histogram.begin_window(f"epoch_{epoch}_energy")). + - `index`: The GPU index (e.g., `0` for GPU 0). + + +## How to Query Metrics in Prometheus + +### 1. 
Query Metrics for a Specific Window +Retrieve energy metrics for a GPU during a specific window: +```promql +energy_monitor_gpu_0_energy_joules{window="epoch_1_step_0"} +``` +```promql +sum(energy_monitor_gpu_0_energy_joules) by (window) +``` --- ## Assumptions @@ -193,7 +214,8 @@ scrape_configs: Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. -## Monitoring with Histogram, Counter, Gauge Metric +--- + [`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. From 35ab26747f2aa7ec71fbf65a2891dda4a80a956b Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Fri, 29 Nov 2024 15:23:26 -0500 Subject: [PATCH 16/57] Update index.md --- docs/measure/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 1236a94f..f3fef874 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -119,7 +119,7 @@ if __name__ == "__main__": To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. ### **Metric Name Construction** -Zeus organizes metrics using **static metric names** and **dynamic labels** to ensure flexibility and ease of querying in Prometheus. Below, we document how metric names are constructed, how the `job` and `window` parameters affect the metrics, and how users can query them effectively. +Zeus organizes metrics using **static metric names** and **dynamic labels** to ensure flexibility and ease of querying in Prometheus. Below, we document how metric names are constructed and how users can query them effectively. Currently, metric names (e.g., `energy_monitor_gpu_{gpu_index}_energy_joules`) are static and cannot be overridden. However, users can customize the name of the window to define the context of the metrics. From 4aa0f393906aeadd72082b23a6b0e93bad71d344 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Fri, 29 Nov 2024 15:31:10 -0500 Subject: [PATCH 17/57] Update README.md to include the dependency on prometheus_client --- examples/prometheus/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/prometheus/README.md b/examples/prometheus/README.md index 37e8a65a..c09e39e5 100644 --- a/examples/prometheus/README.md +++ b/examples/prometheus/README.md @@ -7,7 +7,7 @@ The former script is for simple single GPU training, whereas the latter is for d ## Dependencies -All packages (including torchvision) are pre-installed if you're using our [Docker image](https://ml.energy/zeus/getting_started/environment/). +All packages (including torchvision and prometheus_client) are pre-installed if you're using our [Docker image](https://ml.energy/zeus/getting_started/environment/). You just need to download and extract the ImageNet data and mount it to the Docker container with the `-v` option (first step below). 1. Download the ILSVRC2012 dataset from [the ImageNet homepage](http://www.image-net.org/). 
@@ -17,6 +17,10 @@ You just need to download and extract the ImageNet data and mount it to the Dock ```sh pip install torchvision==0.15.2 ``` +1. Install `prometheus_client`: + ```sh + pip install prometheus-client + ``` ## EnergyHistogram, PowerGauge, and EnergyCumulativeCounter - [`EnergyHistogram`][zeus.metric.EnergyHistogram]: Records energy consumption data for GPUs, CPUs, and DRAM and pushes the data to Prometheus as histogram metrics. This is useful for tracking energy usage distribution over time. From 1e996a4fbf90fc9612d031772ef2ecf821b5b828 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 00:21:06 -0500 Subject: [PATCH 18/57] Update unit tests for the modified metric.py --- tests/test_metric.py | 214 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 tests/test_metric.py diff --git a/tests/test_metric.py b/tests/test_metric.py new file mode 100644 index 00000000..02e230b0 --- /dev/null +++ b/tests/test_metric.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +from unittest.mock import patch, MagicMock + +import pytest + +from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge + +@pytest.fixture +def mock_get_cpus(): + """Fixture to mock `get_cpus()` to avoid RAPL-related errors.""" + + with patch("zeus.metric.get_cpus", autospec=True) as mock_get_cpus: + mock_cpu = MagicMock() + mock_cpu.cpus = [] + mock_get_cpus.return_value = mock_cpu + yield mock_get_cpus + + +@pytest.fixture +def mock_zeus_monitor(): + """Fixture to mock ZeusMonitor behavior.""" + + with patch("zeus.metric.ZeusMonitor", autospec=True) as MockZeusMonitor: + mock_instance = MockZeusMonitor.return_value + mock_instance.end_window.return_value = MagicMock( + gpu_energy={0: 30.0, 1: 35.0, 2: 40.0}, + cpu_energy={0: 20.0, 1: 25.0}, + dram_energy={}, + ) + mock_instance.gpu_indices = [0, 1, 2] + mock_instance.cpu_indices = [0, 1] + yield mock_instance + +@pytest.fixture +def mock_histogram(): + """Fixture to mock Prometheus Histogram creation. + + Mocks the Histogram functionality to avoid real Prometheus interactions + and to validate histogram-related method calls. + """ + + with patch("zeus.metric.Histogram", autospec=True) as MockHistogram: + yield MockHistogram + +def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): + """Test EnergyHistogram class. + + Validates that GPU, CPU, and DRAM histograms are properly initialized, + and that the correct energy values are recorded. + + Args: + mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. + mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. + mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. 
+ """ + + cpu_indices = [0, 1] + gpu_indices = [0, 1, 2] + prometheus_url = "http://localhost:9091" + + histogram_metric = EnergyHistogram( + cpu_indices=cpu_indices, + gpu_indices=gpu_indices, + prometheus_url=prometheus_url, + job="test_energy_histogram", + ) + + for gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): + gpu_histogram.labels = MagicMock(return_value=gpu_histogram) + gpu_histogram.observe = MagicMock() + + for cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): + cpu_histogram.labels = MagicMock(return_value=cpu_histogram) + cpu_histogram.observe = MagicMock() + + for dram_index, dram_histogram in histogram_metric.dram_histograms.items(): + dram_histogram.labels = MagicMock(return_value=dram_histogram) + dram_histogram.observe = MagicMock() + + histogram_metric.begin_window("test_window") + histogram_metric.end_window("test_window") + + # Assert GPU histograms were observed + for gpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + calls = [call[0][0] for call in histogram_metric.gpu_histograms[gpu_index].observe.call_args_list] + print(f"Observed calls for GPU {gpu_index}: {calls}") + assert energy in calls, f"Expected {energy} in {calls}" + + # Assert CPU histograms were observed + for cpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + calls = [call[0][0] for call in histogram_metric.cpu_histograms[cpu_index].observe.call_args_list] + print(f"Observed CPU calls for CPU {cpu_index}: {calls}") + assert energy in calls, f"Expected CPU energy {energy} in {calls}" + + # Assert DRAM histograms were observed + for dram_index, energy in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): + calls = [call[0][0] for call in histogram_metric.dram_histograms[dram_index].observe.call_args_list] + print(f"Observed DRAM calls for CPU {dram_index}: {calls}") + assert energy in calls, f"Expected DRAM energy {energy} in {calls}" + +def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): + """Test EnergyCumulativeCounter with mocked ZeusMonitor. + + Args: + mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. + mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. 
+ """ + + cpu_indices = [0, 1] + gpu_indices = [0, 1, 2] + prometheus_url = "http://localhost:9091" + + cumulative_counter = EnergyCumulativeCounter( + cpu_indices=cpu_indices, + gpu_indices=gpu_indices, + update_period=2, + prometheus_url=prometheus_url, + job="test_energy_counter", + ) + + for counters in [ + cumulative_counter.gpu_counters, + cumulative_counter.cpu_counters, + ]: + for counter in counters.values(): + counter.labels = MagicMock(return_value=counter) + counter.inc = MagicMock() + + cumulative_counter.begin_window("test_counter") + cumulative_counter.end_window("test_counter") + + # Assert GPU counters + for gpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + assert gpu_index in cumulative_counter.gpu_counters, f"GPU counter for index {gpu_index} not initialized" + cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy) + + # Assert CPU counters + for cpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + assert cpu_index in cumulative_counter.cpu_counters, f"CPU counter for index {cpu_index} not initialized" + cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) + +@pytest.fixture +def mock_power_monitor(): + """Fixture to mock PowerMonitor.""" + + with patch("zeus.metric.PowerMonitor", autospec=True) as MockPowerMonitor: + mock_instance = MockPowerMonitor.return_value + mock_instance.get_power.return_value = { + 0: 300.0, + 1: 310.0, + 2: 320.0, + } + yield mock_instance + +@pytest.fixture +def mock_gauge(): + """Fixture to mock Prometheus Gauge creation.""" + + with patch("zeus.metric.Gauge", autospec=True) as MockGauge: + MockGauge.side_effect = lambda *args, **kwargs: MagicMock() + yield MockGauge + +@patch("prometheus_client.push_to_gateway") +@patch("zeus.device.gpu.get_gpus") +def test_power_gauge( + mock_get_gpus: MagicMock, + mock_power_monitor: MagicMock, + mock_gauge: MagicMock, + mock_push_to_gateway +) -> None: + """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. + + Args: + mock_get_gpus (MagicMock): Mocked `get_gpus` function to simulate available GPUs. + mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. + mock_gauge (MagicMock): Mocked Prometheus Gauge creation. 
+ """ + + gpu_indices = [0, 1, 2] + prometheus_url = "http://localhost:9091" + mock_push_to_gateway.return_value = None + + # Mock `get_gpus` to simulate available GPUs + mock_get_gpus.return_value = MagicMock() + mock_get_gpus.return_value.gpus = gpu_indices + + mock_gauge.side_effect = lambda *args, **kwargs: MagicMock() + + power_gauge = PowerGauge( + gpu_indices=gpu_indices, + update_period=2, + prometheus_url=prometheus_url, + job="test_power_gauge", + ) + for gpu_index, gauge in power_gauge.gpu_gauges.items(): + gauge.labels = MagicMock(return_value=gauge) + gauge.set = MagicMock() + + power_gauge.begin_window("test_power_window") + power_gauge.end_window("test_power_window") + + # Assert that the gauges were set with the correct power values + for gpu_index, power_value in mock_power_monitor.return_value.get_power.return_value.items(): + try: + # Check if `labels` was called with the correct arguments + power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( + gpu_index=gpu_index, window="test_power_window" + ) + power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value) + except AssertionError as e: + print(f"AssertionError for GPU {gpu_index}:") + raise e + From 8e1d35b5c88c79d5c9604c668bf7340414415bab Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 01:00:48 -0500 Subject: [PATCH 19/57] Fix formatting issues detected by black --- tests/test_metric.py | 103 +++++++++++++++++++++++++++++------------- zeus/metric.py | 104 ++++++++++++++++++++++++++----------------- 2 files changed, 136 insertions(+), 71 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index 02e230b0..ace4ab65 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -6,13 +6,14 @@ from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge + @pytest.fixture def mock_get_cpus(): """Fixture to mock `get_cpus()` to avoid RAPL-related errors.""" with patch("zeus.metric.get_cpus", autospec=True) as mock_get_cpus: mock_cpu = MagicMock() - mock_cpu.cpus = [] + mock_cpu.cpus = [] mock_get_cpus.return_value = mock_cpu yield mock_get_cpus @@ -24,14 +25,15 @@ def mock_zeus_monitor(): with patch("zeus.metric.ZeusMonitor", autospec=True) as MockZeusMonitor: mock_instance = MockZeusMonitor.return_value mock_instance.end_window.return_value = MagicMock( - gpu_energy={0: 30.0, 1: 35.0, 2: 40.0}, - cpu_energy={0: 20.0, 1: 25.0}, - dram_energy={}, + gpu_energy={0: 30.0, 1: 35.0, 2: 40.0}, + cpu_energy={0: 20.0, 1: 25.0}, + dram_energy={}, ) - mock_instance.gpu_indices = [0, 1, 2] - mock_instance.cpu_indices = [0, 1] + mock_instance.gpu_indices = [0, 1, 2] + mock_instance.cpu_indices = [0, 1] yield mock_instance + @pytest.fixture def mock_histogram(): """Fixture to mock Prometheus Histogram creation. @@ -42,7 +44,8 @@ def mock_histogram(): with patch("zeus.metric.Histogram", autospec=True) as MockHistogram: yield MockHistogram - + + def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): """Test EnergyHistogram class. @@ -54,9 +57,9 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. 
""" - - cpu_indices = [0, 1] - gpu_indices = [0, 1, 2] + + cpu_indices = [0, 1] + gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" histogram_metric = EnergyHistogram( @@ -69,7 +72,7 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): for gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): gpu_histogram.labels = MagicMock(return_value=gpu_histogram) gpu_histogram.observe = MagicMock() - + for cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): cpu_histogram.labels = MagicMock(return_value=cpu_histogram) cpu_histogram.observe = MagicMock() @@ -82,29 +85,54 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): histogram_metric.end_window("test_window") # Assert GPU histograms were observed - for gpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - calls = [call[0][0] for call in histogram_metric.gpu_histograms[gpu_index].observe.call_args_list] + for ( + gpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.gpu_histograms[ + gpu_index + ].observe.call_args_list + ] print(f"Observed calls for GPU {gpu_index}: {calls}") assert energy in calls, f"Expected {energy} in {calls}" - + # Assert CPU histograms were observed - for cpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - calls = [call[0][0] for call in histogram_metric.cpu_histograms[cpu_index].observe.call_args_list] + for ( + cpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.cpu_histograms[ + cpu_index + ].observe.call_args_list + ] print(f"Observed CPU calls for CPU {cpu_index}: {calls}") assert energy in calls, f"Expected CPU energy {energy} in {calls}" - + # Assert DRAM histograms were observed - for dram_index, energy in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): - calls = [call[0][0] for call in histogram_metric.dram_histograms[dram_index].observe.call_args_list] + for ( + dram_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.dram_histograms[ + dram_index + ].observe.call_args_list + ] print(f"Observed DRAM calls for CPU {dram_index}: {calls}") assert energy in calls, f"Expected DRAM energy {energy} in {calls}" + def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): """Test EnergyCumulativeCounter with mocked ZeusMonitor. Args: mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. - mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. + mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. 
""" cpu_indices = [0, 1] @@ -131,15 +159,26 @@ def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): cumulative_counter.end_window("test_counter") # Assert GPU counters - for gpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - assert gpu_index in cumulative_counter.gpu_counters, f"GPU counter for index {gpu_index} not initialized" + for ( + gpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + assert ( + gpu_index in cumulative_counter.gpu_counters + ), f"GPU counter for index {gpu_index} not initialized" cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy) # Assert CPU counters - for cpu_index, energy in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - assert cpu_index in cumulative_counter.cpu_counters, f"CPU counter for index {cpu_index} not initialized" + for ( + cpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + assert ( + cpu_index in cumulative_counter.cpu_counters + ), f"CPU counter for index {cpu_index} not initialized" cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) + @pytest.fixture def mock_power_monitor(): """Fixture to mock PowerMonitor.""" @@ -153,21 +192,23 @@ def mock_power_monitor(): } yield mock_instance + @pytest.fixture def mock_gauge(): """Fixture to mock Prometheus Gauge creation.""" - + with patch("zeus.metric.Gauge", autospec=True) as MockGauge: MockGauge.side_effect = lambda *args, **kwargs: MagicMock() yield MockGauge + @patch("prometheus_client.push_to_gateway") @patch("zeus.device.gpu.get_gpus") def test_power_gauge( - mock_get_gpus: MagicMock, - mock_power_monitor: MagicMock, + mock_get_gpus: MagicMock, + mock_power_monitor: MagicMock, mock_gauge: MagicMock, - mock_push_to_gateway + mock_push_to_gateway, ) -> None: """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. @@ -201,7 +242,10 @@ def test_power_gauge( power_gauge.end_window("test_power_window") # Assert that the gauges were set with the correct power values - for gpu_index, power_value in mock_power_monitor.return_value.get_power.return_value.items(): + for ( + gpu_index, + power_value, + ) in mock_power_monitor.return_value.get_power.return_value.items(): try: # Check if `labels` was called with the correct arguments power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( @@ -211,4 +255,3 @@ def test_power_gauge( except AssertionError as e: print(f"AssertionError for GPU {gpu_index}:") raise e - diff --git a/zeus/metric.py b/zeus/metric.py index 930858c4..586ca688 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -5,13 +5,20 @@ import warnings import multiprocessing as mp -from prometheus_client import CollectorRegistry, Histogram, Counter, Gauge, push_to_gateway +from prometheus_client import ( + CollectorRegistry, + Histogram, + Counter, + Gauge, + push_to_gateway, +) from zeus.monitor.power import PowerMonitor from zeus.monitor.energy import ZeusMonitor from zeus.device.cpu import get_cpus + class Metric(abc.ABC): """ Abstract base class for all metric types in Zeus. @@ -19,6 +26,7 @@ class Metric(abc.ABC): Defines a common interface for metrics, ensuring consistent behavior for `begin_window` and `end_window` operations. """ + @abc.abstractmethod def begin_window(self, name: str): """Start a new measurement window.""" @@ -66,22 +74,28 @@ def __init__( gpu_indices (list): List of GPU indices to monitor. 
prometheus_url (str): URL of the Prometheus Push Gateway where metrics will be pushed. job (str): Name of the Prometheus job to associate with the energy metrics. - gpu_bucket_range (list[float], optional): Bucket ranges for GPU energy histograms. + gpu_bucket_range (list[float], optional): Bucket ranges for GPU energy histograms. Defaults to [50.0, 100.0, 200.0, 500.0, 1000.0]. - cpu_bucket_range (list[float], optional): Bucket ranges for CPU energy histograms. + cpu_bucket_range (list[float], optional): Bucket ranges for CPU energy histograms. Defaults to [10.0, 20.0, 50.0, 100.0, 200.0]. - dram_bucket_range (list[float], optional): Bucket ranges for DRAM energy histograms. + dram_bucket_range (list[float], optional): Bucket ranges for DRAM energy histograms. Defaults to [5.0, 10.0, 20.0, 50.0, 150.0]. Raises: ValueError: If any of the bucket ranges (GPU, CPU, DRAM) is an empty list. """ if not gpu_bucket_range: - raise ValueError("GPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults.") + raise ValueError( + "GPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." + ) if not cpu_bucket_range: - raise ValueError("CPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults.") + raise ValueError( + "CPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." + ) if not dram_bucket_range: - raise ValueError("DRAM bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults.") - + raise ValueError( + "DRAM bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." + ) + self.gpu_bucket_range = gpu_bucket_range self.cpu_bucket_range = cpu_bucket_range self.dram_bucket_range = dram_bucket_range @@ -131,16 +145,20 @@ def __init__( self.max_cpu_bucket = max(self.cpu_bucket_range) self.max_dram_bucket = max(self.dram_bucket_range) - self.energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) + self.energy_monitor = ZeusMonitor( + cpu_indices=cpu_indices, gpu_indices=gpu_indices + ) def begin_window(self, name: str) -> None: """ Begin the energy monitoring window. - + Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - self.energy_monitor.begin_window(f"__EnergyHistogram_{name}", sync_execution = True) + self.energy_monitor.begin_window( + f"__EnergyHistogram_{name}", sync_execution=True + ) def end_window(self, name: str) -> None: """ @@ -157,14 +175,15 @@ def end_window(self, name: str) -> None: - CPU energy data to the Prometheus Push Gateway via the associated Histogram metric. - DRAM energy data to the Prometheus Push Gateway via the associated Histogram metric. 
""" - measurement = self.energy_monitor.end_window(f"__EnergyHistogram_{name}", sync_execution = True) + measurement = self.energy_monitor.end_window( + f"__EnergyHistogram_{name}", sync_execution=True + ) if measurement.gpu_energy: for gpu_index, gpu_energy in measurement.gpu_energy.items(): if gpu_index in self.gpu_histograms: self.gpu_histograms[gpu_index].labels( - window=name, - index=gpu_index + window=name, index=gpu_index ).observe(gpu_energy) if gpu_energy > self.max_gpu_bucket: warnings.warn( @@ -175,8 +194,7 @@ def end_window(self, name: str) -> None: for cpu_index, cpu_energy in measurement.cpu_energy.items(): if cpu_index in self.cpu_histograms: self.cpu_histograms[cpu_index].labels( - window=name, - index=cpu_index + window=name, index=cpu_index ).observe(cpu_energy) if cpu_energy > self.max_cpu_bucket: warnings.warn( @@ -187,8 +205,7 @@ def end_window(self, name: str) -> None: for dram_index, dram_energy in measurement.dram_energy.items(): if dram_index in self.dram_histograms: self.dram_histograms[dram_index].labels( - window=name, - index=dram_index + window=name, index=dram_index ).observe(dram_energy) if dram_energy > self.max_dram_bucket: warnings.warn( @@ -197,6 +214,7 @@ def end_window(self, name: str) -> None: push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) + class EnergyCumulativeCounter(Metric): """ EnergyCumulativeCounter class to monitor and record cumulative energy consumption. @@ -279,12 +297,12 @@ def begin_window(self, name: str) -> None: def end_window(self, name: str) -> None: """ End the energy monitoring window. - + Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - if not hasattr(self, 'queue') or self.queue is None: + if not hasattr(self, "queue") or self.queue is None: raise RuntimeError( "EnergyCumulativeCounter's 'queue' is not initialized. " "Make sure 'begin_window' is called before 'end_window'." 
@@ -295,11 +313,12 @@ def end_window(self, name: str) -> None: warnings.warn(f"Forcefully terminating monitoring process for {name}.") self.proc.terminate() + def energy_monitoring_loop( name: str, pipe: mp.Queue, - cpu_indices : list, - gpu_indices : list, + cpu_indices: list, + gpu_indices: list, update_period: int, prometheus_url: str, job: str, @@ -342,8 +361,8 @@ def energy_monitoring_loop( ) dram_counters = {} for i, cpu in enumerate(get_cpus().cpus): - if cpu.supportsGetDramEnergyConsumption(): - dram_counters[i] = Counter( + if cpu.supportsGetDramEnergyConsumption(): + dram_counters[i] = Counter( f"energy_monitor_dram_{i}_energy_joules", f"DRAM {i} energy consumption", ["window", "index"], @@ -354,33 +373,34 @@ def energy_monitoring_loop( if not pipe.empty(): break - energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}", sync_execution = False) + energy_monitor.begin_window( + f"__EnergyCumulativeCounter_{name}", sync_execution=False + ) time.sleep(update_period) - measurement = energy_monitor.end_window(f"__EnergyCumulativeCounter_{name}", sync_execution = False) + measurement = energy_monitor.end_window( + f"__EnergyCumulativeCounter_{name}", sync_execution=False + ) if measurement.gpu_energy: for gpu_index, energy in measurement.gpu_energy.items(): if gpu_index in gpu_counters: - gpu_counters[gpu_index].labels( - window=name, - index=gpu_index - ).inc(energy) + gpu_counters[gpu_index].labels(window=name, index=gpu_index).inc( + energy + ) if measurement.cpu_energy: for cpu_index, energy in measurement.cpu_energy.items(): if cpu_index in cpu_counters: - cpu_counters[cpu_index].labels( - window=name, - index=cpu_index - ).inc(energy) + cpu_counters[cpu_index].labels(window=name, index=cpu_index).inc( + energy + ) if measurement.dram_energy: for dram_index, energy in measurement.dram_energy.items(): if dram_index in dram_counters: - dram_counters[dram_index].labels( - window=name, - index=dram_index - ).inc(energy) + dram_counters[dram_index].labels(window=name, index=dram_index).inc( + energy + ) push_to_gateway(prometheus_url, job=job, registry=registry) @@ -467,6 +487,7 @@ def end_window(self, name: str) -> None: warnings.warn(f"Forcefully terminating monitoring process for {name}.") self.proc.terminate() + def power_monitoring_loop( name: str, pipe: mp.Queue, @@ -497,7 +518,7 @@ def power_monitoring_loop( gpu_gauges[gpu_index] = Gauge( f"power_monitor_gpu_{gpu_index}_power_watts", f"Records power consumption for GPU {gpu_index} over time", - ["gpu_index"], + ["gpu_index"], registry=registry, ) @@ -509,7 +530,9 @@ def power_monitoring_loop( try: for gpu_index, power_value in power_measurement.items(): - gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(power_value) + gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( + power_value + ) except Exception as e: print(f"Error during processing power measurement: {e}") @@ -519,4 +542,3 @@ def power_monitoring_loop( print(f"Error pushing metrics: {e}") time.sleep(update_period) - From 7e47fbb6f2a8d288e76e130b29d983bff8618671 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 01:33:30 -0500 Subject: [PATCH 20/57] Fix formatting issues detected by black --- tests/test_metric.py | 69 ++++++++++++++++++---------------------- zeus/metric.py | 75 +++++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 85 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index ace4ab65..a9413f0d 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py 
@@ -1,3 +1,5 @@ +"""Test metric.py.""" + from __future__ import annotations from unittest.mock import patch, MagicMock @@ -10,7 +12,6 @@ @pytest.fixture def mock_get_cpus(): """Fixture to mock `get_cpus()` to avoid RAPL-related errors.""" - with patch("zeus.metric.get_cpus", autospec=True) as mock_get_cpus: mock_cpu = MagicMock() mock_cpu.cpus = [] @@ -21,9 +22,8 @@ def mock_get_cpus(): @pytest.fixture def mock_zeus_monitor(): """Fixture to mock ZeusMonitor behavior.""" - - with patch("zeus.metric.ZeusMonitor", autospec=True) as MockZeusMonitor: - mock_instance = MockZeusMonitor.return_value + with patch("zeus.metric.ZeusMonitor", autospec=True) as zeus_monitor: + mock_instance = zeus_monitor.return_value mock_instance.end_window.return_value = MagicMock( gpu_energy={0: 30.0, 1: 35.0, 2: 40.0}, cpu_energy={0: 20.0, 1: 25.0}, @@ -34,6 +34,19 @@ def mock_zeus_monitor(): yield mock_instance +@pytest.fixture +def mock_power_monitor(): + """Fixture to mock PowerMonitor.""" + with patch("zeus.metric.PowerMonitor", autospec=True) as power_monitor: + mock_instance = power_monitor.return_value + mock_instance.get_power.return_value = { + 0: 300.0, + 1: 310.0, + 2: 320.0, + } + yield mock_instance + + @pytest.fixture def mock_histogram(): """Fixture to mock Prometheus Histogram creation. @@ -41,9 +54,16 @@ def mock_histogram(): Mocks the Histogram functionality to avoid real Prometheus interactions and to validate histogram-related method calls. """ + with patch("zeus.metric.Histogram", autospec=True) as histogram: + yield histogram + - with patch("zeus.metric.Histogram", autospec=True) as MockHistogram: - yield MockHistogram +@pytest.fixture +def mock_gauge(): + """Fixture to mock Prometheus Gauge creation.""" + with patch("zeus.metric.Gauge", autospec=True) as gauge: + gauge.side_effect = lambda *args, **kwargs: MagicMock() + yield gauge def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): @@ -57,7 +77,6 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. """ - cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -69,15 +88,15 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): job="test_energy_histogram", ) - for gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): + for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): gpu_histogram.labels = MagicMock(return_value=gpu_histogram) gpu_histogram.observe = MagicMock() - for cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): + for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): cpu_histogram.labels = MagicMock(return_value=cpu_histogram) cpu_histogram.observe = MagicMock() - for dram_index, dram_histogram in histogram_metric.dram_histograms.items(): + for _dram_index, dram_histogram in histogram_metric.dram_histograms.items(): dram_histogram.labels = MagicMock(return_value=dram_histogram) dram_histogram.observe = MagicMock() @@ -134,7 +153,6 @@ def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. 
""" - cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -179,36 +197,11 @@ def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) -@pytest.fixture -def mock_power_monitor(): - """Fixture to mock PowerMonitor.""" - - with patch("zeus.metric.PowerMonitor", autospec=True) as MockPowerMonitor: - mock_instance = MockPowerMonitor.return_value - mock_instance.get_power.return_value = { - 0: 300.0, - 1: 310.0, - 2: 320.0, - } - yield mock_instance - - -@pytest.fixture -def mock_gauge(): - """Fixture to mock Prometheus Gauge creation.""" - - with patch("zeus.metric.Gauge", autospec=True) as MockGauge: - MockGauge.side_effect = lambda *args, **kwargs: MagicMock() - yield MockGauge - - -@patch("prometheus_client.push_to_gateway") @patch("zeus.device.gpu.get_gpus") def test_power_gauge( mock_get_gpus: MagicMock, mock_power_monitor: MagicMock, mock_gauge: MagicMock, - mock_push_to_gateway, ) -> None: """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. @@ -217,10 +210,8 @@ def test_power_gauge( mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. mock_gauge (MagicMock): Mocked Prometheus Gauge creation. """ - gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" - mock_push_to_gateway.return_value = None # Mock `get_gpus` to simulate available GPUs mock_get_gpus.return_value = MagicMock() @@ -234,7 +225,7 @@ def test_power_gauge( prometheus_url=prometheus_url, job="test_power_gauge", ) - for gpu_index, gauge in power_gauge.gpu_gauges.items(): + for _gpu_index, gauge in power_gauge.gpu_gauges.items(): gauge.labels = MagicMock(return_value=gauge) gauge.set = MagicMock() diff --git a/zeus/metric.py b/zeus/metric.py index 586ca688..f500815e 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -1,3 +1,5 @@ +"""Track and export energy and power metrics via Prometheus.""" + from __future__ import annotations import abc @@ -20,8 +22,7 @@ class Metric(abc.ABC): - """ - Abstract base class for all metric types in Zeus. + """Abstract base class for all metric types in Zeus. Defines a common interface for metrics, ensuring consistent behavior for `begin_window` and `end_window` operations. @@ -59,12 +60,11 @@ def __init__( gpu_indices: list, prometheus_url: str, job: str, - gpu_bucket_range: list[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], - cpu_bucket_range: list[float] = [10.0, 20.0, 50.0, 100.0, 200.0], - dram_bucket_range: list[float] = [5.0, 10.0, 20.0, 50.0, 150.0], + gpu_bucket_range: list[float] = None, + cpu_bucket_range: list[float] = None, + dram_bucket_range: list[float] = None, ) -> None: - """ - Initialize the EnergyHistogram class. + """Initialize the EnergyHistogram class. Sets up the Prometheus Histogram metrics to track energy consumption for GPUs, CPUs, and DRAMs. The data will be collected and pushed to the Prometheus Push Gateway at regular intervals. @@ -80,6 +80,7 @@ def __init__( Defaults to [10.0, 20.0, 50.0, 100.0, 200.0]. dram_bucket_range (list[float], optional): Bucket ranges for DRAM energy histograms. Defaults to [5.0, 10.0, 20.0, 50.0, 150.0]. + Raises: ValueError: If any of the bucket ranges (GPU, CPU, DRAM) is an empty list. """ @@ -96,9 +97,9 @@ def __init__( "DRAM bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." 
) - self.gpu_bucket_range = gpu_bucket_range - self.cpu_bucket_range = cpu_bucket_range - self.dram_bucket_range = dram_bucket_range + self.gpu_bucket_range = gpu_bucket_range or [50.0, 100.0, 200.0, 500.0, 1000.0] + self.cpu_bucket_range = cpu_bucket_range or [10.0, 20.0, 50.0, 100.0, 200.0] + self.dram_bucket_range = dram_bucket_range or [5.0, 10.0, 20.0, 50.0, 150.0] self.cpu_indices = cpu_indices self.gpu_indices = gpu_indices self.prometheus_url = prometheus_url @@ -150,8 +151,7 @@ def __init__( ) def begin_window(self, name: str) -> None: - """ - Begin the energy monitoring window. + """Begin the energy monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. @@ -161,8 +161,7 @@ def begin_window(self, name: str) -> None: ) def end_window(self, name: str) -> None: - """ - End the current energy monitoring window and record the energy data. + """End the current energy monitoring window and record the energy data. Retrieves the energy consumption data (for GPUs, CPUs, and DRAMs) for the monitoring window and updates the corresponding Histogram metrics. The data is then pushed to the Prometheus Push Gateway. @@ -187,7 +186,8 @@ def end_window(self, name: str) -> None: ).observe(gpu_energy) if gpu_energy > self.max_gpu_bucket: warnings.warn( - f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}" + f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}", + stacklevel=1, ) if measurement.cpu_energy: @@ -198,7 +198,8 @@ def end_window(self, name: str) -> None: ).observe(cpu_energy) if cpu_energy > self.max_cpu_bucket: warnings.warn( - f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}" + f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}", + stacklevel=1, ) if measurement.dram_energy: @@ -209,15 +210,15 @@ def end_window(self, name: str) -> None: ).observe(dram_energy) if dram_energy > self.max_dram_bucket: warnings.warn( - f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}" + f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}", + stacklevel=1, ) push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) class EnergyCumulativeCounter(Metric): - """ - EnergyCumulativeCounter class to monitor and record cumulative energy consumption. + """EnergyCumulativeCounter class to monitor and record cumulative energy consumption. This class tracks GPU, CPU, and DRAM energy usage over time, and records the data as Prometheus Counter metrics. The energy consumption metrics are periodically updated and pushed to a Prometheus Push Gateway for monitoring and analysis. @@ -245,8 +246,7 @@ def __init__( prometheus_url: str, job: str, ) -> None: - """ - Initialize the EnergyCumulativeCounter. + """Initialize the EnergyCumulativeCounter. Args: cpu_indices (list): List of CPU indices to monitor. @@ -267,8 +267,7 @@ def __init__( self.proc = None def begin_window(self, name: str) -> None: - """ - Begin the energy monitoring window. + """Begin the energy monitoring window. Starts a new multiprocessing process that monitors energy usage periodically and pushes the results to the Prometheus Push Gateway. 
@@ -295,13 +294,11 @@ def begin_window(self, name: str) -> None: raise RuntimeError(f"Failed to start monitoring process for {name}.") def end_window(self, name: str) -> None: - """ - End the energy monitoring window. + """End the energy monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - if not hasattr(self, "queue") or self.queue is None: raise RuntimeError( "EnergyCumulativeCounter's 'queue' is not initialized. " @@ -310,7 +307,6 @@ def end_window(self, name: str) -> None: self.queue.put("stop") self.proc.join(timeout=20) if self.proc.is_alive(): - warnings.warn(f"Forcefully terminating monitoring process for {name}.") self.proc.terminate() @@ -323,10 +319,7 @@ def energy_monitoring_loop( prometheus_url: str, job: str, ) -> None: - """ - This function runs in a separate process to collect and update energy consumption metrics - (for GPUs, CPUs, and DRAM) at regular intervals. It utilizes the Zeus energy monitoring - framework and pushes the collected data to the Prometheus Push Gateway for real-time tracking. + """Runs in a separate process to collect and update energy consumption metrics (for GPUs, CPUs, and DRAM). Args: name (str): The user-defined name of the monitoring window (used as a label for Prometheus metrics). @@ -406,8 +399,7 @@ def energy_monitoring_loop( class PowerGauge(Metric): - """ - PowerGauge class to monitor and record power consumption. + """PowerGauge class to monitor and record power consumption. This class tracks GPU power usage in real time and records it as **Prometheus Gauge** metrics. The Gauge metric type is suitable for tracking values that can go up and down over time, like power consumption. @@ -431,8 +423,7 @@ def __init__( prometheus_url: str, job: str, ) -> None: - """ - Initialize the PowerGauge metric. + """Initialize the PowerGauge metric. Args: gpu_indices (list[int]): List of GPU indices to monitor for power consumption. @@ -447,8 +438,7 @@ def __init__( self.gpu_gauges = {} def begin_window(self, name: str) -> None: - """ - Begin the power monitoring window. + """Begin the power monitoring window. Starts a new multiprocessing process that runs the power monitoring loop. The process collects real-time power consumption data and updates the corresponding @@ -474,17 +464,14 @@ def begin_window(self, name: str) -> None: time.sleep(5) def end_window(self, name: str) -> None: - """ - End the power monitoring window. + """End the power monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ - self.queue.put("stop") self.proc.join(timeout=20) if self.proc.is_alive(): - warnings.warn(f"Forcefully terminating monitoring process for {name}.") self.proc.terminate() @@ -496,11 +483,7 @@ def power_monitoring_loop( prometheus_url: str, job: str, ) -> None: - """ - The polling function for power monitoring that runs in a separate process. - - It periodically collects power consumption data for each GPU and pushes the results - to the Prometheus Push Gateway. + """Runs in a separate process and periodically collects power consumption data for each GPU and pushes the results to the Prometheus Push Gateway. Args: name (str): Unique name for the monitoring window (used as a label in Prometheus metrics). 
From 52f2fa70cfdabeb4bd23fcbc00e8db1b302d9ee6 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 01:50:59 -0500 Subject: [PATCH 21/57] Fix formatting issues detected by black --- tests/test_metric.py | 162 ++++++++++++++++++++++--------------------- zeus/metric.py | 30 ++++---- 2 files changed, 101 insertions(+), 91 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index a9413f0d..d368629d 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -87,63 +87,65 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): prometheus_url=prometheus_url, job="test_energy_histogram", ) + if histogram_metric.gpu_histograms: + for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): + gpu_histogram.labels = MagicMock(return_value=gpu_histogram) + gpu_histogram.observe = MagicMock() - for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): - gpu_histogram.labels = MagicMock(return_value=gpu_histogram) - gpu_histogram.observe = MagicMock() + if histogram_metric.cpu_histograms: + for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): + cpu_histogram.labels = MagicMock(return_value=cpu_histogram) + cpu_histogram.observe = MagicMock() - for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): - cpu_histogram.labels = MagicMock(return_value=cpu_histogram) - cpu_histogram.observe = MagicMock() - - for _dram_index, dram_histogram in histogram_metric.dram_histograms.items(): - dram_histogram.labels = MagicMock(return_value=dram_histogram) - dram_histogram.observe = MagicMock() + if histogram_metric.dram_histogram: + for _dram_index, dram_histogram in histogram_metric.dram_histograms.items(): + dram_histogram.labels = MagicMock(return_value=dram_histogram) + dram_histogram.observe = MagicMock() histogram_metric.begin_window("test_window") histogram_metric.end_window("test_window") # Assert GPU histograms were observed - for ( - gpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - calls = [ - call[0][0] - for call in histogram_metric.gpu_histograms[ - gpu_index - ].observe.call_args_list - ] - print(f"Observed calls for GPU {gpu_index}: {calls}") - assert energy in calls, f"Expected {energy} in {calls}" + if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: + for ( + gpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.gpu_histograms[ + gpu_index + ].observe.call_args_list + ] + assert energy in calls, f"Expected {energy} in {calls}" # Assert CPU histograms were observed - for ( - cpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - calls = [ - call[0][0] - for call in histogram_metric.cpu_histograms[ - cpu_index - ].observe.call_args_list - ] - print(f"Observed CPU calls for CPU {cpu_index}: {calls}") - assert energy in calls, f"Expected CPU energy {energy} in {calls}" + if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy: + for ( + cpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.cpu_histograms[ + cpu_index + ].observe.call_args_list + ] + assert energy in calls, f"Expected CPU energy {energy} in {calls}" # Assert DRAM histograms were observed - for ( - dram_index, - energy, - ) in 
mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): - calls = [ - call[0][0] - for call in histogram_metric.dram_histograms[ - dram_index - ].observe.call_args_list - ] - print(f"Observed DRAM calls for CPU {dram_index}: {calls}") - assert energy in calls, f"Expected DRAM energy {energy} in {calls}" + if mock_zeus_monitor.return_value.end_window.return_value.dram_energy: + for ( + dram_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): + calls = [ + call[0][0] + for call in histogram_metric.dram_histograms[ + dram_index + ].observe.call_args_list + ] + assert energy in calls, f"Expected DRAM energy {energy} in {calls}" def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): @@ -177,24 +179,26 @@ def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): cumulative_counter.end_window("test_counter") # Assert GPU counters - for ( - gpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - assert ( - gpu_index in cumulative_counter.gpu_counters - ), f"GPU counter for index {gpu_index} not initialized" - cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy) + if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: + for ( + gpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + assert ( + gpu_index in cumulative_counter.gpu_counters + ), f"GPU counter for index {gpu_index} not initialized" + cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy) # Assert CPU counters - for ( - cpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - assert ( - cpu_index in cumulative_counter.cpu_counters - ), f"CPU counter for index {cpu_index} not initialized" - cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) + if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy: + for ( + cpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + assert ( + cpu_index in cumulative_counter.cpu_counters + ), f"CPU counter for index {cpu_index} not initialized" + cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) @patch("zeus.device.gpu.get_gpus") @@ -225,24 +229,26 @@ def test_power_gauge( prometheus_url=prometheus_url, job="test_power_gauge", ) - for _gpu_index, gauge in power_gauge.gpu_gauges.items(): - gauge.labels = MagicMock(return_value=gauge) - gauge.set = MagicMock() + if power_gauge.gpu_gauges: + for _gpu_index, gauge in power_gauge.gpu_gauges.items(): + gauge.labels = MagicMock(return_value=gauge) + gauge.set = MagicMock() power_gauge.begin_window("test_power_window") power_gauge.end_window("test_power_window") # Assert that the gauges were set with the correct power values - for ( - gpu_index, - power_value, - ) in mock_power_monitor.return_value.get_power.return_value.items(): - try: - # Check if `labels` was called with the correct arguments - power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( - gpu_index=gpu_index, window="test_power_window" - ) - power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value) - except AssertionError as e: - print(f"AssertionError for GPU {gpu_index}:") - raise e + if mock_power_monitor.return_value.get_power.return_value: + for ( + gpu_index, + power_value, + ) in mock_power_monitor.return_value.get_power.return_value.items(): + try: + # Check if 
`labels` was called with the correct arguments + power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( + gpu_index=gpu_index, window="test_power_window" + ) + power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value) + except AssertionError as e: + print(f"AssertionError for GPU {gpu_index}:") + raise e diff --git a/zeus/metric.py b/zeus/metric.py index f500815e..311a3b89 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -60,9 +60,9 @@ def __init__( gpu_indices: list, prometheus_url: str, job: str, - gpu_bucket_range: list[float] = None, - cpu_bucket_range: list[float] = None, - dram_bucket_range: list[float] = None, + gpu_bucket_range: list[float] | None, + cpu_bucket_range: list[float] | None, + dram_bucket_range: list[float] | None, ) -> None: """Initialize the EnergyHistogram class. @@ -305,9 +305,10 @@ def end_window(self, name: str) -> None: "Make sure 'begin_window' is called before 'end_window'." ) self.queue.put("stop") - self.proc.join(timeout=20) - if self.proc.is_alive(): - self.proc.terminate() + if self.proc is not None: + self.proc.join(timeout=20) + if self.proc.is_alive(): + self.proc.terminate() def energy_monitoring_loop( @@ -470,9 +471,11 @@ def end_window(self, name: str) -> None: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. """ self.queue.put("stop") - self.proc.join(timeout=20) - if self.proc.is_alive(): - self.proc.terminate() + if self.proc is not None: + self.proc.join(timeout=20) + if self.proc.is_alive(): + warnings.warn(f"Forcefully terminating monitoring process for {name}.", stacklevel=2) + self.proc.terminate() def power_monitoring_loop( @@ -512,10 +515,11 @@ def power_monitoring_loop( power_measurement = power_monitor.get_power() try: - for gpu_index, power_value in power_measurement.items(): - gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( - power_value - ) + if power_measurement: + for gpu_index, power_value in power_measurement.items(): + gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( + power_value + ) except Exception as e: print(f"Error during processing power measurement: {e}") From 30b807e5a88f1e047768225555a1ee6174bae5f2 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 01:54:21 -0500 Subject: [PATCH 22/57] Fix formatting issues detected by black --- tests/test_metric.py | 4 +++- zeus/metric.py | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index d368629d..04f083ab 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -248,7 +248,9 @@ def test_power_gauge( power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( gpu_index=gpu_index, window="test_power_window" ) - power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value) + power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with( + power_value + ) except AssertionError as e: print(f"AssertionError for GPU {gpu_index}:") raise e diff --git a/zeus/metric.py b/zeus/metric.py index 311a3b89..5ce52762 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -474,7 +474,10 @@ def end_window(self, name: str) -> None: if self.proc is not None: self.proc.join(timeout=20) if self.proc.is_alive(): - warnings.warn(f"Forcefully terminating monitoring process for {name}.", stacklevel=2) + warnings.warn( + f"Forcefully terminating monitoring process for {name}.", + stacklevel=2, + ) self.proc.terminate() @@ -517,9 +520,9 @@ def 
power_monitoring_loop( try: if power_measurement: for gpu_index, power_value in power_measurement.items(): - gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set( - power_value - ) + gpu_gauges[gpu_index].labels( + gpu_index=f"{name}_gpu{gpu_index}" + ).set(power_value) except Exception as e: print(f"Error during processing power measurement: {e}") From c53c72e0dc974b0d390ad1b0225d0abb309f733c Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 02:03:31 -0500 Subject: [PATCH 23/57] Resolve unbound variable errors --- zeus/metric.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 5ce52762..fc6ec6b6 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -333,9 +333,12 @@ def energy_monitoring_loop( """ registry = CollectorRegistry() energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) + + gpu_counters = {} + cpu_counters = {} + dram_counters = {} if energy_monitor.gpu_indices: - gpu_counters = {} for gpu_index in energy_monitor.gpu_indices: gpu_counters[gpu_index] = Counter( f"energy_monitor_gpu_{gpu_index}_energy_joules", @@ -345,7 +348,6 @@ def energy_monitoring_loop( ) if energy_monitor.cpu_indices: - cpu_counters = {} for cpu_index in energy_monitor.cpu_indices: cpu_counters[cpu_index] = Counter( f"energy_monitor_cpu_{cpu_index}_energy_joules", @@ -353,7 +355,6 @@ def energy_monitoring_loop( ["window", "index"], registry=registry, ) - dram_counters = {} for i, cpu in enumerate(get_cpus().cpus): if cpu.supportsGetDramEnergyConsumption(): dram_counters[i] = Counter( @@ -377,21 +378,21 @@ def energy_monitoring_loop( if measurement.gpu_energy: for gpu_index, energy in measurement.gpu_energy.items(): - if gpu_index in gpu_counters: + if gpu_counters and gpu_index in gpu_counters: gpu_counters[gpu_index].labels(window=name, index=gpu_index).inc( energy ) if measurement.cpu_energy: for cpu_index, energy in measurement.cpu_energy.items(): - if cpu_index in cpu_counters: + if cpu_counters and cpu_index in cpu_counters: cpu_counters[cpu_index].labels(window=name, index=cpu_index).inc( energy ) if measurement.dram_energy: for dram_index, energy in measurement.dram_energy.items(): - if dram_index in dram_counters: + if dram_counters and dram_index in dram_counters: dram_counters[dram_index].labels(window=name, index=dram_index).inc( energy ) @@ -474,10 +475,6 @@ def end_window(self, name: str) -> None: if self.proc is not None: self.proc.join(timeout=20) if self.proc.is_alive(): - warnings.warn( - f"Forcefully terminating monitoring process for {name}.", - stacklevel=2, - ) self.proc.terminate() From ffa46d152e56fd098ca2981367d49147973748de Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 02:04:11 -0500 Subject: [PATCH 24/57] Resolve unbound variable errors --- zeus/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeus/metric.py b/zeus/metric.py index fc6ec6b6..91edfc90 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -333,7 +333,7 @@ def energy_monitoring_loop( """ registry = CollectorRegistry() energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) - + gpu_counters = {} cpu_counters = {} dram_counters = {} From 5f67d5c4e243355e87151a202e813161b9db4405 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 02:23:03 -0500 Subject: [PATCH 25/57] Specify type for the args --- tests/test_metric.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_metric.py 
b/tests/test_metric.py index 04f083ab..b4b246e9 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -66,7 +66,11 @@ def mock_gauge(): yield gauge -def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): +def test_energy_histogram( + mock_get_cpus: MagicMock, + mock_zeus_monitor: MagicMock, + mock_histogram: MagicMock +) -> None: """Test EnergyHistogram class. Validates that GPU, CPU, and DRAM histograms are properly initialized, @@ -148,7 +152,10 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram): assert energy in calls, f"Expected DRAM energy {energy} in {calls}" -def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor): +def test_energy_cumulative_counter( + mock_get_cpus: MagicMock, + mock_zeus_monitor: MagicMock +) -> None: """Test EnergyCumulativeCounter with mocked ZeusMonitor. Args: From b96b32eac8d81a2dd8a3b54b4cc2815c5ef369ef Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 02:24:08 -0500 Subject: [PATCH 26/57] Fix formatting issues detected by black --- tests/test_metric.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index b4b246e9..afca7469 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -67,9 +67,7 @@ def mock_gauge(): def test_energy_histogram( - mock_get_cpus: MagicMock, - mock_zeus_monitor: MagicMock, - mock_histogram: MagicMock + mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock, mock_histogram: MagicMock ) -> None: """Test EnergyHistogram class. @@ -153,8 +151,7 @@ def test_energy_histogram( def test_energy_cumulative_counter( - mock_get_cpus: MagicMock, - mock_zeus_monitor: MagicMock + mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock ) -> None: """Test EnergyCumulativeCounter with mocked ZeusMonitor. 
From 9aa1888c390a9d4530aa34b0ed3a2e994798c28e Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Sat, 30 Nov 2024 17:21:13 -0500 Subject: [PATCH 27/57] Merge master --- docs/measure/index.md | 20 ++ .../profile_p2p.py | 2 +- examples/power_limit_optimizer/README.md | 2 +- pyproject.toml | 9 +- zeus/device/gpu/amd.py | 123 +++++++-- zeus/device/gpu/common.py | 5 + zeus/device/gpu/nvidia.py | 10 + zeus/monitor/energy.py | 1 + .../pipeline_frequency/server/router.py | 2 +- zeusd/Cargo.toml | 4 +- zeusd/scripts/lint.sh | 0 zeusd/src/devices/cpu/linux.rs | 248 ++++++++++++++++++ zeusd/src/devices/cpu/macos.rs | 59 +++++ zeusd/src/devices/cpu/mod.rs | 189 +++++++++++++ zeusd/src/devices/gpu/linux.rs | 16 +- zeusd/src/devices/gpu/mod.rs | 2 +- zeusd/src/devices/mod.rs | 1 + zeusd/src/error.rs | 12 + zeusd/src/main.rs | 24 +- zeusd/src/routes/cpu.rs | 53 ++++ zeusd/src/routes/gpu.rs | 8 +- zeusd/src/routes/mod.rs | 2 + zeusd/src/startup.rs | 32 ++- zeusd/tests/cpu.rs | 156 +++++++++++ zeusd/tests/helpers/mod.rs | 145 +++++++++- 25 files changed, 1059 insertions(+), 66 deletions(-) mode change 100644 => 100755 zeusd/scripts/lint.sh create mode 100644 zeusd/src/devices/cpu/linux.rs create mode 100644 zeusd/src/devices/cpu/macos.rs create mode 100644 zeusd/src/devices/cpu/mod.rs create mode 100644 zeusd/src/routes/cpu.rs create mode 100644 zeusd/tests/cpu.rs diff --git a/docs/measure/index.md b/docs/measure/index.md index f3fef874..59acc265 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -379,3 +379,23 @@ Total time (s): 4.421529293060303 Total energy (J): {'GPU0': 198.52566362297537, 'GPU1': 206.22215216255188, 'GPU2': 201.08565518283845, 'GPU3': 201.79834523367884} ``` + +## Hardware Support +We currently support both NVIDIA (via NVML) and AMD GPUs (via AMDSMI, with ROCm 6.1 or later). + +### `get_gpus` +The [`get_gpus`][zeus.device.get_gpus] function returns a [`GPUs`][zeus.device.gpu.GPUs] object, which can be either an [`NVIDIAGPUs`][zeus.device.gpu.NVIDIAGPUs] or [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object depending on the availability of `nvml` or `amdsmi`. Each [`GPUs`][zeus.device.gpu.GPUs] object contains one or more [`GPU`][zeus.device.gpu.common.GPU] instances, which are specifically [`NVIDIAGPU`][zeus.device.gpu.nvidia.NVIDIAGPU] or [`AMDGPU`][zeus.device.gpu.amd.AMDGPU] objects. + +These [`GPU`][zeus.device.gpu.common.GPU] objects directly call respective `nvml` or `amdsmi` methods, providing a one-to-one mapping of methods for seamless GPU abstraction and support for multiple GPU types. For example: +- [`NVIDIAGPU.getName`][zeus.device.gpu.nvidia.NVIDIAGPU.getName] calls `pynvml.nvmlDeviceGetName`. +- [`AMDGPU.getName`][zeus.device.gpu.amd.AMDGPU.getName] calls `amdsmi.amdsmi_get_gpu_asic_info`. + +### Notes on AMD GPUs + +#### AMD GPUs Initialization +`amdsmi.amdsmi_get_energy_count` sometimes returns invalid values on certain GPUs or ROCm versions (e.g., MI100 on ROCm 6.2). See [ROCm issue #38](https://github.com/ROCm/amdsmi/issues/38) for more details. During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, we call `amdsmi.amdsmi_get_energy_count` twice for each GPU, with a 0.5-second delay between calls. This difference is compared to power measurements to determine if `amdsmi.amdsmi_get_energy_count` is stable and reliable. Initialization takes 0.5 seconds regardless of the number of AMD GPUs. 
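Restated as a small sketch, the reliability check amounts to comparing the counter delta against power times elapsed time within a loose band. The reader callables below are hypothetical wrappers around the corresponding amdsmi calls, and the sketch assumes a non-zero power reading.

```python
import time

def counter_is_reliable(read_energy_mj, read_power_mw, wait_s: float = 0.5) -> bool:
    """Hypothetical sketch: trust the energy counter only if its delta roughly matches power * time."""
    power = read_power_mw()                 # mW, i.e. mJ per second
    start = read_energy_mj()
    time.sleep(wait_s)
    measured = read_energy_mj() - start     # mJ
    expected = power * wait_s               # mJ
    return 0.1 < measured / expected < 10   # loose bound to rule out obvious counter problems
```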
+ +`amdsmi.amdsmi_get_power_info` provides "average_socket_power" and "current_socket_power" fields, but the "current_socket_power" field is sometimes not supported and returns "N/A." During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, this method is checked, and if "N/A" is returned, the [`AMDGPU.getInstantPowerUsage`][zeus.device.gpu.amd.AMDGPU.getInstantPowerUsage] method is disabled. Instead, [`AMDGPU.getAveragePowerUsage`][zeus.device.gpu.amd.AMDGPU.getAveragePowerUsage] needs to be used. + +#### Supported AMD SMI Versions +Only ROCm >= 6.1 is supported, as the AMDSMI APIs for power and energy return wrong values. For more information, see [ROCm issue #22](https://github.com/ROCm/amdsmi/issues/22). Ensure your `amdsmi` and ROCm versions are up to date. diff --git a/examples/pipeline_frequency_optimizer/profile_p2p.py b/examples/pipeline_frequency_optimizer/profile_p2p.py index 2b688db4..26b68895 100644 --- a/examples/pipeline_frequency_optimizer/profile_p2p.py +++ b/examples/pipeline_frequency_optimizer/profile_p2p.py @@ -1,4 +1,4 @@ -"""Profile the power cosumtion of the GPU while waiting on P2P communication.""" +"""Profile the power consumption of the GPU while waiting on P2P communication.""" import os import time diff --git a/examples/power_limit_optimizer/README.md b/examples/power_limit_optimizer/README.md index 1c1af4a1..ea227ed0 100644 --- a/examples/power_limit_optimizer/README.md +++ b/examples/power_limit_optimizer/README.md @@ -7,7 +7,7 @@ The former script is for simple single GPU training, whereas the latter is for d ## Dependencies -All packages (including torchvision) are pre-installed if you're using our [Docker image](https://ml.energy/zeus/getting_started/environment/). +All packages (including torchvision) are pre-installed if you're using our [Docker image](https://ml.energy/zeus/getting_started/#using-docker). You just need to download and extract the ImageNet data and mount it to the Docker container with the `-v` option (first step below). 1. Download the ILSVRC2012 dataset from [the ImageNet homepage](http://www.image-net.org/). diff --git a/pyproject.toml b/pyproject.toml index 41c317b2..f7282097 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "pydantic", # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases. "rich", "tyro", - "httpx" + "httpx", + "amdsmi" ] dynamic = ["version"] @@ -41,12 +42,12 @@ Documentation = "https://ml.energy/zeus" [project.optional-dependencies] # One day FastAPI will drop support for Pydantic V1. Then fastapi has to be pinned as well. 
pfo = ["pydantic<2"] -pfo-server = ["fastapi[all]", "pydantic<2", "lowtime", "aiofiles", "torch"] +pfo-server = ["fastapi[standard]", "pydantic<2", "lowtime", "aiofiles", "torch"] bso = ["pydantic<2"] -bso-server = ["fastapi[all]", "sqlalchemy", "pydantic<2", "python-dotenv"] +bso-server = ["fastapi[standard]", "sqlalchemy", "pydantic<2", "python-dotenv"] migration = ["alembic", "sqlalchemy", "pydantic<2", "python-dotenv"] lint = ["ruff", "black==22.6.0", "pyright", "pandas-stubs", "transformers"] -test = ["fastapi[all]", "sqlalchemy", "pydantic<2", "pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1", "anyio==3.7.1", "aiosqlite==0.20.0", "numpy<2"] +test = ["fastapi[standard]", "sqlalchemy", "pydantic<2", "pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1", "anyio==3.7.1", "aiosqlite==0.20.0", "numpy<2"] docs = ["mkdocs-material[imaging]==9.5.19", "mkdocstrings[python]==0.25.0", "mkdocs-gen-files==0.5.0", "mkdocs-literate-nav==0.6.1", "mkdocs-section-index==0.3.9", "mkdocs-redirects==1.2.1", "urllib3<2", "black"] prometheus = ["prometheus-client"] # greenlet is for supporting apple mac silicon for sqlalchemy(https://docs.sqlalchemy.org/en/20/faq/installation.html) diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py index 34299e7d..8024420c 100644 --- a/zeus/device/gpu/amd.py +++ b/zeus/device/gpu/amd.py @@ -4,12 +4,15 @@ import functools import os import contextlib +import time from typing import Sequence from functools import lru_cache try: import amdsmi # type: ignore -except ImportError: +# must catch all exceptions, since ImportError is not the only exception that can be raised (ex. OSError on version mismatch). +# Specific exceptions are handled when import and initialization are retested in `amdsmi_is_available` +except Exception: class MockAMDSMI: """Mock class for AMD SMI library.""" @@ -41,6 +44,18 @@ def amdsmi_is_available() -> bool: except ImportError: logger.info("amdsmi is not available.") return False + # usually thrown if amdsmi can't find libamd_smi.so + except OSError: + if os.getenv("ROCM_PATH") is None: + logger.warning("`ROCM_PATH` is not set. Do you have ROCm installed?") + return False + # usually thrown if versions of amdsmi and ROCm are incompatible. + except AttributeError: + logger.warning( + "Failed to import amdsmi. " + "Ensure amdsmi's version is at least as high as the current ROCm version." + ) + return False try: amdsmi.amdsmi_init() logger.info("amdsmi is available and initialized") @@ -71,10 +86,10 @@ def __init__(self, gpu_index: int) -> None: """Initialize the GPU object.""" super().__init__(gpu_index) self._get_handle() - # XXX(Jae-Won): Right now, the energy API's unit is broken (either the - # `power` field or the `counter_resolution` field). Before that, we're - # disabling the energy API. - self._supportsGetTotalEnergyConsumption = False + + # These values are updated in AMDGPUs constructor + self._supportsGetTotalEnergyConsumption = True + self._supportsInstantPowerUsage = True _exception_map = { 1: gpu_common.ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL @@ -225,12 +240,28 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: clk_type=amdsmi.AmdSmiClkType.GFX, ) # expects MHz + @_handle_amdsmi_errors + def getAveragePowerUsage(self) -> int: + """Return the average power draw of the GPU. 
Units: mW.""" + # returns in W, convert to mW + return ( + int(amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"]) + * 1000 + ) + @_handle_amdsmi_errors def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. Units: mW.""" + if not self._supportsInstantPowerUsage: + raise gpu_common.ZeusGPUNotSupportedError( + "Instant power usage is not supported on this AMD GPU. " + "This is because amdsmi.amdsmi_get_power_info does not return a valid 'current_socket_power'. " + "Please use `getAveragePowerUsage` instead." + ) # returns in W, convert to mW - return int( - amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000 + return ( + int(amdsmi.amdsmi_get_power_info(self.handle)["current_socket_power"]) + * 1000 ) @_handle_amdsmi_errors @@ -242,28 +273,28 @@ def getAverageMemoryPowerUsage(self) -> int: @_handle_amdsmi_errors def supportsGetTotalEnergyConsumption(self) -> bool: - """Check if the GPU supports retrieving total energy consumption.""" - if self._supportsGetTotalEnergyConsumption is None: - try: - _ = amdsmi.amdsmi_get_energy_count(self.handle) - self._supportsGetTotalEnergyConsumption = True - except amdsmi.AmdSmiLibraryException as e: - if ( - e.get_error_code() == 2 - ): # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED - self._supportsGetTotalEnergyConsumption = False - else: - raise e - + """Check if the GPU supports retrieving total energy consumption. Returns a future object of the result.""" return self._supportsGetTotalEnergyConsumption @_handle_amdsmi_errors def getTotalEnergyConsumption(self) -> int: """Return the total energy consumption of the GPU since driver load. Units: mJ.""" - info = amdsmi.amdsmi_get_energy_count(self.handle) - return int( - info["power"] / 1e3 - ) # returns in micro Joules, convert to mili Joules + if not self._supportsGetTotalEnergyConsumption: + raise gpu_common.ZeusGPUNotSupportedError( + "Total energy consumption is not supported on this AMD GPU. " + "This is because the result of `amdsmi.amdsmi_get_energy_count` is not accurate. " + "Please use `getAveragePowerUsage` or `getInstantPowerUsage` to calculate energy usage." + ) + energy_dict = amdsmi.amdsmi_get_energy_count(self.handle) + if "energy_accumulator" in energy_dict: # Changed since amdsmi 6.2.1 + energy = ( + energy_dict["energy_accumulator"] * energy_dict["counter_resolution"] + ) + else: + # Old API: assume has key "power". If not, exception will be handled by _handle_amdsmi_errors. 
+ energy = energy_dict["power"] * energy_dict["counter_resolution"] + + return int(energy / 1e3) # returns in micro Joules, convert to mili Joules class AMDGPUs(gpu_common.GPUs): @@ -292,11 +323,11 @@ def __init__(self, ensure_homogeneous: bool = False) -> None: self._init_gpus() if ensure_homogeneous: self._ensure_homogeneous() - except amdsmi.AmdSmiException as e: + except amdsmi.AmdSmiLibraryException as e: exception_class = AMDGPU._exception_map.get( - e.value, gpu_common.ZeusBaseGPUError + e.get_error_code(), gpu_common.ZeusBaseGPUError ) - raise exception_class(e.msg) from e + raise exception_class(e.get_error_info()) from e @property def gpus(self) -> Sequence[AMDGPU]: @@ -318,8 +349,46 @@ def _init_gpus(self) -> None: else: visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles()))) + # create the number of visible GPUs self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices] + # set _supportsInstantPowerUsage for all GPUs + for gpu in self._gpus: + gpu._supportsInstantPowerUsage = isinstance( + amdsmi.amdsmi_get_power_info(gpu.handle)["current_socket_power"], + int, + ) # amdsmi.amdsmi_get_power_info["current_socket_power"] returns "N/A" if not supported + + # set _supportsGetTotalEnergyConsumption for all GPUs + wait_time = 0.5 # seconds + powers = [gpu.getAveragePowerUsage() for gpu in self._gpus] + initial_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus] + time.sleep(wait_time) + final_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus] + measured_energies = [ + final - initial for final, initial in zip(final_energies, initial_energies) + ] + expected_energies = [ + power * wait_time for power in powers + ] # energy = power * time + + for gpu, measured_energy, expected_energy in zip( + self._gpus, measured_energies, expected_energies + ): + # Loose bound to rule out very obvious counter problems + if 0.1 < measured_energy / expected_energy < 10: + gpu._supportsGetTotalEnergyConsumption = True + else: + gpu._supportsGetTotalEnergyConsumption = False + logger.info( + "Disabling `getTotalEnergyConsumption` for device %d. The result of `amdsmi.amdsmi_get_energy_count` is not accurate. Expected energy: %d mJ, Measured energy: %d mJ. " + "This is a known issue with some AMD GPUs, please see https://github.com/ROCm/amdsmi/issues/38 for more information. " + "You can still measure energy by polling either `getInstantPowerUsage` or `getAveragePowerUsage` and integrating over time.", + gpu.gpu_index, + expected_energy, + measured_energy, + ) + def __del__(self) -> None: """Shut down AMDSMI.""" with contextlib.suppress(amdsmi.AmdSmiException): diff --git a/zeus/device/gpu/common.py b/zeus/device/gpu/common.py index 87a5c28a..89db5871 100644 --- a/zeus/device/gpu/common.py +++ b/zeus/device/gpu/common.py @@ -96,6 +96,11 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: """Reset the locked GPU clocks to the default.""" pass + @abc.abstractmethod + def getAveragePowerUsage(self) -> int: + """Return the average power usage of the GPU. Units: mW.""" + pass + @abc.abstractmethod def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. 
Units: mW.""" diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py index 134696de..82500d11 100644 --- a/zeus/device/gpu/nvidia.py +++ b/zeus/device/gpu/nvidia.py @@ -189,6 +189,16 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: """Reset the locked GPU clocks to the default.""" pynvml.nvmlDeviceResetGpuLockedClocks(self.handle) + @_handle_nvml_errors + def getAveragePowerUsage(self) -> int: + """Return the average power draw of the GPU. Units: mW.""" + metric = pynvml.nvmlDeviceGetFieldValues( + self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE] + )[0] + if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS: + raise pynvml.NVMLError(ret) + return metric.value.uiVal + @_handle_nvml_errors def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. Units: mW.""" diff --git a/zeus/monitor/energy.py b/zeus/monitor/energy.py index d4037f73..d485021a 100644 --- a/zeus/monitor/energy.py +++ b/zeus/monitor/energy.py @@ -194,6 +194,7 @@ def __init__( "for more information or disable CPU measurement by passing cpu_indices=[] to " "ZeusMonitor" ) from err + self.cpus = EmptyCPUs() # Resolve GPU indices. If the user did not specify `gpu_indices`, use all available GPUs. self.gpu_indices = ( diff --git a/zeus/optimizer/pipeline_frequency/server/router.py b/zeus/optimizer/pipeline_frequency/server/router.py index 75d51ba7..36b438eb 100644 --- a/zeus/optimizer/pipeline_frequency/server/router.py +++ b/zeus/optimizer/pipeline_frequency/server/router.py @@ -44,7 +44,7 @@ async def custom_route_handler(request: Request) -> Response: request.method, request.url, await request.json() if await request.body() else "None", - response.body.decode(response.charset), + bytes(response.body).decode(response.charset), ) return response diff --git a/zeusd/Cargo.toml b/zeusd/Cargo.toml index c5d66cce..7cad81df 100644 --- a/zeusd/Cargo.toml +++ b/zeusd/Cargo.toml @@ -20,7 +20,7 @@ name = "zeusd" [dependencies] nvml-wrapper = "0.10" actix-web = "4" -tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1", features = ["macros", "rt-multi-thread", "fs"] } thiserror = "1" clap = { version = "4.5.4", features = ["derive"] } serde = { version = "1", features = ["derive"] } @@ -31,8 +31,8 @@ tracing-log = "0.2.0" tracing-actix-web = "0.7.10" nix = { version = "0.29", default-features = false, features = ["user"] } paste = "1" +once_cell = "1.7.2" [dev-dependencies] -once_cell = "1.7.2" reqwest = { version = "0.11", default-features = false, features = ["json"] } serde_json = "1" diff --git a/zeusd/scripts/lint.sh b/zeusd/scripts/lint.sh old mode 100644 new mode 100755 diff --git a/zeusd/src/devices/cpu/linux.rs b/zeusd/src/devices/cpu/linux.rs new file mode 100644 index 00000000..94f38e9e --- /dev/null +++ b/zeusd/src/devices/cpu/linux.rs @@ -0,0 +1,248 @@ +//! CPU power measurement with RAPL. Only supported on Linux. + +use once_cell::sync::OnceCell; +use std::fs; +use std::io::Read; +use std::path::{Path, PathBuf}; +use std::string::String; +use std::sync::{Arc, RwLock}; +use tokio::io::AsyncReadExt; +use tokio::task::JoinHandle; +use tokio::time::{sleep, Duration}; + +use crate::devices::cpu::{CpuManager, PackageInfo}; +use crate::error::ZeusdError; + +// NOTE: To support Zeusd deployment in a docker container, this should support +// sysfs mounts under places like `/zeus_sys`. 
+static RAPL_DIR: &str = "/sys/class/powercap/intel-rapl"; + +// Assuming a maximum power draw of 1000 Watts when we are polling every 0.1 seconds, the maximum +// amount the RAPL counter would increase (1000 * 1e6 * 0.1) +static RAPL_COUNTER_MAX_INCREASE: u64 = 1000 * 100000; + +pub struct RaplCpu { + cpu: Arc, + dram: Option>, + cpu_monitoring_task: OnceCell>>, + dram_monitoring_task: OnceCell>>, +} + +impl RaplCpu { + pub fn init(_index: usize) -> Result { + let fields = RaplCpu::get_available_fields(_index)?; + Ok(Self { + cpu: fields.0, + dram: fields.1, + cpu_monitoring_task: OnceCell::new(), + dram_monitoring_task: OnceCell::new(), + }) + } +} + +impl PackageInfo { + pub fn new(base_path: &Path, index: usize) -> anyhow::Result { + let cpu_name_path = base_path.join("name"); + let cpu_energy_path = base_path.join("energy_uj"); + let cpu_max_energy_path = base_path.join("max_energy_range_uj"); + + if !cpu_name_path.exists() || !cpu_max_energy_path.exists() || !cpu_energy_path.exists() { + return Err(ZeusdError::CpuInitializationError(index)); + } + + let cpu_name = fs::read_to_string(&cpu_name_path)?.trim_end().to_string(); + // Try reding from energy_uj file + read_u64(&cpu_energy_path)?; + let cpu_max_energy = read_u64(&cpu_max_energy_path)?; + let wraparound_counter = RwLock::new(0); + Ok(PackageInfo { + index, + name: cpu_name, + energy_uj_path: cpu_energy_path, + max_energy_uj: cpu_max_energy, + num_wraparounds: wraparound_counter, + }) + } +} + +impl CpuManager for RaplCpu { + fn device_count() -> Result { + let mut index_count = 0; + let base_path = PathBuf::from(RAPL_DIR); + + match fs::read_dir(&base_path) { + Ok(entries) => { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + if let Some(dir_name_str) = path.file_name() { + let dir_name = dir_name_str.to_string_lossy(); + if dir_name.contains("intel-rapl") { + index_count += 1; + } + } + } + } + } + Err(_) => { + tracing::error!("RAPL not available"); + } + }; + Ok(index_count) + } + + fn get_available_fields( + index: usize, + ) -> Result<(Arc, Option>), ZeusdError> { + let base_path = PathBuf::from(format!("{}/intel-rapl:{}", RAPL_DIR, index)); + let cpu_info = PackageInfo::new(&base_path, index)?; + + match fs::read_dir(&base_path) { + Ok(entries) => { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + if let Some(dir_name_str) = path.file_name() { + let dir_name = dir_name_str.to_string_lossy(); + if dir_name.contains("intel-rapl") { + let subpackage_path = base_path.join(&*dir_name); + let subpackage_info = PackageInfo::new(&subpackage_path, index)?; + if subpackage_info.name == "dram" { + return Ok(( + Arc::new(cpu_info), + Some(Arc::new(subpackage_info)), + )); + } + } + } + } + } + } + Err(_) => { + return Err(ZeusdError::CpuInitializationError(index)); + } + }; + + Ok((Arc::new(cpu_info), None)) + } + + fn get_cpu_energy(&mut self) -> Result { + // Assume that RAPL counter will not wrap around twice during a request to poll energy. The + // number of wrap arounds is polled twice to handle the case where the counter wraps around + // a request. If this happens, `measurement` has to be updated as to not return an + // unexpectedly large energy value. 
+ + let handle = self + .cpu_monitoring_task + .get_or_init(|| tokio::spawn(monitor_rapl(Arc::clone(&self.cpu)))); + if handle.is_finished() { + return Err(ZeusdError::CpuManagementTaskTerminatedError(self.cpu.index)); + } + + let num_wraparounds_before = *self + .cpu + .num_wraparounds + .read() + .map_err(|_| ZeusdError::CpuManagementTaskTerminatedError(self.cpu.index))?; + let mut measurement = read_u64(&self.cpu.energy_uj_path)?; + let num_wraparounds = *self + .cpu + .num_wraparounds + .read() + .map_err(|_| ZeusdError::CpuManagementTaskTerminatedError(self.cpu.index))?; + if num_wraparounds != num_wraparounds_before { + // Wraparound has happened after measurement, take measurement again + measurement = read_u64(&self.cpu.energy_uj_path)?; + } + + Ok(measurement + num_wraparounds * self.cpu.max_energy_uj) + } + + fn get_dram_energy(&mut self) -> Result { + match &self.dram { + None => Err(ZeusdError::CpuManagementTaskTerminatedError(self.cpu.index)), + Some(dram) => { + let handle = self + .dram_monitoring_task + .get_or_init(|| tokio::spawn(monitor_rapl(Arc::clone(dram)))); + if handle.is_finished() { + return Err(ZeusdError::CpuManagementTaskTerminatedError(dram.index)); + } + + let num_wraparounds_before = *dram + .num_wraparounds + .read() + .map_err(|_| ZeusdError::CpuManagementTaskTerminatedError(dram.index))?; + let mut measurement = read_u64(&dram.energy_uj_path)?; + let num_wraparounds = *dram + .num_wraparounds + .read() + .map_err(|_| ZeusdError::CpuManagementTaskTerminatedError(dram.index))?; + if num_wraparounds != num_wraparounds_before { + // Wraparound has happened after measurement, take measurement again + measurement = read_u64(&dram.energy_uj_path)?; + } + + Ok(measurement + num_wraparounds * dram.max_energy_uj) + } + } + } + + fn stop_monitoring(&mut self) { + if let Some(handle) = self.cpu_monitoring_task.take() { + handle.abort(); + } + if let Some(handle) = self.dram_monitoring_task.take() { + handle.abort(); + } + } + + fn is_dram_available(&self) -> bool { + self.dram.is_some() + } +} + +fn read_u64(path: &PathBuf) -> anyhow::Result { + let mut file = std::fs::File::open(path)?; + let mut buf = String::new(); + file.read_to_string(&mut buf)?; + buf.trim() + .parse() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) +} + +async fn read_u64_async(path: &PathBuf) -> Result { + let mut file = tokio::fs::File::open(path).await?; + let mut buf = String::new(); + file.read_to_string(&mut buf).await?; + buf.trim() + .parse() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) +} + +async fn monitor_rapl(rapl_file: Arc) -> Result<(), ZeusdError> { + let mut last_energy_uj = read_u64_async(&rapl_file.energy_uj_path).await?; + tracing::info!( + "Monitoring started for {}", + rapl_file.energy_uj_path.display() + ); + loop { + let current_energy_uj = read_u64_async(&rapl_file.energy_uj_path).await?; + + if current_energy_uj < last_energy_uj { + let mut wraparound_guard = rapl_file + .num_wraparounds + .write() + .map_err(|_| ZeusdError::CpuManagementTaskTerminatedError(rapl_file.index))?; + *wraparound_guard += 1; + } + last_energy_uj = current_energy_uj; + let sleep_time = if rapl_file.max_energy_uj - current_energy_uj < RAPL_COUNTER_MAX_INCREASE + { + 100 + } else { + 1000 + }; + sleep(Duration::from_millis(sleep_time)).await; + } +} diff --git a/zeusd/src/devices/cpu/macos.rs b/zeusd/src/devices/cpu/macos.rs new file mode 100644 index 00000000..66edce5c --- /dev/null +++ b/zeusd/src/devices/cpu/macos.rs @@ -0,0 +1,59 @@ +//! 
Fake `RaplCpu` implementation to allow development and testing on MacOS. +use std::path::PathBuf; +use std::sync::{Arc, RwLock}; + +use crate::devices::cpu::{CpuManager, PackageInfo}; +use crate::error::ZeusdError; + +pub struct RaplCpu {} + +impl RaplCpu { + pub fn init(_index: usize) -> Result { + Ok(Self {}) + } +} + +impl CpuManager for RaplCpu { + fn device_count() -> Result { + Ok(1) + } + + fn get_available_fields( + _index: usize, + ) -> Result<(Arc, Option>), ZeusdError> { + Ok(( + Arc::new(PackageInfo { + index: _index, + name: "package-0".to_string(), + energy_uj_path: PathBuf::from( + "/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj", + ), + max_energy_uj: 1000000, + num_wraparounds: RwLock::new(0), + }), + Some(Arc::new(PackageInfo { + index: _index, + name: "dram".to_string(), + energy_uj_path: PathBuf::from( + "/sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:0/energy_uj", + ), + max_energy_uj: 1000000, + num_wraparounds: RwLock::new(0), + })), + )) + } + + fn get_cpu_energy(&mut self) -> Result { + Ok(10001) + } + + fn get_dram_energy(&mut self) -> Result { + Ok(1001) + } + + fn stop_monitoring(&mut self) {} + + fn is_dram_available(&self) -> bool { + true + } +} diff --git a/zeusd/src/devices/cpu/mod.rs b/zeusd/src/devices/cpu/mod.rs new file mode 100644 index 00000000..f79ee1ee --- /dev/null +++ b/zeusd/src/devices/cpu/mod.rs @@ -0,0 +1,189 @@ +// RAPL CPU +// Real RAPL interface. +#[cfg(target_os = "linux")] +mod linux; +#[cfg(target_os = "linux")] +pub use linux::RaplCpu; + +// Fake Rapl interface for dev and testing on macOS. +#[cfg(target_os = "macos")] +mod macos; +#[cfg(target_os = "macos")] +pub use macos::RaplCpu; + +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Instant; +use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender}; +use tracing::Span; + +use crate::error::ZeusdError; + +pub struct PackageInfo { + pub index: usize, + pub name: String, + pub energy_uj_path: PathBuf, + pub max_energy_uj: u64, + pub num_wraparounds: RwLock, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct RaplResponse { + pub cpu_energy_uj: Option, + pub dram_energy_uj: Option, +} + +pub trait CpuManager { + /// Get the number of CPUs available. + fn device_count() -> Result; + /// Get the CPU PackageInfo and the DRAM PackageInfo it is available. + fn get_available_fields( + index: usize, + ) -> Result<(Arc, Option>), ZeusdError>; + // Get the cumulative Rapl count value of the CPU after compensating for wraparounds. + fn get_cpu_energy(&mut self) -> Result; + // Get the cumulative Rapl count value of the DRAM after compensating for wraparounds if it is + // available. + fn get_dram_energy(&mut self) -> Result; + // Abort the monitoring tasks for CPU and DRAM if the tasks have been started. + fn stop_monitoring(&mut self); + // Check if DRAM is available. + fn is_dram_available(&self) -> bool; +} + +pub type CpuCommandRequest = ( + CpuCommand, + Option>>, + Instant, + Span, +); + +#[derive(Clone)] +pub struct CpuManagementTasks { + // Senders to the CPU management tasks. index is the CPU ID. + senders: Vec>, +} + +impl CpuManagementTasks { + pub fn start(cpus: Vec) -> Result + where + T: CpuManager + Send + 'static, + { + let mut senders = Vec::with_capacity(cpus.len()); + for (cpu_id, cpu) in cpus.into_iter().enumerate() { + // Channel to send commands to the CPU management task. 
+ let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + senders.push(tx); + // The CPU management task will automatically terminate + // when the server terminates and the last sender is dropped. + tokio::spawn(cpu_management_task(cpu, rx)); + tracing::info!("Background task for CPU {} successfully spawned", cpu_id); + } + Ok(Self { senders }) + } + + pub async fn send_command_blocking( + &self, + cpu_id: usize, + command: CpuCommand, + request_start_time: Instant, + ) -> Result { + if cpu_id >= self.senders.len() { + return Err(ZeusdError::CpuNotFoundError(cpu_id)); + } + let (tx, mut rx) = tokio::sync::mpsc::channel(1); + self.senders[cpu_id] + .send((command, Some(tx), request_start_time, Span::current())) + .unwrap(); + match rx.recv().await { + Some(result) => result, + None => Err(ZeusdError::CpuManagementTaskTerminatedError(cpu_id)), + } + } + + pub async fn stop_monitoring(&self) -> Result<(), ZeusdError> { + for (index, sender) in self.senders.iter().enumerate() { + let (tx, mut rx) = tokio::sync::mpsc::channel(1); + sender + .send(( + CpuCommand::StopMonitoring, + Some(tx), + Instant::now(), + Span::current(), + )) + .unwrap(); + match rx.recv().await { + Some(_) => {} + None => return Err(ZeusdError::CpuManagementTaskTerminatedError(index)), + } + } + Ok(()) + } +} + +/// A CPU command that can be executed on a CPU. +#[derive(Debug)] +pub enum CpuCommand { + /// Get the CPU and DRAM energy measurement for the CPU index + GetIndexEnergy { cpu: bool, dram: bool }, + /// Stop the monitoring task for CPU and DRAM if they have been started. + StopMonitoring, +} + +/// Tokio background task that handles requests to each CPU. +/// NOTE: Currently, this serializes the handling of request to a single CPU, which is +/// largely unnecessary as the requests are simply reading energy counters. +/// This is subject to refactoring if it is to become a bottleneck. +async fn cpu_management_task( + mut cpu: T, + mut rx: UnboundedReceiver, +) { + while let Some((command, response, start_time, span)) = rx.recv().await { + let _span_guard = span.enter(); + let result = command.execute(&mut cpu, start_time); + if let Some(response) = response { + if response.send(result).await.is_err() { + tracing::error!("Failed to send response to caller"); + } + } + } +} + +impl CpuCommand { + fn execute( + &self, + device: &mut T, + _request_arrival_time: Instant, + ) -> Result + where + T: CpuManager, + { + match *self { + Self::GetIndexEnergy { cpu, dram } => { + let cpu_energy_uj = if cpu { + Some(device.get_cpu_energy()?) + } else { + None + }; + let dram_energy_uj = if dram && device.is_dram_available() { + Some(device.get_dram_energy()?) + } else { + None + }; + Ok(RaplResponse { + cpu_energy_uj, + dram_energy_uj, + }) + } + Self::StopMonitoring {} => { + device.stop_monitoring(); + Ok(RaplResponse { + cpu_energy_uj: Some(0), + dram_energy_uj: Some(0), + }) + } + } + } +} diff --git a/zeusd/src/devices/gpu/linux.rs b/zeusd/src/devices/gpu/linux.rs index 2b372464..2f01c6c0 100644 --- a/zeusd/src/devices/gpu/linux.rs +++ b/zeusd/src/devices/gpu/linux.rs @@ -3,7 +3,7 @@ //! Note that NVML is only available on Linux. use nvml_wrapper::enums::device::GpuLockedClocksSetting; -use nvml_wrapper::{Device, Nvml}; +use nvml_wrapper::{error::NvmlError, Device, Nvml}; use crate::devices::gpu::GpuManager; use crate::error::ZeusdError; @@ -26,8 +26,18 @@ impl NvmlGpu<'static> { impl GpuManager for NvmlGpu<'static> { fn device_count() -> Result { - let nvml = Nvml::init()?; - Ok(nvml.device_count()?) 
+ match Nvml::init() { + Ok(nvml) => match nvml.device_count() { + Ok(count) => Ok(count), + Err(e) => Err(ZeusdError::NvmlError(e)), + }, + // Specifically catch this error that is thrown when GPU is not available + Err(NvmlError::LibloadingError(e)) => { + tracing::error!("Error initializing NVML, {}", e); + Ok(0) + } + Err(e) => Err(ZeusdError::NvmlError(e)), + } } #[inline] diff --git a/zeusd/src/devices/gpu/mod.rs b/zeusd/src/devices/gpu/mod.rs index 281f54ba..58599736 100644 --- a/zeusd/src/devices/gpu/mod.rs +++ b/zeusd/src/devices/gpu/mod.rs @@ -81,7 +81,7 @@ pub struct GpuManagementTasks { impl GpuManagementTasks { /// Start GPU management tasks for the given GPUs. /// It's generic over the type of GPU manager to allow for testing. - pub fn start(gpus: Vec) -> anyhow::Result + pub fn start(gpus: Vec) -> Result where T: GpuManager + Send + 'static, { diff --git a/zeusd/src/devices/mod.rs b/zeusd/src/devices/mod.rs index eaeb673a..184db2b7 100644 --- a/zeusd/src/devices/mod.rs +++ b/zeusd/src/devices/mod.rs @@ -1,3 +1,4 @@ //! Interfaces for interacting with devices +pub mod cpu; pub mod gpu; diff --git a/zeusd/src/error.rs b/zeusd/src/error.rs index 0733dacd..c1f24af6 100644 --- a/zeusd/src/error.rs +++ b/zeusd/src/error.rs @@ -17,12 +17,20 @@ use crate::devices::gpu::GpuCommandRequest; pub enum ZeusdError { #[error("GPU index {0} does not exist.")] GpuNotFoundError(usize), + #[error("CPU index {0} does not exist.")] + CpuNotFoundError(usize), #[error("NVML error: {0}")] NvmlError(#[from] NvmlError), #[error("GPU command send error: {0}")] GpuCommandSendError(#[from] SendError), #[error("Management task for GPU {0} unexpectedly terminated while handling the request.")] GpuManagementTaskTerminatedError(usize), + #[error("Management task for CPU {0} unexpectedly terminated while handling the request.")] + CpuManagementTaskTerminatedError(usize), + #[error("Initialization for CPU {0} unexpectedly errored.")] + CpuInitializationError(usize), + #[error("IOError: {0}")] + IOError(#[from] std::io::Error), } /// This allows us to return a custom HTTP status code for each error variant. 
@@ -30,6 +38,7 @@ impl ResponseError for ZeusdError { fn status_code(&self) -> StatusCode { match self { ZeusdError::GpuNotFoundError(_) => StatusCode::BAD_REQUEST, + ZeusdError::CpuNotFoundError(_) => StatusCode::BAD_REQUEST, ZeusdError::NvmlError(e) => match e { NvmlError::NoPermission => StatusCode::FORBIDDEN, NvmlError::InvalidArg => StatusCode::BAD_REQUEST, @@ -37,6 +46,9 @@ impl ResponseError for ZeusdError { }, ZeusdError::GpuCommandSendError(_) => StatusCode::INTERNAL_SERVER_ERROR, ZeusdError::GpuManagementTaskTerminatedError(_) => StatusCode::INTERNAL_SERVER_ERROR, + ZeusdError::CpuManagementTaskTerminatedError(_) => StatusCode::INTERNAL_SERVER_ERROR, + ZeusdError::CpuInitializationError(_) => StatusCode::INTERNAL_SERVER_ERROR, + ZeusdError::IOError(_) => StatusCode::INTERNAL_SERVER_ERROR, } } } diff --git a/zeusd/src/main.rs b/zeusd/src/main.rs index 9030173a..47cd6bc6 100644 --- a/zeusd/src/main.rs +++ b/zeusd/src/main.rs @@ -4,8 +4,8 @@ use std::net::TcpListener; use zeusd::config::{get_config, ConnectionMode}; use zeusd::startup::{ - ensure_root, get_unix_listener, init_tracing, start_device_tasks, start_server_tcp, - start_server_uds, + ensure_root, get_unix_listener, init_tracing, start_cpu_device_tasks, start_gpu_device_tasks, + start_server_tcp, start_server_uds, }; #[tokio::main] @@ -19,7 +19,8 @@ async fn main() -> anyhow::Result<()> { ensure_root()?; } - let device_tasks = start_device_tasks()?; + let gpu_device_tasks = start_gpu_device_tasks()?; + let cpu_device_tasks = start_cpu_device_tasks()?; tracing::info!("Started all device tasks"); let num_workers = config.num_workers.unwrap_or_else(|| { @@ -37,16 +38,29 @@ async fn main() -> anyhow::Result<()> { )?; tracing::info!("Listening on {}", &config.socket_path); - start_server_uds(listener, device_tasks, num_workers)?.await?; + start_server_uds( + listener, + gpu_device_tasks, + cpu_device_tasks.clone(), + num_workers, + )? + .await?; } ConnectionMode::TCP => { let listener = TcpListener::bind(&config.tcp_bind_address)?; tracing::info!("Listening on {}", &listener.local_addr()?); - start_server_tcp(listener, device_tasks, num_workers)?.await?; + start_server_tcp( + listener, + gpu_device_tasks, + cpu_device_tasks.clone(), + num_workers, + )? + .await?; } } + let _ = cpu_device_tasks.stop_monitoring().await; Ok(()) } diff --git a/zeusd/src/routes/cpu.rs b/zeusd/src/routes/cpu.rs new file mode 100644 index 00000000..092c4c74 --- /dev/null +++ b/zeusd/src/routes/cpu.rs @@ -0,0 +1,53 @@ +//! 
Routes for interacting with CPUs + +use actix_web::{web, HttpResponse}; +use serde::{Deserialize, Serialize}; +use std::time::Instant; + +use crate::devices::cpu::{CpuCommand, CpuManagementTasks}; +use crate::error::ZeusdError; + +#[derive(Serialize, Deserialize, Debug)] +pub struct GetIndexEnergy { + pub cpu: bool, + pub dram: bool, +} + +impl From for CpuCommand { + fn from(_request: GetIndexEnergy) -> Self { + CpuCommand::GetIndexEnergy { + cpu: _request.cpu, + dram: _request.dram, + } + } +} + +#[actix_web::post("/{cpu_id}/get_index_energy")] +#[tracing::instrument( + skip(cpu_id, request, _device_tasks), + fields( + cpu_id = %cpu_id, + cpu = %request.cpu, + dram = %request.dram, + ) +)] +async fn get_index_energy_handler( + cpu_id: web::Path, + request: web::Json, + _device_tasks: web::Data, +) -> Result { + let now = Instant::now(); + tracing::info!("Received request"); + let cpu_id = cpu_id.into_inner(); + let request = request.into_inner(); + + let measurement = _device_tasks + .send_command_blocking(cpu_id, request.into(), now) + .await?; + + Ok(HttpResponse::Ok().json(measurement)) +} + +pub fn cpu_routes(cfg: &mut web::ServiceConfig) { + cfg.service(get_index_energy_handler); +} diff --git a/zeusd/src/routes/gpu.rs b/zeusd/src/routes/gpu.rs index 46f002a3..73b696d4 100644 --- a/zeusd/src/routes/gpu.rs +++ b/zeusd/src/routes/gpu.rs @@ -13,18 +13,18 @@ use crate::error::ZeusdError; /// This macro takes /// - the API name (set_power_limit, set_persistence_mode, etc.), /// - the method and path for the request handler, -/// - and a list of `field name ` pairs of the corresponding `GpuCommand` variant. +/// - and a list of `field name: type` pairs of the corresponding `GpuCommand` variant. /// /// Gien this, the macro generates /// - a request payload struct named API name (e.g., SetPowerLimit) and all the -/// fields specified plus `block: bool` to indicate whether the request should block, +/// fields specified plus `block: bool` to indicate whether the request should block, /// - an implementation of `From` for the payload struct to convert it to the /// - a handler function that takes the request payload, converts it to a `GpuCommand` variant, -/// and sends it to the `GpuManagementTasks` actor. +/// and sends it to the `GpuManagementTasks` actor. /// /// Assumptions: /// - The `GpuCommand` variant name is the same as the API name, but the former is camel case -/// and the latter is snake case (e.g., SetPowerLimit vs. set_power_limit). +/// and the latter is snake case (e.g., SetPowerLimit vs. set_power_limit). macro_rules! impl_handler_for_gpu_command { ($api:ident, $path:expr, $($field:ident: $ftype:ty,)*) => { paste! { diff --git a/zeusd/src/routes/mod.rs b/zeusd/src/routes/mod.rs index fae1350b..9a781a03 100644 --- a/zeusd/src/routes/mod.rs +++ b/zeusd/src/routes/mod.rs @@ -1,5 +1,7 @@ //! Routes and handlers for interacting with devices +pub mod cpu; pub mod gpu; +pub use cpu::cpu_routes; pub use gpu::gpu_routes; diff --git a/zeusd/src/startup.rs b/zeusd/src/startup.rs index 878b2569..04c14d0a 100644 --- a/zeusd/src/startup.rs +++ b/zeusd/src/startup.rs @@ -12,7 +12,9 @@ use tracing_subscriber::fmt::MakeWriter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::{EnvFilter, Registry}; +use crate::devices::cpu::{CpuManagementTasks, CpuManager, RaplCpu}; use crate::devices::gpu::{GpuManagementTasks, GpuManager, NvmlGpu}; +use crate::routes::cpu_routes; use crate::routes::gpu_routes; /// Initialize tracing with the given where to write logs to. 
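For reference, the `get_index_energy` handler added above in `zeusd/src/routes/cpu.rs` is mounted under the `/cpu` scope, so a client could query it roughly as follows. The TCP address is hypothetical; zeusd may instead be listening on a Unix domain socket.

```python
import httpx

# Hypothetical TCP address for a zeusd instance started in TCP mode.
resp = httpx.post(
    "http://127.0.0.1:8000/cpu/0/get_index_energy",
    json={"cpu": True, "dram": True},
)
resp.raise_for_status()
print(resp.json())  # e.g. {"cpu_energy_uj": ..., "dram_energy_uj": ...}
```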
@@ -51,7 +53,7 @@ pub fn get_unix_listener( } /// Initialize NVML and start GPU management tasks. -pub fn start_device_tasks() -> anyhow::Result { +pub fn start_gpu_device_tasks() -> anyhow::Result { tracing::info!("Starting NVML and GPU management tasks."); let num_gpus = NvmlGpu::device_count()?; let mut gpus = Vec::with_capacity(num_gpus as usize); @@ -60,7 +62,19 @@ pub fn start_device_tasks() -> anyhow::Result { tracing::info!("Initialized NVML for GPU {}", gpu_id); gpus.push(gpu); } - GpuManagementTasks::start(gpus) + Ok(GpuManagementTasks::start(gpus)?) +} + +pub fn start_cpu_device_tasks() -> anyhow::Result { + tracing::info!("Starting Rapl and CPU management tasks."); + let num_cpus = RaplCpu::device_count()?; + let mut cpus = Vec::with_capacity(num_cpus); + for cpu_id in 0..num_cpus { + let cpu = RaplCpu::init(cpu_id)?; + tracing::info!("Initialized RAPL for CPU {}", cpu_id); + cpus.push(cpu); + } + Ok(CpuManagementTasks::start(cpus)?) } /// Ensure the daemon is running as root. @@ -78,14 +92,17 @@ pub fn ensure_root() -> anyhow::Result<()> { /// Set up routing and start the server on a unix domain socket. pub fn start_server_uds( listener: UnixListener, - device_tasks: GpuManagementTasks, + gpu_device_tasks: GpuManagementTasks, + cpu_device_tasks: CpuManagementTasks, num_workers: usize, ) -> std::io::Result { let server = HttpServer::new(move || { App::new() .wrap(tracing_actix_web::TracingLogger::default()) .service(web::scope("/gpu").configure(gpu_routes)) - .app_data(web::Data::new(device_tasks.clone())) + .service(web::scope("/cpu").configure(cpu_routes)) + .app_data(web::Data::new(gpu_device_tasks.clone())) + .app_data(web::Data::new(cpu_device_tasks.clone())) }) .workers(num_workers) .listen_uds(listener)? @@ -97,14 +114,17 @@ pub fn start_server_uds( /// Set up routing and start the server over TCP. pub fn start_server_tcp( listener: TcpListener, - device_tasks: GpuManagementTasks, + gpu_device_tasks: GpuManagementTasks, + cpu_device_tasks: CpuManagementTasks, num_workers: usize, ) -> std::io::Result { let server = HttpServer::new(move || { App::new() .wrap(tracing_actix_web::TracingLogger::default()) .service(web::scope("/gpu").configure(gpu_routes)) - .app_data(web::Data::new(device_tasks.clone())) + .service(web::scope("/cpu").configure(cpu_routes)) + .app_data(web::Data::new(gpu_device_tasks.clone())) + .app_data(web::Data::new(cpu_device_tasks.clone())) }) .workers(num_workers) .listen(listener)? 
diff --git a/zeusd/tests/cpu.rs b/zeusd/tests/cpu.rs new file mode 100644 index 00000000..35c5a92b --- /dev/null +++ b/zeusd/tests/cpu.rs @@ -0,0 +1,156 @@ +mod helpers; + +use zeusd::devices::cpu::RaplResponse; +use zeusd::routes::cpu::GetIndexEnergy; + +use crate::helpers::{TestApp, ZeusdRequest}; + +#[tokio::test] +async fn test_only_cpu_measuremnt() { + let mut app = TestApp::start().await; + let measurements: Vec = vec![10000, 10001, 12313, 8213, 0]; + app.set_cpu_energy_measurements(0, &measurements); + + for expected in measurements { + let resp = app + .send( + 0, + GetIndexEnergy { + cpu: true, + dram: false, + }, + ) + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 200); + let rapl_response: RaplResponse = serde_json::from_str(&resp.text().await.unwrap()) + .expect("Failed to deserialize response body"); + assert_eq!(rapl_response.cpu_energy_uj.unwrap(), expected); + assert_eq!(rapl_response.dram_energy_uj, None); + } +} + +#[tokio::test] +async fn test_only_dram_measuremnt() { + let mut app = TestApp::start().await; + let measurements: Vec = vec![10000, 10001, 12313, 8213, 0]; + app.set_dram_energy_measurements(0, &measurements); + + for expected in measurements { + let resp = app + .send( + 0, + GetIndexEnergy { + cpu: false, + dram: true, + }, + ) + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 200); + let rapl_response: RaplResponse = serde_json::from_str(&resp.text().await.unwrap()) + .expect("Failed to deserialiez response body"); + assert_eq!(rapl_response.cpu_energy_uj, None); + assert_eq!(rapl_response.dram_energy_uj.unwrap(), expected); + } +} + +#[tokio::test] +async fn test_both_measuremnt() { + let mut app = TestApp::start().await; + let measurements: Vec = vec![10000, 10001, 12313, 8213, 0]; + app.set_cpu_energy_measurements(0, &measurements); + app.set_dram_energy_measurements(0, &measurements); + + for expected in measurements { + let resp = app + .send( + 0, + GetIndexEnergy { + cpu: true, + dram: true, + }, + ) + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 200); + let rapl_response: RaplResponse = serde_json::from_str(&resp.text().await.unwrap()) + .expect("Failed to deserialiez response body"); + assert_eq!(rapl_response.cpu_energy_uj.unwrap(), expected); + assert_eq!(rapl_response.dram_energy_uj.unwrap(), expected); + } +} + +#[tokio::test] +async fn test_invalid_requests() { + let app = TestApp::start().await; + + let client = reqwest::Client::new(); + let url = GetIndexEnergy::build_url(&app, 0); + let resp = client + .post(url) + .json(&serde_json::json!( + { + "cpu": true, // Missing dram field + } + )) + .send() + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 400); + + let url = GetIndexEnergy::build_url(&app, 0); + let resp = client + .post(url) + .json(&serde_json::json!( + { + "dram": true, // Missing cpu field + } + )) + .send() + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 400); + + let url = GetIndexEnergy::build_url(&app, 0); + let resp = client + .post(url) + .json(&serde_json::json!( + { + "cpu": "true", //Invalid type + "dram": true, + } + )) + .send() + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 400); + + let url = GetIndexEnergy::build_url(&app, 2); // Out of index CPU + let resp = client + .post(url) + .json(&serde_json::json!( + { + "cp": true, // Invalid field name + "dram": true, + } + )) + .send() + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 400); + + 
let url = GetIndexEnergy::build_url(&app, 2); // Out of index CPU + let resp = client + .post(url) + .json(&serde_json::json!( + { + "cpu": true, + "dram": true, + } + )) + .send() + .await + .expect("Failed to send request"); + assert_eq!(resp.status(), 400); +} diff --git a/zeusd/tests/helpers/mod.rs b/zeusd/tests/helpers/mod.rs index 8e6e9860..610f1ca3 100644 --- a/zeusd/tests/helpers/mod.rs +++ b/zeusd/tests/helpers/mod.rs @@ -8,13 +8,18 @@ use once_cell::sync::Lazy; use paste::paste; use std::future::Future; use std::net::TcpListener; +use std::path::PathBuf; +use std::sync::{Arc, RwLock}; use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; +use zeusd::devices::cpu::{CpuManagementTasks, CpuManager, PackageInfo}; use zeusd::devices::gpu::{GpuManagementTasks, GpuManager}; use zeusd::error::ZeusdError; use zeusd::startup::{init_tracing, start_server_tcp}; static NUM_GPUS: u32 = 4; +static NUM_CPUS: usize = 1; + static TRACING: Lazy<()> = Lazy::new(|| { if std::env::var("TEST_LOG").is_ok() { init_tracing(std::io::stdout).expect("Failed to initialize tracing"); @@ -117,7 +122,79 @@ impl GpuManager for TestGpu { } } -pub fn start_test_tasks() -> anyhow::Result<(GpuManagementTasks, Vec)> { +pub struct TestCpu { + pub cpu: UnboundedReceiver, + pub dram: UnboundedReceiver, +} + +pub struct TestCpuInjector { + pub cpu: UnboundedSender, + pub dram: UnboundedSender, +} + +impl TestCpu { + fn init(_index: usize) -> Result<(Self, TestCpuInjector), ZeusdError> { + let (cpu_sender, cpu_receiver) = tokio::sync::mpsc::unbounded_channel(); + let (dram_sender, dram_receiver) = tokio::sync::mpsc::unbounded_channel(); + Ok(( + TestCpu { + cpu: cpu_receiver, + dram: dram_receiver, + }, + TestCpuInjector { + cpu: cpu_sender, + dram: dram_sender, + }, + )) + } +} + +impl CpuManager for TestCpu { + fn device_count() -> Result { + Ok(1) + } + + fn get_available_fields( + _index: usize, + ) -> Result<(Arc, Option>), ZeusdError> { + Ok(( + Arc::new(PackageInfo { + index: _index, + name: "package-0".to_string(), + energy_uj_path: PathBuf::from( + "/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj", + ), + max_energy_uj: 1000000, + num_wraparounds: RwLock::new(0), + }), + Some(Arc::new(PackageInfo { + index: _index, + name: "dram".to_string(), + energy_uj_path: PathBuf::from( + "/sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:0/energy_uj", + ), + max_energy_uj: 1000000, + num_wraparounds: RwLock::new(0), + })), + )) + } + + fn get_cpu_energy(&mut self) -> Result { + Ok(self.cpu.try_recv().ok().unwrap()) + } + + fn get_dram_energy(&mut self) -> Result { + Ok(self.dram.try_recv().ok().unwrap()) + } + + fn stop_monitoring(&mut self) {} + + fn is_dram_available(&self) -> bool { + true + } +} + +pub fn start_gpu_test_tasks() -> anyhow::Result<(GpuManagementTasks, Vec)> { let mut gpus = Vec::with_capacity(4); let mut observers = Vec::with_capacity(4); for _ in 0..4 { @@ -131,12 +208,24 @@ pub fn start_test_tasks() -> anyhow::Result<(GpuManagementTasks, Vec anyhow::Result<(CpuManagementTasks, Vec)> { + let mut cpus = Vec::with_capacity(NUM_CPUS); + let mut injectors = Vec::with_capacity(NUM_CPUS); + for i in 0..NUM_CPUS { + let (cpu, cpu_injector) = TestCpu::init(i)?; + cpus.push(cpu); + injectors.push(cpu_injector) + } + let tasks = CpuManagementTasks::start(cpus)?; + Ok((tasks, injectors)) +} + /// A helper trait for building URLs to send requests to. pub trait ZeusdRequest: serde::Serialize { fn build_url(app: &TestApp, gpu_id: u32) -> String; } -macro_rules! impl_zeusd_request { +macro_rules! 
impl_zeusd_request_gpu { ($api:ident) => { paste! { impl ZeusdRequest for zeusd::routes::gpu::[<$api:camel>] { @@ -151,35 +240,57 @@ macro_rules! impl_zeusd_request { }; } -impl_zeusd_request!(SetPersistenceMode); -impl_zeusd_request!(SetPowerLimit); -impl_zeusd_request!(SetGpuLockedClocks); -impl_zeusd_request!(ResetGpuLockedClocks); -impl_zeusd_request!(SetMemLockedClocks); -impl_zeusd_request!(ResetMemLockedClocks); +macro_rules! impl_zeusd_request_cpu { + ($api:ident) => { + paste! { + impl ZeusdRequest for zeusd::routes::cpu::[<$api:camel>] { + fn build_url(app: &TestApp, cpu_id: u32) -> String { + format!( + "http://127.0.0.1:{}/cpu/{}/{}", + app.port, cpu_id, stringify!([<$api:snake>]), + ) + } + } + } + }; +} +impl_zeusd_request_gpu!(SetPersistenceMode); +impl_zeusd_request_gpu!(SetPowerLimit); +impl_zeusd_request_gpu!(SetGpuLockedClocks); +impl_zeusd_request_gpu!(ResetGpuLockedClocks); +impl_zeusd_request_gpu!(SetMemLockedClocks); +impl_zeusd_request_gpu!(ResetMemLockedClocks); + +impl_zeusd_request_cpu!(GetIndexEnergy); /// A test application that starts a server over TCP and provides helper methods /// for sending requests and fetching what happened to the fake GPUs. pub struct TestApp { port: u16, observers: Vec, + cpu_injectors: Vec, } impl TestApp { pub async fn start() -> Self { Lazy::force(&TRACING); - let (test_tasks, test_gpu_observers) = - start_test_tasks().expect("Failed to start test tasks"); + let (gpu_test_tasks, test_gpu_observers) = + start_gpu_test_tasks().expect("Failed to start gpu test tasks"); + + let (cpu_test_tasks, cpu_test_injectors) = + start_cpu_test_tasks().expect("Failed to start cpu test tasks"); let listener = TcpListener::bind("127.0.0.1:0").expect("Failed to bind TCP listener"); let port = listener.local_addr().unwrap().port(); - let server = start_server_tcp(listener, test_tasks, 8).expect("Failed to start server"); + let server = start_server_tcp(listener, gpu_test_tasks, cpu_test_tasks, 2) + .expect("Failed to start server"); let _ = tokio::spawn(async move { server.await }); TestApp { port, observers: test_gpu_observers, + cpu_injectors: cpu_test_injectors, } } @@ -213,4 +324,16 @@ impl TestApp { let rx = &mut self.observers[gpu_id].mem_locked_clocks_rx; std::iter::from_fn(|| rx.try_recv().ok()).collect() } + + pub fn set_cpu_energy_measurements(&mut self, cpu_id: usize, measurements: &Vec) { + for measurement in measurements { + self.cpu_injectors[cpu_id].cpu.send(*measurement).unwrap(); + } + } + + pub fn set_dram_energy_measurements(&mut self, cpu_id: usize, measurements: &Vec) { + for measurement in measurements { + self.cpu_injectors[cpu_id].dram.send(*measurement).unwrap(); + } + } } From b62ed5d175600692882e0569de672009e6759695 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 17:38:15 -0500 Subject: [PATCH 28/57] Fix energy histogram to properly handle default bucket ranges --- tests/test_metric.py | 2 +- zeus/metric.py | 44 ++++++++++++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index afca7469..00484486 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -99,7 +99,7 @@ def test_energy_histogram( cpu_histogram.labels = MagicMock(return_value=cpu_histogram) cpu_histogram.observe = MagicMock() - if histogram_metric.dram_histogram: + if histogram_metric.dram_histograms: for _dram_index, dram_histogram in histogram_metric.dram_histograms.items(): dram_histogram.labels = MagicMock(return_value=dram_histogram) 
dram_histogram.observe = MagicMock() diff --git a/zeus/metric.py b/zeus/metric.py index 91edfc90..47e5b446 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -60,9 +60,9 @@ def __init__( gpu_indices: list, prometheus_url: str, job: str, - gpu_bucket_range: list[float] | None, - cpu_bucket_range: list[float] | None, - dram_bucket_range: list[float] | None, + gpu_bucket_range: list[float] | None = None, + cpu_bucket_range: list[float] | None = None, + dram_bucket_range: list[float] | None = None, ) -> None: """Initialize the EnergyHistogram class. @@ -84,29 +84,41 @@ def __init__( Raises: ValueError: If any of the bucket ranges (GPU, CPU, DRAM) is an empty list. """ - if not gpu_bucket_range: + self.gpu_bucket_range = ( + [50.0, 100.0, 200.0, 500.0, 1000.0] + if gpu_bucket_range is None + else gpu_bucket_range + ) + self.cpu_bucket_range = ( + [10.0, 20.0, 50.0, 100.0, 200.0] + if cpu_bucket_range is None + else cpu_bucket_range + ) + self.dram_bucket_range = ( + [5.0, 10.0, 20.0, 50.0, 150.0] + if dram_bucket_range is None + else dram_bucket_range + ) + self.cpu_indices = cpu_indices + self.gpu_indices = gpu_indices + self.prometheus_url = prometheus_url + self.job = job + + self.registry = CollectorRegistry() + + if gpu_bucket_range == []: raise ValueError( "GPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) - if not cpu_bucket_range: + if cpu_bucket_range == []: raise ValueError( "CPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) - if not dram_bucket_range: + if dram_bucket_range == []: raise ValueError( "DRAM bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) - self.gpu_bucket_range = gpu_bucket_range or [50.0, 100.0, 200.0, 500.0, 1000.0] - self.cpu_bucket_range = cpu_bucket_range or [10.0, 20.0, 50.0, 100.0, 200.0] - self.dram_bucket_range = dram_bucket_range or [5.0, 10.0, 20.0, 50.0, 150.0] - self.cpu_indices = cpu_indices - self.gpu_indices = gpu_indices - self.prometheus_url = prometheus_url - self.job = job - - self.registry = CollectorRegistry() - # Initialize GPU histograms self.gpu_histograms = {} if self.gpu_indices: From 0dbb2362d240c1258723c49ad218fe7940eed59c Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 18:29:36 -0500 Subject: [PATCH 29/57] Add the mock_push_to_gateway Patch to each test --- tests/test_metric.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index 00484486..6a7d738e 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -66,8 +66,12 @@ def mock_gauge(): yield gauge +@patch("prometheus_client.exposition.push_to_gateway", autospec=True) def test_energy_histogram( - mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock, mock_histogram: MagicMock + mock_push_to_gateway: MagicMock, + mock_get_cpus: MagicMock, + mock_zeus_monitor: MagicMock, + mock_histogram: MagicMock, ) -> None: """Test EnergyHistogram class. @@ -75,10 +79,12 @@ def test_energy_histogram( and that the correct energy values are recorded. Args: + mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. 
""" + mock_push_to_gateway.return_value = None cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -150,12 +156,16 @@ def test_energy_histogram( assert energy in calls, f"Expected DRAM energy {energy} in {calls}" +@patch("prometheus_client.exposition.push_to_gateway", autospec=True) def test_energy_cumulative_counter( - mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock + mock_push_to_gateway: MagicMock, + mock_get_cpus: MagicMock, + mock_zeus_monitor: MagicMock, ) -> None: """Test EnergyCumulativeCounter with mocked ZeusMonitor. Args: + mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. """ @@ -205,8 +215,10 @@ def test_energy_cumulative_counter( cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) +@patch("prometheus_client.exposition.push_to_gateway", autospec=True) @patch("zeus.device.gpu.get_gpus") def test_power_gauge( + mock_push_to_gateway: MagicMock, mock_get_gpus: MagicMock, mock_power_monitor: MagicMock, mock_gauge: MagicMock, @@ -214,6 +226,7 @@ def test_power_gauge( """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. Args: + mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. mock_get_gpus (MagicMock): Mocked `get_gpus` function to simulate available GPUs. mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. mock_gauge (MagicMock): Mocked Prometheus Gauge creation. From 753f1de82da6b59877ecb1aed5dc0c88a7f0135f Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 18:32:39 -0500 Subject: [PATCH 30/57] Update gpu_bucket_range, cpu_bucket_range, and dram_bucket_range in the EnergyHistogram constructor to use Sequence[float] for type safety, maintaining default values for bucket ranges --- zeus/metric.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 47e5b446..64a3e5bf 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -7,6 +7,8 @@ import warnings import multiprocessing as mp +from typing import Sequence + from prometheus_client import ( CollectorRegistry, Histogram, @@ -60,9 +62,9 @@ def __init__( gpu_indices: list, prometheus_url: str, job: str, - gpu_bucket_range: list[float] | None = None, - cpu_bucket_range: list[float] | None = None, - dram_bucket_range: list[float] | None = None, + gpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], + cpu_bucket_range: Sequence[float] = [10.0, 20.0, 50.0, 100.0, 200.0], + dram_bucket_range: Sequence[float] = [5.0, 10.0, 20.0, 50.0, 150.0], ) -> None: """Initialize the EnergyHistogram class. @@ -84,37 +86,24 @@ def __init__( Raises: ValueError: If any of the bucket ranges (GPU, CPU, DRAM) is an empty list. 
""" - self.gpu_bucket_range = ( - [50.0, 100.0, 200.0, 500.0, 1000.0] - if gpu_bucket_range is None - else gpu_bucket_range - ) - self.cpu_bucket_range = ( - [10.0, 20.0, 50.0, 100.0, 200.0] - if cpu_bucket_range is None - else cpu_bucket_range - ) - self.dram_bucket_range = ( - [5.0, 10.0, 20.0, 50.0, 150.0] - if dram_bucket_range is None - else dram_bucket_range - ) + self.gpu_bucket_range = gpu_bucket_range + self.cpu_bucket_range = cpu_bucket_range + self.dram_bucket_range = dram_bucket_range self.cpu_indices = cpu_indices self.gpu_indices = gpu_indices self.prometheus_url = prometheus_url self.job = job - self.registry = CollectorRegistry() - if gpu_bucket_range == []: + if not gpu_bucket_range: raise ValueError( "GPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) - if cpu_bucket_range == []: + if not cpu_bucket_range: raise ValueError( "CPU bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) - if dram_bucket_range == []: + if not dram_bucket_range: raise ValueError( "DRAM bucket range cannot be empty. Please provide a valid range or omit the argument to use defaults." ) From 77ff0754171c2890841c25e31a7a3ac4ffe8bc83 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 19:00:39 -0500 Subject: [PATCH 31/57] Patch to mock urllib.request.urlopen preventing attempts to an actual HTTP request --- tests/test_metric.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index 6a7d738e..46a50b1d 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -111,7 +111,10 @@ def test_energy_histogram( dram_histogram.observe = MagicMock() histogram_metric.begin_window("test_window") - histogram_metric.end_window("test_window") + + with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + mock_urlopen.side_effect = RuntimeError("No external calls allowed") + histogram_metric.end_window("test_window") # Assert GPU histograms were observed if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: @@ -169,6 +172,7 @@ def test_energy_cumulative_counter( mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. """ + mock_push_to_gateway.return_value = None cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -189,8 +193,10 @@ def test_energy_cumulative_counter( counter.labels = MagicMock(return_value=counter) counter.inc = MagicMock() - cumulative_counter.begin_window("test_counter") - cumulative_counter.end_window("test_counter") + with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + mock_urlopen.side_effect = RuntimeError("No external calls allowed") + cumulative_counter.begin_window("test_counter") + cumulative_counter.end_window("test_counter") # Assert GPU counters if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: @@ -231,6 +237,7 @@ def test_power_gauge( mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. mock_gauge (MagicMock): Mocked Prometheus Gauge creation. 
""" + mock_push_to_gateway.return_value = None gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -251,8 +258,10 @@ def test_power_gauge( gauge.labels = MagicMock(return_value=gauge) gauge.set = MagicMock() - power_gauge.begin_window("test_power_window") - power_gauge.end_window("test_power_window") + with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + mock_urlopen.side_effect = RuntimeError("No external calls allowed") + power_gauge.begin_window("test_power_window") + power_gauge.end_window("test_power_window") # Assert that the gauges were set with the correct power values if mock_power_monitor.return_value.get_power.return_value: From 793a186b93c4bc86d772508f2459df69210c3e01 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 19:12:24 -0500 Subject: [PATCH 32/57] Patch to mock urllib.request.urlopen preventing attempts to an actual HTTP request --- tests/test_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index 46a50b1d..f3994873 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -112,7 +112,7 @@ def test_energy_histogram( histogram_metric.begin_window("test_window") - with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: mock_urlopen.side_effect = RuntimeError("No external calls allowed") histogram_metric.end_window("test_window") @@ -193,7 +193,7 @@ def test_energy_cumulative_counter( counter.labels = MagicMock(return_value=counter) counter.inc = MagicMock() - with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: mock_urlopen.side_effect = RuntimeError("No external calls allowed") cumulative_counter.begin_window("test_counter") cumulative_counter.end_window("test_counter") @@ -258,7 +258,7 @@ def test_power_gauge( gauge.labels = MagicMock(return_value=gauge) gauge.set = MagicMock() - with patch("urllib.request.urlopen", autospec=True) as mock_urlopen: + with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: mock_urlopen.side_effect = RuntimeError("No external calls allowed") power_gauge.begin_window("test_power_window") power_gauge.end_window("test_power_window") From 30b7e1cd621578db1a75c4568ece28f912946615 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 19:21:19 -0500 Subject: [PATCH 33/57] Patch to mock prometheus_client.exposition.push_to_gateway external calls --- tests/test_metric.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index f3994873..dc7e8683 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -66,7 +66,7 @@ def mock_gauge(): yield gauge -@patch("prometheus_client.exposition.push_to_gateway", autospec=True) +@patch("prometheus_client.exposition.push_to_gateway") def test_energy_histogram( mock_push_to_gateway: MagicMock, mock_get_cpus: MagicMock, @@ -112,8 +112,8 @@ def test_energy_histogram( histogram_metric.begin_window("test_window") - with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: - mock_urlopen.side_effect = RuntimeError("No external calls allowed") + with patch("prometheus_client.exposition.push_to_gateway") as mock_push: + mock_push.return_value = None histogram_metric.end_window("test_window") # Assert GPU histograms were observed @@ -159,7 +159,7 @@ def 
test_energy_histogram( assert energy in calls, f"Expected DRAM energy {energy} in {calls}" -@patch("prometheus_client.exposition.push_to_gateway", autospec=True) +@patch("prometheus_client.exposition.push_to_gateway") def test_energy_cumulative_counter( mock_push_to_gateway: MagicMock, mock_get_cpus: MagicMock, @@ -193,8 +193,8 @@ def test_energy_cumulative_counter( counter.labels = MagicMock(return_value=counter) counter.inc = MagicMock() - with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: - mock_urlopen.side_effect = RuntimeError("No external calls allowed") + with patch("prometheus_client.exposition.push_to_gateway") as mock_push: + mock_push.return_value = None cumulative_counter.begin_window("test_counter") cumulative_counter.end_window("test_counter") @@ -221,7 +221,7 @@ def test_energy_cumulative_counter( cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) -@patch("prometheus_client.exposition.push_to_gateway", autospec=True) +@patch("prometheus_client.exposition.push_to_gateway") @patch("zeus.device.gpu.get_gpus") def test_power_gauge( mock_push_to_gateway: MagicMock, @@ -258,8 +258,8 @@ def test_power_gauge( gauge.labels = MagicMock(return_value=gauge) gauge.set = MagicMock() - with patch("prometheus_client.exposition.urlopen", autospec=True) as mock_urlopen: - mock_urlopen.side_effect = RuntimeError("No external calls allowed") + with patch("prometheus_client.exposition.push_to_gateway") as mock_push: + mock_push.return_value = None power_gauge.begin_window("test_power_window") power_gauge.end_window("test_power_window") From 3ab4d8961a7421ef5c3cb2bbf9f94b1c430c3981 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 19:47:38 -0500 Subject: [PATCH 34/57] Patch to http.client.HTTPConnection --- tests/test_metric.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index dc7e8683..be1eaf42 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -66,7 +66,7 @@ def mock_gauge(): yield gauge -@patch("prometheus_client.exposition.push_to_gateway") +@patch("prometheus_client.exposition.push_to_gateway", autospec=True) def test_energy_histogram( mock_push_to_gateway: MagicMock, mock_get_cpus: MagicMock, @@ -79,7 +79,7 @@ def test_energy_histogram( and that the correct energy values are recorded. Args: - mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. + mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. 
@@ -111,11 +111,13 @@ def test_energy_histogram( dram_histogram.observe = MagicMock() histogram_metric.begin_window("test_window") - - with patch("prometheus_client.exposition.push_to_gateway") as mock_push: - mock_push.return_value = None + with patch("http.client.HTTPConnection", autospec=True) as mock_http: + mock_http_instance = mock_http.return_value + mock_http_instance.getresponse.return_value.code = 200 + mock_http_instance.getresponse.return_value.msg = "OK" + mock_http_instance.getresponse.return_value.info = lambda: {} + mock_http_instance.sock = MagicMock() histogram_metric.end_window("test_window") - # Assert GPU histograms were observed if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: for ( From f032488d9c1d98375db44fa7a919eda79a064a00 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Sat, 30 Nov 2024 20:07:37 -0500 Subject: [PATCH 35/57] Remove unneccessary mock --- tests/test_metric.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index be1eaf42..f5f416de 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -66,9 +66,7 @@ def mock_gauge(): yield gauge -@patch("prometheus_client.exposition.push_to_gateway", autospec=True) def test_energy_histogram( - mock_push_to_gateway: MagicMock, mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock, mock_histogram: MagicMock, @@ -79,12 +77,10 @@ def test_energy_histogram( and that the correct energy values are recorded. Args: - mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. """ - mock_push_to_gateway.return_value = None cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -161,20 +157,15 @@ def test_energy_histogram( assert energy in calls, f"Expected DRAM energy {energy} in {calls}" -@patch("prometheus_client.exposition.push_to_gateway") def test_energy_cumulative_counter( - mock_push_to_gateway: MagicMock, - mock_get_cpus: MagicMock, - mock_zeus_monitor: MagicMock, + mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock ) -> None: """Test EnergyCumulativeCounter with mocked ZeusMonitor. Args: - mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. 
""" - mock_push_to_gateway.return_value = None cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -195,10 +186,8 @@ def test_energy_cumulative_counter( counter.labels = MagicMock(return_value=counter) counter.inc = MagicMock() - with patch("prometheus_client.exposition.push_to_gateway") as mock_push: - mock_push.return_value = None - cumulative_counter.begin_window("test_counter") - cumulative_counter.end_window("test_counter") + cumulative_counter.begin_window("test_counter") + cumulative_counter.end_window("test_counter") # Assert GPU counters if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: @@ -223,10 +212,8 @@ def test_energy_cumulative_counter( cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) -@patch("prometheus_client.exposition.push_to_gateway") @patch("zeus.device.gpu.get_gpus") def test_power_gauge( - mock_push_to_gateway: MagicMock, mock_get_gpus: MagicMock, mock_power_monitor: MagicMock, mock_gauge: MagicMock, @@ -234,12 +221,10 @@ def test_power_gauge( """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. Args: - mock_push_to_gateway (MagicMock): Mocked `push_to_gateway` function for Prometheus. mock_get_gpus (MagicMock): Mocked `get_gpus` function to simulate available GPUs. mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. mock_gauge (MagicMock): Mocked Prometheus Gauge creation. """ - mock_push_to_gateway.return_value = None gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" @@ -260,10 +245,8 @@ def test_power_gauge( gauge.labels = MagicMock(return_value=gauge) gauge.set = MagicMock() - with patch("prometheus_client.exposition.push_to_gateway") as mock_push: - mock_push.return_value = None - power_gauge.begin_window("test_power_window") - power_gauge.end_window("test_power_window") + power_gauge.begin_window("test_power_window") + power_gauge.end_window("test_power_window") # Assert that the gauges were set with the correct power values if mock_power_monitor.return_value.get_power.return_value: From 73d8c8ce85d44799828741ff746acca8200623bf Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Thu, 5 Dec 2024 22:32:20 -0500 Subject: [PATCH 36/57] Add zeus.metric to the list --- zeus/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zeus/__init__.py b/zeus/__init__.py index 0c0e36ec..df9c7519 100644 --- a/zeus/__init__.py +++ b/zeus/__init__.py @@ -2,6 +2,7 @@ - [`device`][zeus.device]: Abstraction layer over compute devices - [`monitor`][zeus.monitor]: Programmatic power and energy measurement tools +- [`metric`][zeus.metric]: Tools for defining and tracking power and energy-related metrics - [`optimizer`][zeus.optimizer]: A collection of optimizers for time and energy - [`utils`][zeus.utils]: Utility functions and classes - [`callback`][zeus.callback]: Callback definition From c153ef247080954bf73ee9fec79507344ac0f638 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Thu, 5 Dec 2024 23:41:16 -0500 Subject: [PATCH 37/57] Update reference link for each class --- examples/prometheus/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/prometheus/README.md b/examples/prometheus/README.md index c09e39e5..4cc0ff61 100644 --- a/examples/prometheus/README.md +++ b/examples/prometheus/README.md @@ -19,13 +19,13 @@ You just need to download and extract the ImageNet data and mount it to the Dock ``` 1. 
Install `prometheus_client`: ```sh - pip install prometheus-client + pip install zeus-ml[prometheus] ``` ## EnergyHistogram, PowerGauge, and EnergyCumulativeCounter -- [`EnergyHistogram`][zeus.metric.EnergyHistogram]: Records energy consumption data for GPUs, CPUs, and DRAM and pushes the data to Prometheus as histogram metrics. This is useful for tracking energy usage distribution over time. -- [`PowerGauge`][zeus.metric.PowerGauge]: Monitors real-time GPU power usage and pushes the data to Prometheus as gauge metrics, which are updated at regular intervals. -- [`EnergyCumulativeCounter`][zeus.metric.EnergyCumulativeCounter]: Tracks cumulative energy consumption over time for CPUs and GPUs and pushes the results to Prometheus as counter metrics. +- [`EnergyHistogram`](https://ml.energy/zeus/reference/metric/#zeus.metric.EnergyHistogram): Records energy consumption data for GPUs, CPUs, and DRAM and pushes the data to Prometheus as histogram metrics. This is useful for tracking energy usage distribution over time. +- [`PowerGauge`](https://ml.energy/zeus/reference/metric/#zeus.metric.PowerGauge): Monitors real-time GPU power usage and pushes the data to Prometheus as gauge metrics, which are updated at regular intervals. +- [`EnergyCumulativeCounter`](https://ml.energy/zeus/reference/metric/#zeus.metric.EnergyCumulativeCounter): Tracks cumulative energy consumption over time for CPUs and GPUs and pushes the results to Prometheus as counter metrics. ## `ZeusMonitor` and `GlobalPowerLimitOptimizer` From 0aa03d4cb4ee058b7d1f900f2cca5a450ebe8ec7 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Thu, 5 Dec 2024 23:45:10 -0500 Subject: [PATCH 38/57] Move line for prometheus-client --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f7282097..e33a1c5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,10 +46,10 @@ pfo-server = ["fastapi[standard]", "pydantic<2", "lowtime", "aiofiles", "torch"] bso = ["pydantic<2"] bso-server = ["fastapi[standard]", "sqlalchemy", "pydantic<2", "python-dotenv"] migration = ["alembic", "sqlalchemy", "pydantic<2", "python-dotenv"] +prometheus = ["prometheus-client"] lint = ["ruff", "black==22.6.0", "pyright", "pandas-stubs", "transformers"] test = ["fastapi[standard]", "sqlalchemy", "pydantic<2", "pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1", "anyio==3.7.1", "aiosqlite==0.20.0", "numpy<2"] docs = ["mkdocs-material[imaging]==9.5.19", "mkdocstrings[python]==0.25.0", "mkdocs-gen-files==0.5.0", "mkdocs-literate-nav==0.6.1", "mkdocs-section-index==0.3.9", "mkdocs-redirects==1.2.1", "urllib3<2", "black"] -prometheus = ["prometheus-client"] # greenlet is for supporting apple mac silicon for sqlalchemy(https://docs.sqlalchemy.org/en/20/faq/installation.html) dev = ["zeus-ml[pfo-server,bso,bso-server,migration,prometheus,lint,test]", "greenlet"] From 0e0e63c88a9497725e9c2f4e97eb76ea282b7c22 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 6 Dec 2024 00:38:57 -0500 Subject: [PATCH 39/57] feat: Add multiprocessing dict and sync execution for begin/end window - Introduced `MonitoringProcessState` dataclass to handle each multiprocessing process. - Added `sync_execution` parameter to `begin_window` and `end_window` methods in `Metric` and its subclasses. - Updated `EnergyHistogram`, `EnergyCumulativeCounter`, and `PowerGauge` to handle `sync_execution` dynamically. - Modified `energy_monitoring_loop` and `power_monitoring_loop` to include process-safe handling. 
- Improved process cleanup and error handling for monitoring windows. --- zeus/metric.py | 268 ++++++++++++++++++++++++++++--------------------- 1 file changed, 154 insertions(+), 114 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 64a3e5bf..fcfe7aeb 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -5,9 +5,9 @@ import abc import time import warnings -import multiprocessing as mp - from typing import Sequence +import multiprocessing as mp +from dataclasses import dataclass from prometheus_client import ( CollectorRegistry, @@ -23,6 +23,14 @@ from zeus.device.cpu import get_cpus +@dataclass +class MonitoringProcessState: + """Represents the state of a monitoring window.""" + + queue: mp.Queue + proc: mp.Process + + class Metric(abc.ABC): """Abstract base class for all metric types in Zeus. @@ -31,13 +39,23 @@ class Metric(abc.ABC): """ @abc.abstractmethod - def begin_window(self, name: str): - """Start a new measurement window.""" + def begin_window(self, name: str, sync_execution: bool = None) -> None: + """Start a new measurement window. + + Args: + name (str): Name of the measurement window. + sync_execution (bool): Whether to execute synchronously. Defaults to None. + """ pass @abc.abstractmethod - def end_window(self, name: str): - """End the current measurement window and report metrics.""" + def end_window(self, name: str, sync_execution: bool = None) -> None: + """End the current measurement window and report metrics. + + Args: + name (str): Name of the measurement window. + sync_execution (bool): Whether to execute synchronously. Defaults to None. + """ pass @@ -111,57 +129,56 @@ def __init__( # Initialize GPU histograms self.gpu_histograms = {} if self.gpu_indices: - for gpu_index in gpu_indices: - self.gpu_histograms[gpu_index] = Histogram( - f"energy_monitor_gpu_{gpu_index}_energy_joules", - f"GPU {gpu_index} energy consumption", - ["window", "index"], - buckets=self.gpu_bucket_range, - registry=self.registry, - ) + self.gpu_histograms = Histogram( + "energy_monitor_gpu_energy_joules", + "GPU energy consumption", + ["window", "index"], + buckets=self.gpu_bucket_range, + registry=self.registry, + ) # Initialize CPU histograms self.cpu_histograms = {} self.dram_histograms = {} if self.cpu_indices: - for cpu_index in self.cpu_indices: - self.cpu_histograms[cpu_index] = Histogram( - f"energy_monitor_cpu_{cpu_index}_energy_joules", - f"CPU {cpu_index} energy consumption", + self.cpu_histograms = Histogram( + "energy_monitor_cpu_energy_joules", + "CPU energy consumption", + ["window", "index"], + buckets=self.cpu_bucket_range, + registry=self.registry, + ) + # Initialize CPU and DRAM histograms + if any(cpu.supportsGetDramEnergyConsumption() for cpu in get_cpus().cpus): + self.dram_histograms = Histogram( + "energy_monitor_dram_energy_joules", + "DRAM energy consumption", ["window", "index"], - buckets=self.cpu_bucket_range, + buckets=self.dram_bucket_range, registry=self.registry, ) - # Initialize CPU and DRAM histograms - # Only when CPUs are available, we check if DRAM is available. 
- for i, cpu in enumerate(get_cpus().cpus): - if cpu.supportsGetDramEnergyConsumption(): - self.dram_histograms[i] = Histogram( - f"energy_monitor_dram_{i}_energy_joules", - f"DRAM {i} energy consumption", - ["window", "index"], - buckets=self.dram_bucket_range, - registry=self.registry, - ) self.max_gpu_bucket = max(self.gpu_bucket_range) self.max_cpu_bucket = max(self.cpu_bucket_range) self.max_dram_bucket = max(self.dram_bucket_range) + self.min_gpu_bucket = min(self.gpu_bucket_range) + self.min_cpu_bucket = min(self.cpu_bucket_range) + self.min_dram_bucket = min(self.dram_bucket_range) + self.energy_monitor = ZeusMonitor( cpu_indices=cpu_indices, gpu_indices=gpu_indices ) - def begin_window(self, name: str) -> None: + def begin_window(self, name: str, sync_execution: bool = True) -> None: """Begin the energy monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool): Whether to execute synchronously. Defaults to True. """ - self.energy_monitor.begin_window( - f"__EnergyHistogram_{name}", sync_execution=True - ) + self.energy_monitor.begin_window(f"__EnergyHistogram_{name}", sync_execution) - def end_window(self, name: str) -> None: + def end_window(self, name: str, sync_execution: bool = True) -> None: """End the current energy monitoring window and record the energy data. Retrieves the energy consumption data (for GPUs, CPUs, and DRAMs) for the monitoring window @@ -169,6 +186,7 @@ def end_window(self, name: str) -> None: Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool): Whether to execute synchronously. Defaults to True. Pushes: - GPU energy data to the Prometheus Push Gateway via the associated Histogram metric. @@ -176,44 +194,56 @@ def end_window(self, name: str) -> None: - DRAM energy data to the Prometheus Push Gateway via the associated Histogram metric. 
""" measurement = self.energy_monitor.end_window( - f"__EnergyHistogram_{name}", sync_execution=True + f"__EnergyHistogram_{name}", sync_execution ) if measurement.gpu_energy: for gpu_index, gpu_energy in measurement.gpu_energy.items(): - if gpu_index in self.gpu_histograms: - self.gpu_histograms[gpu_index].labels( - window=name, index=gpu_index - ).observe(gpu_energy) + self.gpu_histograms.labels(window=name, index=gpu_index).observe( + gpu_energy + ) if gpu_energy > self.max_gpu_bucket: warnings.warn( f"GPU {gpu_index} energy {gpu_energy} exceeds the maximum bucket value of {self.max_gpu_bucket}", stacklevel=1, ) + if gpu_energy < self.min_gpu_bucket: + warnings.warn( + f"GPU {gpu_index} energy {gpu_energy} exceeds the minimum bucket value of {self.min_gpu_bucket}", + stacklevel=1, + ) if measurement.cpu_energy: for cpu_index, cpu_energy in measurement.cpu_energy.items(): - if cpu_index in self.cpu_histograms: - self.cpu_histograms[cpu_index].labels( - window=name, index=cpu_index - ).observe(cpu_energy) + self.cpu_histograms.labels(window=name, index=cpu_index).observe( + cpu_energy + ) if cpu_energy > self.max_cpu_bucket: warnings.warn( f"CPU {cpu_index} energy {cpu_energy} exceeds the maximum bucket value of {self.max_cpu_bucket}", stacklevel=1, ) + if cpu_energy < self.min_cpu_bucket: + warnings.warn( + f"CPU {cpu_index} energy {cpu_energy} exceeds the minimum bucket value of {self.min_cpu_bucket}", + stacklevel=1, + ) if measurement.dram_energy: for dram_index, dram_energy in measurement.dram_energy.items(): - if dram_index in self.dram_histograms: - self.dram_histograms[dram_index].labels( - window=name, index=dram_index - ).observe(dram_energy) + self.dram_histograms.labels(window=name, index=dram_index).observe( + dram_energy + ) if dram_energy > self.max_dram_bucket: warnings.warn( f"DRAM {dram_index} energy {dram_energy} exceeds the maximum bucket value of {self.max_dram_bucket}", stacklevel=1, ) + if dram_energy < self.min_dram_bucket: + warnings.warn( + f"CPU {dram_index} energy {dram_energy} exceeds the minimum bucket value of {self.min_dram_bucket}", + stacklevel=1, + ) push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) @@ -237,6 +267,7 @@ class EnergyCumulativeCounter(Metric): dram_counters: A dictionary storing the Prometheus Counter metrics for DRAM. queue: A multiprocessing queue used to send signals to start/stop energy monitoring. proc: A multiprocessing process that runs the energy monitoring loop. + window_state: A dictionary that maps the monitoring window names to their corresponding process state. """ def __init__( @@ -266,8 +297,9 @@ def __init__( self.dram_counters = {} self.queue = None self.proc = None + self.window_state: dict[str, MonitoringProcessState] = {} - def begin_window(self, name: str) -> None: + def begin_window(self, name: str, sync_execution: bool = False) -> None: """Begin the energy monitoring window. Starts a new multiprocessing process that monitors energy usage periodically @@ -275,6 +307,7 @@ def begin_window(self, name: str) -> None: Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. 
""" context = mp.get_context("spawn") self.queue = context.Queue() @@ -288,28 +321,33 @@ def begin_window(self, name: str) -> None: self.update_period, self.prometheus_url, self.job, + sync_execution, ), ) self.proc.start() if not self.proc.is_alive(): raise RuntimeError(f"Failed to start monitoring process for {name}.") - def end_window(self, name: str) -> None: + self.window_state[name] = MonitoringProcessState( + queue=self.queue, proc=self.proc + ) + + def end_window(self, name: str, sync_execution: bool = False) -> None: """End the energy monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. """ - if not hasattr(self, "queue") or self.queue is None: - raise RuntimeError( - "EnergyCumulativeCounter's 'queue' is not initialized. " - "Make sure 'begin_window' is called before 'end_window'." - ) + if name not in self.window_state: + raise ValueError(f"No active monitoring process found for '{name}'.") + + state = self.window_state.pop(name) self.queue.put("stop") - if self.proc is not None: - self.proc.join(timeout=20) - if self.proc.is_alive(): - self.proc.terminate() + state.proc.join(timeout=20) + + if state.proc.is_alive(): + state.proc.terminate() def energy_monitoring_loop( @@ -320,6 +358,7 @@ def energy_monitoring_loop( update_period: int, prometheus_url: str, job: str, + sync_execution: bool, ) -> None: """Runs in a separate process to collect and update energy consumption metrics (for GPUs, CPUs, and DRAM). @@ -331,6 +370,7 @@ def energy_monitoring_loop( update_period (int): The interval (in seconds) between consecutive energy data updates. prometheus_url (str): The URL of the Prometheus Push Gateway where the metrics will be pushed. job (str): The name of the Prometheus job associated with these metrics. + sync_execution (bool): Whether to execute monitoring synchronously. 
""" registry = CollectorRegistry() energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) @@ -340,64 +380,53 @@ def energy_monitoring_loop( dram_counters = {} if energy_monitor.gpu_indices: - for gpu_index in energy_monitor.gpu_indices: - gpu_counters[gpu_index] = Counter( - f"energy_monitor_gpu_{gpu_index}_energy_joules", - f"GPU {gpu_index} energy consumption", - ["window", "index"], - registry=registry, - ) + gpu_counters = Counter( + "energy_monitor_gpu_energy_joules", + "GPU energy consumption", + ["window", "index"], + registry=registry, + ) if energy_monitor.cpu_indices: - for cpu_index in energy_monitor.cpu_indices: - cpu_counters[cpu_index] = Counter( - f"energy_monitor_cpu_{cpu_index}_energy_joules", - f"CPU {cpu_index} energy consumption", + cpu_counters = Counter( + "energy_monitor_cpu_energy_joules", + "CPU energy consumption", + ["window", "index"], + registry=registry, + ) + if any(cpu.supportsGetDramEnergyConsumption() for cpu in get_cpus().cpus): + dram_counters = Counter( + "energy_monitor_dram_energy_joules", + "DRAM energy consumption", ["window", "index"], registry=registry, ) - for i, cpu in enumerate(get_cpus().cpus): - if cpu.supportsGetDramEnergyConsumption(): - dram_counters[i] = Counter( - f"energy_monitor_dram_{i}_energy_joules", - f"DRAM {i} energy consumption", - ["window", "index"], - registry=registry, - ) while True: if not pipe.empty(): break - - energy_monitor.begin_window( - f"__EnergyCumulativeCounter_{name}", sync_execution=False - ) + # Begin and end monitoring window using sync_execution + energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}", sync_execution) time.sleep(update_period) measurement = energy_monitor.end_window( - f"__EnergyCumulativeCounter_{name}", sync_execution=False + f"__EnergyCumulativeCounter_{name}", sync_execution ) if measurement.gpu_energy: for gpu_index, energy in measurement.gpu_energy.items(): - if gpu_counters and gpu_index in gpu_counters: - gpu_counters[gpu_index].labels(window=name, index=gpu_index).inc( - energy - ) + if gpu_counters: + gpu_counters.labels(window=name, index=gpu_index).inc(energy) if measurement.cpu_energy: for cpu_index, energy in measurement.cpu_energy.items(): - if cpu_counters and cpu_index in cpu_counters: - cpu_counters[cpu_index].labels(window=name, index=cpu_index).inc( - energy - ) + if cpu_counters: + cpu_counters.labels(window=name, index=cpu_index).inc(energy) if measurement.dram_energy: for dram_index, energy in measurement.dram_energy.items(): - if dram_counters and dram_index in dram_counters: - dram_counters[dram_index].labels(window=name, index=dram_index).inc( - energy - ) - + if dram_counters: + dram_counters.labels(window=name, index=dram_index).inc(energy) + # Push metrics to Prometheus push_to_gateway(prometheus_url, job=job, registry=registry) @@ -417,6 +446,7 @@ class PowerGauge(Metric): gpu_gauges (dict[int, Gauge]): Dictionary mapping GPU indices to Prometheus Gauge metrics for real-time power consumption tracking. queue: Queue for controlling the monitoring process. proc: Process running the power monitoring loop. + window_state: A dictionary mapping monitoring window names to their process state. """ def __init__( @@ -439,8 +469,9 @@ def __init__( self.prometheus_url = prometheus_url self.job = job self.gpu_gauges = {} + self.window_state: dict[str, MonitoringProcessState] = {} - def begin_window(self, name: str) -> None: + def begin_window(self, name: str, sync_execution: bool = False) -> None: """Begin the power monitoring window. 
Starts a new multiprocessing process that runs the power monitoring loop. @@ -449,7 +480,11 @@ def begin_window(self, name: str) -> None: Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. """ + if name in self.window_state: + raise ValueError(f"PowerGauge metric '{name}' already exists.") + context = mp.get_context("spawn") self.queue = context.Queue() self.proc = context.Process( @@ -464,19 +499,27 @@ def begin_window(self, name: str) -> None: ), ) self.proc.start() - time.sleep(5) + if not self.proc.is_alive(): + raise RuntimeError( + f"Failed to start power monitoring process for '{name}'." + ) + + self.window_state[name] = MonitoringProcessState( + queue=self.queue, proc=self.proc + ) - def end_window(self, name: str) -> None: + def end_window(self, name: str, sync_execution: bool = False) -> None: """End the power monitoring window. Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. + sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. """ - self.queue.put("stop") - if self.proc is not None: - self.proc.join(timeout=20) - if self.proc.is_alive(): - self.proc.terminate() + state = self.window_state.pop(name) + state.queue.put("stop") + state.proc.join(timeout=20) + if state.proc.is_alive(): + state.proc.terminate() def power_monitoring_loop( @@ -501,13 +544,12 @@ def power_monitoring_loop( power_monitor = PowerMonitor(gpu_indices=gpu_indices) registry = CollectorRegistry() - for gpu_index in gpu_indices: - gpu_gauges[gpu_index] = Gauge( - f"power_monitor_gpu_{gpu_index}_power_watts", - f"Records power consumption for GPU {gpu_index} over time", - ["gpu_index"], - registry=registry, - ) + gpu_gauges = Gauge( + "power_monitor_gpu_power_watts", + "Records power consumption for GPU over time", + ["window", "index"], + registry=registry, + ) while True: if not pipe.empty(): @@ -518,9 +560,7 @@ def power_monitoring_loop( try: if power_measurement: for gpu_index, power_value in power_measurement.items(): - gpu_gauges[gpu_index].labels( - gpu_index=f"{name}_gpu{gpu_index}" - ).set(power_value) + gpu_gauges.labels(window=name, index=gpu_index).set(power_value) except Exception as e: print(f"Error during processing power measurement: {e}") From 9228b74d678bcb27fd93ca1941b6534b48160321 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 6 Dec 2024 00:55:44 -0500 Subject: [PATCH 40/57] Add error handling for queue --- zeus/metric.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index fcfe7aeb..5f03d2be 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -8,6 +8,7 @@ from typing import Sequence import multiprocessing as mp from dataclasses import dataclass +from multiprocessing.context import SpawnProcess from prometheus_client import ( CollectorRegistry, @@ -28,7 +29,7 @@ class MonitoringProcessState: """Represents the state of a monitoring window.""" queue: mp.Queue - proc: mp.Process + proc: SpawnProcess class Metric(abc.ABC): @@ -39,7 +40,7 @@ class Metric(abc.ABC): """ @abc.abstractmethod - def begin_window(self, name: str, sync_execution: bool = None) -> None: + def begin_window(self, name: str, sync_execution: bool = False) -> None: """Start a new measurement window. 
Args: @@ -49,7 +50,7 @@ def begin_window(self, name: str, sync_execution: bool = None) -> None: pass @abc.abstractmethod - def end_window(self, name: str, sync_execution: bool = None) -> None: + def end_window(self, name: str, sync_execution: bool = False) -> None: """End the current measurement window and report metrics. Args: @@ -292,9 +293,9 @@ def __init__( self.update_period = update_period self.prometheus_url = prometheus_url self.job = job - self.gpu_counters = {} - self.cpu_counters = {} - self.dram_counters = {} + self.gpu_counters: dict[int, Counter] = {} + self.cpu_counters: dict[int, Counter] = {} + self.dram_counters: dict[int, Counter] = {} self.queue = None self.proc = None self.window_state: dict[str, MonitoringProcessState] = {} @@ -343,7 +344,11 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: raise ValueError(f"No active monitoring process found for '{name}'.") state = self.window_state.pop(name) + + if self.queue is None: + raise RuntimeError("Queue is not initialized.") self.queue.put("stop") + state.proc.join(timeout=20) if state.proc.is_alive(): @@ -516,8 +521,13 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. """ state = self.window_state.pop(name) + + if self.queue is None: + raise RuntimeError("Queue is not initialized.") state.queue.put("stop") + state.proc.join(timeout=20) + if state.proc.is_alive(): state.proc.terminate() From 866365ed74291935d94b1cae93b17ffaf8dcdd01 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 6 Dec 2024 17:45:36 -0500 Subject: [PATCH 41/57] Add a call to train() in main --- examples/prometheus/train_single.py | 77 +++++++++++++---------------- 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/examples/prometheus/train_single.py b/examples/prometheus/train_single.py index 85a674a8..63737385 100644 --- a/examples/prometheus/train_single.py +++ b/examples/prometheus/train_single.py @@ -19,6 +19,8 @@ import torchvision.datasets as datasets import torchvision.models as models from multiprocessing import set_start_method +from PIL import Image, ImageFile, UnidentifiedImageError +#set_start_method("fork", force=True) # ZEUS from zeus.monitor import ZeusMonitor @@ -110,6 +112,20 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() +ImageFile.LOAD_TRUNCATED_IMAGES = True # Optionally allow truncated images + +def remove_corrupted_images(dataset_dir): + """Remove corrupted or truncated image files from the dataset directory.""" + for root, _, files in os.walk(dataset_dir): + for file in files: + img_path = os.path.join(root, file) + try: + with Image.open(img_path) as img: + img.verify() # Verify if the image is valid + img.convert("RGB") # Ensure it's in a proper format + except (UnidentifiedImageError, OSError): + print(f"Removing corrupted or truncated file: {img_path}") + os.remove(img_path) def main(): """Main function that prepares values and spawns/calls the worker function.""" @@ -136,7 +152,11 @@ def main(): scheduler = StepLR(optimizer, step_size=30, gamma=0.1) traindir = os.path.join(args.data, "train") + #remove_corrupted_images(traindir) + valdir = os.path.join(args.data, "val") + #remove_corrupted_images(valdir) + normalize = transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) @@ -180,64 +200,32 @@ def main(): val_dataset = Subset(val_dataset, range(2)) ################################## The important part 
##################################### - # ZeusMonitor is used to profile the time and energy consumption of the GPU. - - # EnergyHistogram: Records the energy consumption during specific phases of the program execution - # and pushes it to the Prometheus Push Gateway as histogram metrics. - energy_histogram = EnergyHistogram( - cpu_indices=[0], - gpu_indices=[0], - prometheus_url='http://localhost:9091', - job='training_energy_histogram' - ) - - # PowerGauge: Monitors real-time power usage of the GPUs and pushes the data to the Prometheus - # Push Gateway as gauge metrics, updated at regular intervals. - power_gauge = PowerGauge( - gpu_indices=[0], - update_period=2, - prometheus_url='http://localhost:9091', - job='training_power_gauge' - ) + # Histogram to track energy consumption over time + energy_histogram = EnergyHistogram(cpu_indices=[0,1], gpu_indices=[0], prometheus_url='http://localhost:9091', job='training_energy_histogram') + # Gauge to track power consumption over time + power_gauge = PowerGauge(gpu_indices=[0], update_period=2, prometheus_url='http://localhost:9091', job='training_power_gauge') + # Counter to track energy consumption over time + energy_counter = EnergyCumulativeCounter(cpu_indices=[0,1], gpu_indices=[0], update_period=2, prometheus_url='http://localhost:9091', job='training_energy_counter') - # EnergyCumulativeCounter: Tracks cumulative energy consumption over time for CPUs and GPUs - # and pushes the results to the Prometheus Push Gateway as counter metrics. - energy_counter = EnergyCumulativeCounter( - cpu_indices=[0], - gpu_indices=[0], - update_period=2, - prometheus_url='http://localhost:9091', - job='training_energy_counter' - ) - - # Start monitoring real-time power usage. power_gauge.begin_window("epoch_power") - - # Start tracking cumulative energy consumption. energy_counter.begin_window("epoch_energy") - # Loop through training epochs while measuring energy and power metrics. for epoch in range(args.epochs): - # Validate the model and compute accuracy. acc1 = validate(val_loader, model, criterion, args) - - # Begin and end energy monitoring for the current epoch. - energy_histogram.begin_window(f"epoch_{epoch}_energy") - energy_histogram.end_window(f"epoch_{epoch}_energy") - + energy_histogram.begin_window("training_energy") + energy_histogram.end_window("training_energy") + train(train_loader, model, criterion, optimizer, epoch, args) print(f"Top-1 accuracy: {acc1}") - # Allow metrics to capture remaining data before shutting down monitoring. time.sleep(10) - # End the cumulative energy and power monitoring windows. energy_counter.end_window("epoch_energy") power_gauge.end_window("epoch_power") ################################## The important part ##################################### def train( - train_loader, model, criterion, optimizer, epoch, args, power_limit_optimizer + train_loader, model, criterion, optimizer, epoch, args ): batch_time = AverageMeter("Time", ":6.3f") data_time = AverageMeter("Data", ":6.3f") @@ -256,7 +244,7 @@ def train( end = time.time() for i, (images, target) in enumerate(train_loader): - power_limit_optimizer.on_step_begin() # Mark the beginning of one training step. + #power_limit_optimizer.on_step_begin() # Mark the beginning of one training step. # Load data to GPU images = images.cuda(args.gpu, non_blocking=True) @@ -280,7 +268,7 @@ def train( loss.backward() optimizer.step() - power_limit_optimizer.on_step_end() # Mark the end of one training step. 
+ #power_limit_optimizer.on_step_end() # Mark the end of one training step. # measure elapsed time batch_time.update(time.time() - end) @@ -430,3 +418,4 @@ def accuracy(output, target, topk=(1,)): if __name__ == "__main__": main() + \ No newline at end of file From 53f6c62c2bef2e1e09ee3990e26072934e2f5ef7 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 6 Dec 2024 21:41:34 -0500 Subject: [PATCH 42/57] Refactor tests for the modified code --- tests/test_metric.py | 311 ++++++++++++++++++++----------------------- zeus/metric.py | 41 ++---- 2 files changed, 159 insertions(+), 193 deletions(-) diff --git a/tests/test_metric.py b/tests/test_metric.py index f5f416de..450abd54 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -13,9 +13,12 @@ def mock_get_cpus(): """Fixture to mock `get_cpus()` to avoid RAPL-related errors.""" with patch("zeus.metric.get_cpus", autospec=True) as mock_get_cpus: - mock_cpu = MagicMock() - mock_cpu.cpus = [] - mock_get_cpus.return_value = mock_cpu + mock_cpu_0 = MagicMock() + mock_cpu_0.supportsGetDramEnergyConsumption.return_value = True + mock_cpu_1 = MagicMock() + mock_cpu_1.supportsGetDramEnergyConsumption.return_value = False + + mock_get_cpus.return_value.cpus = [mock_cpu_0, mock_cpu_1] yield mock_get_cpus @@ -25,8 +28,8 @@ def mock_zeus_monitor(): with patch("zeus.metric.ZeusMonitor", autospec=True) as zeus_monitor: mock_instance = zeus_monitor.return_value mock_instance.end_window.return_value = MagicMock( - gpu_energy={0: 30.0, 1: 35.0, 2: 40.0}, - cpu_energy={0: 20.0, 1: 25.0}, + gpu_energy={0: 50.0, 1: 100.0, 2: 200.0}, + cpu_energy={0: 40.0, 1: 50.0}, dram_energy={}, ) mock_instance.gpu_indices = [0, 1, 2] @@ -58,32 +61,22 @@ def mock_histogram(): yield histogram -@pytest.fixture -def mock_gauge(): - """Fixture to mock Prometheus Gauge creation.""" - with patch("zeus.metric.Gauge", autospec=True) as gauge: - gauge.side_effect = lambda *args, **kwargs: MagicMock() - yield gauge - - def test_energy_histogram( mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock, mock_histogram: MagicMock, ) -> None: - """Test EnergyHistogram class. - - Validates that GPU, CPU, and DRAM histograms are properly initialized, - and that the correct energy values are recorded. - - Args: - mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. - mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. - mock_histogram (MagicMock): Mocked Prometheus Histogram fixture. 
- """ + """Test EnergyHistogram class.""" cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" + window_name = "test_window" + + # Ensure mocked CPUs have the required method + mock_get_cpus.return_value.cpus = [ + MagicMock(supportsGetDramEnergyConsumption=MagicMock(return_value=True)), + MagicMock(supportsGetDramEnergyConsumption=MagicMock(return_value=False)), + ] histogram_metric = EnergyHistogram( cpu_indices=cpu_indices, @@ -91,85 +84,89 @@ def test_energy_histogram( prometheus_url=prometheus_url, job="test_energy_histogram", ) - if histogram_metric.gpu_histograms: - for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items(): - gpu_histogram.labels = MagicMock(return_value=gpu_histogram) - gpu_histogram.observe = MagicMock() - - if histogram_metric.cpu_histograms: - for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items(): - cpu_histogram.labels = MagicMock(return_value=cpu_histogram) - cpu_histogram.observe = MagicMock() - - if histogram_metric.dram_histograms: - for _dram_index, dram_histogram in histogram_metric.dram_histograms.items(): - dram_histogram.labels = MagicMock(return_value=dram_histogram) - dram_histogram.observe = MagicMock() - - histogram_metric.begin_window("test_window") + + # Mock single Histogram objects for GPU, CPU, and DRAM + gpu_mock_histogram = mock_histogram( + name="gpu_energy_histogram", + documentation="Mocked GPU histogram", + labelnames=["window", "index"], + ) + + cpu_mock_histogram = mock_histogram( + name="cpu_energy_histogram", + documentation="Mocked CPU histogram", + labelnames=["window", "index"], + ) + + dram_mock_histogram = mock_histogram( + name="dram_energy_histogram", + documentation="Mocked DRAM histogram", + labelnames=["window", "index"], + ) + + # Attach mocked histograms to the metric + histogram_metric.gpu_histograms = gpu_mock_histogram + histogram_metric.cpu_histograms = cpu_mock_histogram + histogram_metric.dram_histograms = dram_mock_histogram + + # Begin and end the monitoring window + histogram_metric.begin_window(window_name) with patch("http.client.HTTPConnection", autospec=True) as mock_http: mock_http_instance = mock_http.return_value mock_http_instance.getresponse.return_value.code = 200 mock_http_instance.getresponse.return_value.msg = "OK" mock_http_instance.getresponse.return_value.info = lambda: {} mock_http_instance.sock = MagicMock() - histogram_metric.end_window("test_window") - # Assert GPU histograms were observed - if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: - for ( - gpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - calls = [ - call[0][0] - for call in histogram_metric.gpu_histograms[ - gpu_index - ].observe.call_args_list - ] - assert energy in calls, f"Expected {energy} in {calls}" - - # Assert CPU histograms were observed - if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy: - for ( - cpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - calls = [ - call[0][0] - for call in histogram_metric.cpu_histograms[ - cpu_index - ].observe.call_args_list - ] - assert energy in calls, f"Expected CPU energy {energy} in {calls}" - - # Assert DRAM histograms were observed - if mock_zeus_monitor.return_value.end_window.return_value.dram_energy: - for ( - dram_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): - calls = [ - call[0][0] - for call in 
histogram_metric.dram_histograms[ - dram_index - ].observe.call_args_list - ] - assert energy in calls, f"Expected DRAM energy {energy} in {calls}" - - + histogram_metric.end_window(window_name) + + # Validate GPU histogram observations + for ( + gpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): + gpu_mock_histogram.labels.assert_any_call(window=window_name, index=gpu_index) + gpu_mock_histogram.labels.return_value.observe.assert_any_call(energy) + + # Validate CPU histogram observations + for ( + cpu_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): + cpu_mock_histogram.labels.assert_any_call(window=window_name, index=cpu_index) + cpu_mock_histogram.labels.return_value.observe.assert_any_call(energy) + + # Validate DRAM histogram observations + for ( + dram_index, + energy, + ) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items(): + dram_mock_histogram.labels.assert_any_call(window=window_name, index=dram_index) + dram_mock_histogram.labels.return_value.observe.assert_any_call(energy) + + +@patch("zeus.metric.energy_monitoring_loop", autospec=True) +@patch("zeus.metric.mp.get_context", autospec=True) def test_energy_cumulative_counter( - mock_get_cpus: MagicMock, mock_zeus_monitor: MagicMock -) -> None: - """Test EnergyCumulativeCounter with mocked ZeusMonitor. - - Args: - mock_get_cpus (MagicMock): Mocked `get_cpus` fixture. - mock_zeus_monitor (MagicMock): Mocked ZeusMonitor fixture. - """ + mock_mp_context: MagicMock, + mock_energy_monitoring_loop: MagicMock, +): + """Test EnergyCumulativeCounter with mocked subprocess behavior.""" cpu_indices = [0, 1] gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" + # Mock the context and queue + mock_queue = MagicMock() + mock_process = MagicMock() + mock_mp_context.return_value.Queue.return_value = mock_queue + mock_mp_context.return_value.Process.return_value = mock_process + + # Mock the behavior of subprocess + mock_energy_monitoring_loop.return_value = ( + None # Simulate the subprocess running without errors + ) + + # Create the EnergyCumulativeCounter instance cumulative_counter = EnergyCumulativeCounter( cpu_indices=cpu_indices, gpu_indices=gpu_indices, @@ -178,90 +175,78 @@ def test_energy_cumulative_counter( job="test_energy_counter", ) - for counters in [ - cumulative_counter.gpu_counters, - cumulative_counter.cpu_counters, - ]: - for counter in counters.values(): - counter.labels = MagicMock(return_value=counter) - counter.inc = MagicMock() - + # Begin a monitoring window cumulative_counter.begin_window("test_counter") + + # Assert that the subprocess was started with the correct arguments + mock_mp_context.return_value.Process.assert_called_once_with( + target=mock_energy_monitoring_loop, + args=( + "test_counter", + mock_queue, + cpu_indices, + gpu_indices, + 2, + prometheus_url, + "test_energy_counter", + False, + ), + ) + mock_process.start.assert_called_once() + + # End the monitoring window cumulative_counter.end_window("test_counter") + mock_queue.put.assert_called_once_with("stop") + mock_process.join.assert_called_once() - # Assert GPU counters - if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy: - for ( - gpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items(): - assert ( - gpu_index in cumulative_counter.gpu_counters - ), f"GPU counter for index {gpu_index} not initialized" - 
cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy) - - # Assert CPU counters - if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy: - for ( - cpu_index, - energy, - ) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items(): - assert ( - cpu_index in cumulative_counter.cpu_counters - ), f"CPU counter for index {cpu_index} not initialized" - cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy) - - -@patch("zeus.device.gpu.get_gpus") -def test_power_gauge( - mock_get_gpus: MagicMock, - mock_power_monitor: MagicMock, - mock_gauge: MagicMock, -) -> None: - """Test PowerGauge with mocked PowerMonitor and Prometheus Gauges. - Args: - mock_get_gpus (MagicMock): Mocked `get_gpus` function to simulate available GPUs. - mock_power_monitor (MagicMock): Mocked PowerMonitor to simulate GPU power data. - mock_gauge (MagicMock): Mocked Prometheus Gauge creation. - """ +@patch("zeus.metric.power_monitoring_loop", autospec=True) +@patch("zeus.metric.mp.get_context", autospec=True) +def test_power_gauge( + mock_mp_context: MagicMock, + mock_power_monitoring_loop: MagicMock, +): + """Test PowerGauge with mocked subprocess behavior.""" gpu_indices = [0, 1, 2] prometheus_url = "http://localhost:9091" - # Mock `get_gpus` to simulate available GPUs - mock_get_gpus.return_value = MagicMock() - mock_get_gpus.return_value.gpus = gpu_indices + # Mock the context and queue + mock_queue = MagicMock() + mock_process = MagicMock() + mock_mp_context.return_value.Queue.return_value = mock_queue + mock_mp_context.return_value.Process.return_value = mock_process - mock_gauge.side_effect = lambda *args, **kwargs: MagicMock() + # Mock the behavior of subprocess + mock_power_monitoring_loop.return_value = ( + None # Simulate the subprocess running without errors + ) + # Create the EnergyCumulativeCounter instance power_gauge = PowerGauge( gpu_indices=gpu_indices, update_period=2, prometheus_url=prometheus_url, job="test_power_gauge", ) - if power_gauge.gpu_gauges: - for _gpu_index, gauge in power_gauge.gpu_gauges.items(): - gauge.labels = MagicMock(return_value=gauge) - gauge.set = MagicMock() - - power_gauge.begin_window("test_power_window") - power_gauge.end_window("test_power_window") - - # Assert that the gauges were set with the correct power values - if mock_power_monitor.return_value.get_power.return_value: - for ( - gpu_index, - power_value, - ) in mock_power_monitor.return_value.get_power.return_value.items(): - try: - # Check if `labels` was called with the correct arguments - power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with( - gpu_index=gpu_index, window="test_power_window" - ) - power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with( - power_value - ) - except AssertionError as e: - print(f"AssertionError for GPU {gpu_index}:") - raise e + + # Begin a monitoring window + power_gauge.begin_window("test_power_gauge") + + # Assert that the subprocess was started with the correct arguments + mock_mp_context.return_value.Process.assert_called_once_with( + target=mock_power_monitoring_loop, + args=( + "test_power_gauge", + mock_queue, + gpu_indices, + 2, + prometheus_url, + "test_power_gauge", + ), + ) + mock_process.start.assert_called_once() + + # End the monitoring window + power_gauge.end_window("test_power_gauge") + mock_queue.put.assert_called_once_with("stop") + mock_process.join.assert_called_once() diff --git a/zeus/metric.py b/zeus/metric.py index 5f03d2be..e36885ea 100644 --- a/zeus/metric.py +++ 
b/zeus/metric.py @@ -8,7 +8,6 @@ from typing import Sequence import multiprocessing as mp from dataclasses import dataclass -from multiprocessing.context import SpawnProcess from prometheus_client import ( CollectorRegistry, @@ -29,7 +28,7 @@ class MonitoringProcessState: """Represents the state of a monitoring window.""" queue: mp.Queue - proc: SpawnProcess + proc: mp.Process class Metric(abc.ABC): @@ -40,7 +39,7 @@ class Metric(abc.ABC): """ @abc.abstractmethod - def begin_window(self, name: str, sync_execution: bool = False) -> None: + def begin_window(self, name: str, sync_execution: bool = None) -> None: """Start a new measurement window. Args: @@ -50,7 +49,7 @@ def begin_window(self, name: str, sync_execution: bool = False) -> None: pass @abc.abstractmethod - def end_window(self, name: str, sync_execution: bool = False) -> None: + def end_window(self, name: str, sync_execution: bool = None) -> None: """End the current measurement window and report metrics. Args: @@ -73,6 +72,9 @@ class EnergyHistogram(Metric): gpu_bucket_range: Histogram buckets for GPU energy. cpu_bucket_range: Histogram buckets for CPU energy. dram_bucket_range: Histogram buckets for DRAM energy. + gpu_histograms: A single Prometheus Histogram metric for all GPU energy consumption, indexed by window and GPU index. + cpu_histograms: A single Prometheus Histogram metric for all CPU energy consumption, indexed by window and CPU index. + dram_histograms: A single Prometheus Histogram metric for all DRAM energy consumption, indexed by window and DRAM index. """ def __init__( @@ -82,7 +84,7 @@ def __init__( prometheus_url: str, job: str, gpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], - cpu_bucket_range: Sequence[float] = [10.0, 20.0, 50.0, 100.0, 200.0], + cpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], dram_bucket_range: Sequence[float] = [5.0, 10.0, 20.0, 50.0, 150.0], ) -> None: """Initialize the EnergyHistogram class. @@ -128,7 +130,6 @@ def __init__( ) # Initialize GPU histograms - self.gpu_histograms = {} if self.gpu_indices: self.gpu_histograms = Histogram( "energy_monitor_gpu_energy_joules", @@ -138,8 +139,6 @@ def __init__( registry=self.registry, ) # Initialize CPU histograms - self.cpu_histograms = {} - self.dram_histograms = {} if self.cpu_indices: self.cpu_histograms = Histogram( "energy_monitor_cpu_energy_joules", @@ -263,9 +262,9 @@ class EnergyCumulativeCounter(Metric): update_period: The interval (in seconds) between consecutive energy data updates. prometheus_url: The URL of the Prometheus Push Gateway where the Counter metrics will be pushed. job: The name of the job associated with the energy monitoring in Prometheus. - gpu_counters: A dictionary storing the Prometheus Counter metrics for each GPU. - cpu_counters: A dictionary storing the Prometheus Counter metrics for each CPU. - dram_counters: A dictionary storing the Prometheus Counter metrics for DRAM. + gpu_counters: A single Prometheus Counter metric for all GPU energy consumption, indexed by window and GPU index. + cpu_counters: A single Prometheus Counter metric for all CPU energy consumption, indexed by window and CPU index. + dram_counters: A single Prometheus Counter metric for all DRAM energy consumption, indexed by window and DRAM index. queue: A multiprocessing queue used to send signals to start/stop energy monitoring. proc: A multiprocessing process that runs the energy monitoring loop. 
window_state: A dictionary that maps the monitoring window names to their corresponding process state. @@ -293,9 +292,6 @@ def __init__( self.update_period = update_period self.prometheus_url = prometheus_url self.job = job - self.gpu_counters: dict[int, Counter] = {} - self.cpu_counters: dict[int, Counter] = {} - self.dram_counters: dict[int, Counter] = {} self.queue = None self.proc = None self.window_state: dict[str, MonitoringProcessState] = {} @@ -344,11 +340,7 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: raise ValueError(f"No active monitoring process found for '{name}'.") state = self.window_state.pop(name) - - if self.queue is None: - raise RuntimeError("Queue is not initialized.") self.queue.put("stop") - state.proc.join(timeout=20) if state.proc.is_alive(): @@ -380,10 +372,6 @@ def energy_monitoring_loop( registry = CollectorRegistry() energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) - gpu_counters = {} - cpu_counters = {} - dram_counters = {} - if energy_monitor.gpu_indices: gpu_counters = Counter( "energy_monitor_gpu_energy_joules", @@ -448,7 +436,7 @@ class PowerGauge(Metric): update_period: Time interval (in seconds) between consecutive power measurements. prometheus_url: URL of the Prometheus Push Gateway where Gauge metrics are pushed. job: Name of the Prometheus job associated with the power metrics. - gpu_gauges (dict[int, Gauge]): Dictionary mapping GPU indices to Prometheus Gauge metrics for real-time power consumption tracking. + gpu_gauges: A single Prometheus Gauge metrics for real-time power consumption tracking. queue: Queue for controlling the monitoring process. proc: Process running the power monitoring loop. window_state: A dictionary mapping monitoring window names to their process state. @@ -473,7 +461,6 @@ def __init__( self.update_period = update_period self.prometheus_url = prometheus_url self.job = job - self.gpu_gauges = {} self.window_state: dict[str, MonitoringProcessState] = {} def begin_window(self, name: str, sync_execution: bool = False) -> None: @@ -521,13 +508,8 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. """ state = self.window_state.pop(name) - - if self.queue is None: - raise RuntimeError("Queue is not initialized.") state.queue.put("stop") - state.proc.join(timeout=20) - if state.proc.is_alive(): state.proc.terminate() @@ -550,7 +532,6 @@ def power_monitoring_loop( prometheus_url (str): URL of the Prometheus Push Gateway where metrics are pushed. job (str): Name of the Prometheus job to associate with the metrics. """ - gpu_gauges = {} power_monitor = PowerMonitor(gpu_indices=gpu_indices) registry = CollectorRegistry() From c014ff1d0704c44becdeab571561d759e1836a1c Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Mon, 9 Dec 2024 13:06:04 -0500 Subject: [PATCH 43/57] Reformat the metric monitoring section for consistency --- docs/measure/index.md | 136 ++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 79 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 59acc265..c7803615 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -116,55 +116,17 @@ if __name__ == "__main__": ``` ## Metric Monitoring -To monitor energy and power consumption effectively using Zeus, Prometheus and the Prometheus Push Gateway must be properly set up. 
This section outlines the assumptions and provides a guide to configure Prometheus and the Push Gateway. +Zeus allows for efficient monitoring of energy and power consumption for GPUs, CPUs, and DRAM using Prometheus. It tracks key metrics such as energy usage, power draw, and cumulative consumption. Users can define measurement windows to track energy usage for specific operations, enabling granular analysis and optimization. -### **Metric Name Construction** -Zeus organizes metrics using **static metric names** and **dynamic labels** to ensure flexibility and ease of querying in Prometheus. Below, we document how metric names are constructed and how users can query them effectively. +!!! Assumption + A Prometheus Push Gateway must be deployed and accessible at the URL provided in your Zeus configuration. **This ensures that metrics collected by Zeus can be pushed to Prometheus.** -Currently, metric names (e.g., `energy_monitor_gpu_{gpu_index}_energy_joules`) are static and cannot be overridden. However, users can customize the name of the window to define the context of the metrics. +### Setup Guide -- **Metric Name**: `energy_monitor_gpu_{gpu_index}_energy_joules` -- **Labels**: - - `window`: The user-defined window name provided during `begin_window()` and `end_window()` (e.g. energy_histogram.begin_window(f"epoch_{epoch}_energy")). - - `index`: The GPU index (e.g., `0` for GPU 0). - - -## How to Query Metrics in Prometheus - -### 1. Query Metrics for a Specific Window -Retrieve energy metrics for a GPU during a specific window: -```promql -energy_monitor_gpu_0_energy_joules{window="epoch_1_step_0"} -``` -```promql -sum(energy_monitor_gpu_0_energy_joules) by (window) -``` ---- - -## Assumptions - -1. **Prometheus Push Gateway Deployment** - A Prometheus Push Gateway must be deployed and accessible at the URL provided in your Zeus configuration. This ensures that metrics collected by Zeus can be pushed to Prometheus. - -2. **Prometheus Configuration** - Prometheus is configured to scrape data from the Push Gateway. This involves adding the Push Gateway URL to the Prometheus `prometheus.yml` configuration file. - -3. **Network Accessibility** - If the Push Gateway and Prometheus are hosted on a remote server, ensure that firewall settings allow traffic on the required ports: - - **Push Gateway**: Port `9091` - - **Prometheus**: Port `9090` - -4. **Optional Visualization Tools** - Tools like Grafana can be integrated with Prometheus to create detailed visualizations of the metrics collected. - ---- - -## Setup Guide - -### Step 1: Install and Start the Prometheus Push Gateway +#### Step 1: Install and Start the Prometheus Push Gateway Choose either Option 1 (Binary) or Option 2 (Docker). -#### Option 1: Download Binary +##### Option 1: Download Binary 1. Visit the [Prometheus Push Gateway Download Page](https://prometheus.io/download/#pushgateway). 2. Download the appropriate binary for your operating system. 3. Extract the binary: @@ -178,7 +140,7 @@ Choose either Option 1 (Binary) or Option 2 (Docker). ``` 5. Verify the Push Gateway is running by visiting http://localhost:9091 in your browser. -### Option 2: Using Docker +##### Option 2: Using Docker 1. Pull the official Prometheus Push Gateway Docker image: ```sh docker pull prom/pushgateway @@ -189,7 +151,7 @@ docker run -d -p 9091:9091 prom/pushgateway ``` 3. Verify it is running by visiting http://localhost:9091 in your browser. -### Step 2: Install and Configure Prometheus +#### Step 2: Install and Configure Prometheus 1. 
Visit the Prometheus [Prometheus Download Page](https://prometheus.io/download/#prometheus). 2. Download the appropriate binary for your operating system. 3. Extract the binary: @@ -212,16 +174,25 @@ scrape_configs: 6. Visit http://localhost:9090 in your browser, or use curl http://localhost:9090/api/v1/targets 7. Verify Prometheus is running by visiting http://localhost:9090 in your browser. -Zeus allows you to monitor energy and power consumption through different metrics, such as Histograms, Counters, and Gauges, which can be pushed to a Prometheus Push Gateway for further analysis. +### Metric Name Construction ---- +Zeus organizes metrics using **static metric names** and **dynamic labels** for flexibility and ease of querying in Prometheus. Metric names are static and cannot be overridden, but users can customize the context of the metrics by naming the window when using `begin_window()` and `end_window()`. -[`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. +#### Metric Name +- For Histogram: `energy_monitor_{component}_energy_joules` +- For Counter: `energy_monitor_{component}_energy_joules` +- For Gauge: `power_monitor_gpu_power_watts` -You can customize the bucket ranges for each component (GPU, CPU, and DRAM), or let Zeus use default ranges. +component: gpu, cpu, or dram -```python hl_lines="2 5-15" +#### Labels +- window: The user-defined window name provided during `begin_window()` and `end_window()` (e.g., `energy_histogram.begin_window(f"epoch_energy")`). +- index: The GPU index (e.g., `0` for GPU 0). +### Usage and Initialization +[`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. + +```python hl_lines="2 5-15" from zeus.metric import EnergyHistogram if __name__ == "__main__": @@ -235,21 +206,12 @@ if __name__ == "__main__": for epoch in range(100): # Start monitoring energy for the entire epoch - energy_histogram.begin_window(f"epoch_{epoch}_energy") - - # Step-level monitoring - for step_idx, (x, y) in enumerate(train_loader): - energy_histogram.begin_window(f"epoch_{epoch}_step_{step_idx}_energy") - train_one_step(x, y) - energy_histogram.end_window(f"epoch_{epoch}_step_{step_idx}_energy") - - # Perform epoch-level operations (e.g., aggregation) + energy_histogram.begin_window("epoch_energy") + # Perform epoch-level operations train_one_epoch(train_loader, model, optimizer, criterion, epoch, args) acc1 = validate(val_loader, model, criterion, args) - # End monitoring energy for the epoch - energy_histogram.end_window(f"epoch_{epoch}_energy") - + energy_histogram.end_window("epoch_energy") print(f"Epoch {epoch} completed. Validation Accuracy: {acc1}%") ``` @@ -257,24 +219,24 @@ You can use the `begin_window` and `end_window` methods to define a measurement !!! Tip You can customize the bucket ranges for GPUs, CPUs, and DRAM during initialization to tailor the granularity of energy monitoring. 
For example: - -```python hl_lines="2 5-15" -energy_histogram = EnergyHistogram( - cpu_indices=[0], - gpu_indices=[0], - prometheus_url='http://localhost:9091', - job='training_energy_histogram', - gpu_bucket_range = [10.0, 25.0, 50.0, 100.0], - cpu_bucket_range = [5.0, 15.0, 30.0, 50.0], - dram_bucket_range = [2.0, 8.0, 20.0, 40.0], -) -``` + ```python hl_lines="2 5-15" + energy_histogram = EnergyHistogram( + cpu_indices=[0], + gpu_indices=[0], + prometheus_url='http://localhost:9091', + job='training_energy_histogram', + gpu_bucket_range = [10.0, 25.0, 50.0, 100.0], + cpu_bucket_range = [5.0, 15.0, 30.0, 50.0], + dram_bucket_range = [2.0, 8.0, 20.0, 40.0], + ) + ``` If no custom `bucket ranges` are specified, Zeus uses these default ranges: -- **GPU**: `[50.0, 100.0, 200.0, 500.0, 1000.0]` -- **CPU**: `[10.0, 20.0, 50.0, 100.0, 200.0]` -- **DRAM**: `[5.0, 10.0, 20.0, 50.0, 150.0]` - +``` +- GPU: [50.0, 100.0, 200.0, 500.0, 1000.0] +- CPU: [10.0, 20.0, 50.0, 100.0, 200.0] +- DRAM: [5.0, 10.0, 20.0, 50.0, 150.0] +``` !!! Warning Empty bucket ranges (e.g., []) are not allowed and will raise an error. Ensure you provide a valid range for each device or use the defaults. @@ -345,6 +307,22 @@ if __name__ == "__main__": ``` The `update_period` parameter defines how often the power datas are updated and pushed to Prometheus. + +### How to Query Metrics in Prometheus + +#### Query to View Energy for a Specific Window +```promql +energy_monitor_gpu_energy_joules{window="epoch_energy"} +``` +#### Query to Sum Energy for a Specific Window +```promql +sum(energy_monitor_gpu_energy_joules) by (window) +``` +#### Query to Sum Energy for Specific GPU Across All Windows +```promql +sum(energy_monitor_gpu_energy_joules{index="0"}) +``` + ## CLI power and energy monitor The energy monitor measures the total energy consumed by the GPU during the lifetime of the monitor process. From f7e5d799f69a7ec156311cb15e6afc335045676f Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Mon, 9 Dec 2024 13:11:03 -0500 Subject: [PATCH 44/57] Setup Guide -> Local Setup Guide --- docs/measure/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index c7803615..02edd2a4 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -121,7 +121,7 @@ Zeus allows for efficient monitoring of energy and power consumption for GPUs, C !!! Assumption A Prometheus Push Gateway must be deployed and accessible at the URL provided in your Zeus configuration. **This ensures that metrics collected by Zeus can be pushed to Prometheus.** -### Setup Guide +### Local Setup Guide #### Step 1: Install and Start the Prometheus Push Gateway Choose either Option 1 (Binary) or Option 2 (Docker). 
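Aside (not part of any commit in this series): after following the local setup guide above, it can help to confirm the freshly started Push Gateway actually accepts pushes before wiring Zeus metrics into a training job. The sketch below is a minimal, hypothetical smoke test using `prometheus_client` directly; the metric name `zeus_setup_check` and the `http://localhost:9091` URL are placeholders for whatever your deployment uses.

```python
# Hypothetical smoke test: confirm the Push Gateway from the local setup guide
# is reachable before relying on Zeus to push metrics to it.
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

registry = CollectorRegistry()
gauge = Gauge(
    "zeus_setup_check",
    "Dummy metric used only to confirm the Push Gateway accepts pushes",
    registry=registry,
)
gauge.set(1.0)

# push_to_gateway raises a connection error if the gateway is not reachable.
push_to_gateway("http://localhost:9091", job="zeus_setup_check", registry=registry)
print("Push Gateway reachable; the 'zeus_setup_check' job should now appear at http://localhost:9091.")
```

If the push succeeds, the same URL can be passed to the Zeus metric classes as shown in the documentation changes above.
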
From f8d5b6768f033eb5816ab1b5671474ac27ab6e4f Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Mon, 9 Dec 2024 16:16:44 -0500 Subject: [PATCH 45/57] Add condition for using put() with empty queue --- zeus/metric.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index e36885ea..9882cd05 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -28,7 +28,7 @@ class MonitoringProcessState: """Represents the state of a monitoring window.""" queue: mp.Queue - proc: mp.Process + proc: mp.context.SpawnProcess class Metric(abc.ABC): @@ -39,7 +39,7 @@ class Metric(abc.ABC): """ @abc.abstractmethod - def begin_window(self, name: str, sync_execution: bool = None) -> None: + def begin_window(self, name: str, sync_execution: bool = True) -> None: """Start a new measurement window. Args: @@ -49,7 +49,7 @@ def begin_window(self, name: str, sync_execution: bool = None) -> None: pass @abc.abstractmethod - def end_window(self, name: str, sync_execution: bool = None) -> None: + def end_window(self, name: str, sync_execution: bool = True) -> None: """End the current measurement window and report metrics. Args: @@ -84,7 +84,7 @@ def __init__( prometheus_url: str, job: str, gpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], - cpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], + cpu_bucket_range: Sequence[float] = [10.0, 50.0, 100.0, 500.0, 1000.0], dram_bucket_range: Sequence[float] = [5.0, 10.0, 20.0, 50.0, 150.0], ) -> None: """Initialize the EnergyHistogram class. @@ -340,7 +340,10 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: raise ValueError(f"No active monitoring process found for '{name}'.") state = self.window_state.pop(name) - self.queue.put("stop") + if self.queue is not None: + self.queue.put("stop") + else: + raise RuntimeError("Queue is not initialized") state.proc.join(timeout=20) if state.proc.is_alive(): @@ -371,6 +374,9 @@ def energy_monitoring_loop( """ registry = CollectorRegistry() energy_monitor = ZeusMonitor(cpu_indices=cpu_indices, gpu_indices=gpu_indices) + gpu_counters = None + cpu_counters = None + dram_counters = None if energy_monitor.gpu_indices: gpu_counters = Counter( @@ -508,7 +514,10 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. 
""" state = self.window_state.pop(name) - state.queue.put("stop") + if self.queue is not None: + self.queue.put("stop") + else: + raise RuntimeError("Queue is not initialized") state.proc.join(timeout=20) if state.proc.is_alive(): state.proc.terminate() From 8c5456e1a0b2d1bec168715ed222b3dbdbeaf573 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Mon, 9 Dec 2024 16:29:37 -0500 Subject: [PATCH 46/57] Import the SpawnProcess class from multiprocessing.context --- zeus/metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zeus/metric.py b/zeus/metric.py index 9882cd05..14f38fa6 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -7,6 +7,7 @@ import warnings from typing import Sequence import multiprocessing as mp +from multiprocessing.context import SpawnProcess from dataclasses import dataclass from prometheus_client import ( @@ -28,7 +29,7 @@ class MonitoringProcessState: """Represents the state of a monitoring window.""" queue: mp.Queue - proc: mp.context.SpawnProcess + proc: SpawnProcess class Metric(abc.ABC): From 4c2e794c18d65a5b4c111247ba52d5537baeb510 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:32:26 -0500 Subject: [PATCH 47/57] Update docs/measure/index.md Add link to the push gateway Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 02edd2a4..46c0e2d5 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -119,7 +119,7 @@ if __name__ == "__main__": Zeus allows for efficient monitoring of energy and power consumption for GPUs, CPUs, and DRAM using Prometheus. It tracks key metrics such as energy usage, power draw, and cumulative consumption. Users can define measurement windows to track energy usage for specific operations, enabling granular analysis and optimization. !!! Assumption - A Prometheus Push Gateway must be deployed and accessible at the URL provided in your Zeus configuration. **This ensures that metrics collected by Zeus can be pushed to Prometheus.** + A [Prometheus Push Gateway](https://prometheus.io/docs/instrumenting/pushing/) must be deployed and accessible. This ensures that metrics collected by Zeus can be pushed to Prometheus. ### Local Setup Guide From d8a6f1caef5fd91ab6645b00d8cd68139a07fdfc Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:33:24 -0500 Subject: [PATCH 48/57] Update docs/measure/index.md Generalize the device as {component} with the note that Gauge only supports GPU Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index 46c0e2d5..e51050d8 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -181,7 +181,10 @@ Zeus organizes metrics using **static metric names** and **dynamic labels** for #### Metric Name - For Histogram: `energy_monitor_{component}_energy_joules` - For Counter: `energy_monitor_{component}_energy_joules` -- For Gauge: `power_monitor_gpu_power_watts` +- For Gauge: `power_monitor_{component}_power_watts` + +Note that Gauge only supports the GPU component at the moment. 
Tracking issue: [#128](https://github.com/ml-energy/zeus/issues/128) + component: gpu, cpu, or dram From d85f255d49fac2fef765fe8f1e10944ec1f59688 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:33:43 -0500 Subject: [PATCH 49/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index e51050d8..c669d03e 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -313,15 +313,17 @@ The `update_period` parameter defines how often the power datas are updated and ### How to Query Metrics in Prometheus -#### Query to View Energy for a Specific Window +Energy for a specific window: ```promql energy_monitor_gpu_energy_joules{window="epoch_energy"} ``` -#### Query to Sum Energy for a Specific Window + +Sum of energy for a specific window: ```promql sum(energy_monitor_gpu_energy_joules) by (window) ``` -#### Query to Sum Energy for Specific GPU Across All Windows + +Sum of energy for specific GPU across all windows: ```promql sum(energy_monitor_gpu_energy_joules{index="0"}) ``` From 5f9cc6ba367b1e91c70c855c69a9ce6720a47b2c Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:35:08 -0500 Subject: [PATCH 50/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index c669d03e..fbe7fbda 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -228,9 +228,9 @@ You can use the `begin_window` and `end_window` methods to define a measurement gpu_indices=[0], prometheus_url='http://localhost:9091', job='training_energy_histogram', - gpu_bucket_range = [10.0, 25.0, 50.0, 100.0], - cpu_bucket_range = [5.0, 15.0, 30.0, 50.0], - dram_bucket_range = [2.0, 8.0, 20.0, 40.0], + gpu_bucket_range=[10.0, 25.0, 50.0, 100.0], + cpu_bucket_range=[5.0, 15.0, 30.0, 50.0], + dram_bucket_range=[2.0, 8.0, 20.0, 40.0], ) ``` From 0f8d5506d72c8e67412b8c82ee2fd16d9d060f10 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:35:21 -0500 Subject: [PATCH 51/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index fbe7fbda..af7fb789 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -190,7 +190,7 @@ component: gpu, cpu, or dram #### Labels - window: The user-defined window name provided during `begin_window()` and `end_window()` (e.g., `energy_histogram.begin_window(f"epoch_energy")`). -- index: The GPU index (e.g., `0` for GPU 0). +- index: The index of the device (e.g., `0` for GPU 0). ### Usage and Initialization [`EnergyHistogram`][zeus.metric.EnergyHistogram] records energy consumption data for GPUs, CPUs, and DRAM in Prometheus Histograms. This is ideal for observing how often energy usage falls within specific ranges. 
From 49acc9a8fda9d8c8362df60bf95bc1002454fd4b Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:35:35 -0500 Subject: [PATCH 52/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index af7fb789..d8d3f5e1 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -226,8 +226,8 @@ You can use the `begin_window` and `end_window` methods to define a measurement energy_histogram = EnergyHistogram( cpu_indices=[0], gpu_indices=[0], - prometheus_url='http://localhost:9091', - job='training_energy_histogram', + prometheus_url="http://localhost:9091", + job="training_energy_histogram", gpu_bucket_range=[10.0, 25.0, 50.0, 100.0], cpu_bucket_range=[5.0, 15.0, 30.0, 50.0], dram_bucket_range=[2.0, 8.0, 20.0, 40.0], From f18ecb917a64178f2217a1cb66c1e7aeca33b0b1 Mon Sep 17 00:00:00 2001 From: Sharon Seungyu Han <87476439+sharonsyh@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:35:45 -0500 Subject: [PATCH 53/57] Update docs/measure/index.md Co-authored-by: Jae-Won Chung --- docs/measure/index.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/measure/index.md b/docs/measure/index.md index d8d3f5e1..d7bf16af 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -235,11 +235,10 @@ You can use the `begin_window` and `end_window` methods to define a measurement ``` If no custom `bucket ranges` are specified, Zeus uses these default ranges: -``` -- GPU: [50.0, 100.0, 200.0, 500.0, 1000.0] -- CPU: [10.0, 20.0, 50.0, 100.0, 200.0] -- DRAM: [5.0, 10.0, 20.0, 50.0, 150.0] -``` + +- GPU: `[50.0, 100.0, 200.0, 500.0, 1000.0]` +- CPU: `[10.0, 20.0, 50.0, 100.0, 200.0]` +- DRAM: `[5.0, 10.0, 20.0, 50.0, 150.0]` !!! Warning Empty bucket ranges (e.g., []) are not allowed and will raise an error. Ensure you provide a valid range for each device or use the defaults. 
From 2276ac229c01283fd6529c99b37711293ba6be98 Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Tue, 10 Dec 2024 14:04:45 -0500 Subject: [PATCH 54/57] Remove power_limit_optimizer and bring back the original code for image processing --- examples/prometheus/train_single.py | 64 ++++++++++++----------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/examples/prometheus/train_single.py b/examples/prometheus/train_single.py index 63737385..2cac6893 100644 --- a/examples/prometheus/train_single.py +++ b/examples/prometheus/train_single.py @@ -14,18 +14,13 @@ import torch.utils.data from torch.utils.data import DataLoader import torch.utils.data.distributed -from torch.utils.data import Subset import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models -from multiprocessing import set_start_method -from PIL import Image, ImageFile, UnidentifiedImageError -#set_start_method("fork", force=True) # ZEUS from zeus.monitor import ZeusMonitor from zeus.monitor import PowerMonitor -from zeus.optimizer.power_limit import MaxSlowdownConstraint, GlobalPowerLimitOptimizer from zeus.utils.env import get_env from zeus.metric import EnergyHistogram, EnergyCumulativeCounter, PowerGauge @@ -112,21 +107,6 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -ImageFile.LOAD_TRUNCATED_IMAGES = True # Optionally allow truncated images - -def remove_corrupted_images(dataset_dir): - """Remove corrupted or truncated image files from the dataset directory.""" - for root, _, files in os.walk(dataset_dir): - for file in files: - img_path = os.path.join(root, file) - try: - with Image.open(img_path) as img: - img.verify() # Verify if the image is valid - img.convert("RGB") # Ensure it's in a proper format - except (UnidentifiedImageError, OSError): - print(f"Removing corrupted or truncated file: {img_path}") - os.remove(img_path) - def main(): """Main function that prepares values and spawns/calls the worker function.""" args = parse_args() @@ -152,11 +132,7 @@ def main(): scheduler = StepLR(optimizer, step_size=30, gamma=0.1) traindir = os.path.join(args.data, "train") - #remove_corrupted_images(traindir) - valdir = os.path.join(args.data, "val") - #remove_corrupted_images(valdir) - normalize = transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) @@ -164,7 +140,8 @@ def main(): traindir, transforms.Compose( [ - transforms.Resize((224, 224)), + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ] @@ -174,7 +151,8 @@ def main(): valdir, transforms.Compose( [ - transforms.Resize((224, 224)), + transforms.Resize(256), + transforms.CenterCrop(224), transforms.ToTensor(), normalize, ] @@ -196,16 +174,29 @@ def main(): pin_memory=True, ) - train_dataset = Subset(train_dataset, range(5)) - val_dataset = Subset(val_dataset, range(2)) - ################################## The important part ##################################### # Histogram to track energy consumption over time - energy_histogram = EnergyHistogram(cpu_indices=[0,1], gpu_indices=[0], prometheus_url='http://localhost:9091', job='training_energy_histogram') + energy_histogram = EnergyHistogram( + cpu_indices=[0,1], + gpu_indices=[0], + prometheus_url='http://localhost:9091', + job='training_energy_histogram' + ) # Gauge to track power consumption over time - power_gauge = PowerGauge(gpu_indices=[0], update_period=2, prometheus_url='http://localhost:9091', job='training_power_gauge') + power_gauge 
= PowerGauge( + gpu_indices=[0], + update_period=2, + prometheus_url='http://localhost:9091', + job='training_power_gauge' + ) # Counter to track energy consumption over time - energy_counter = EnergyCumulativeCounter(cpu_indices=[0,1], gpu_indices=[0], update_period=2, prometheus_url='http://localhost:9091', job='training_energy_counter') + energy_counter = EnergyCumulativeCounter( + cpu_indices=[0,1], + gpu_indices=[0], + update_period=2, + prometheus_url='http://localhost:9091', + job='training_energy_counter' + ) power_gauge.begin_window("epoch_power") energy_counter.begin_window("epoch_energy") @@ -213,10 +204,11 @@ def main(): for epoch in range(args.epochs): acc1 = validate(val_loader, model, criterion, args) energy_histogram.begin_window("training_energy") - energy_histogram.end_window("training_energy") train(train_loader, model, criterion, optimizer, epoch, args) + energy_histogram.end_window("training_energy") print(f"Top-1 accuracy: {acc1}") - + + # Allow metrics to capture remaining data before shutting down monitoring. time.sleep(10) energy_counter.end_window("epoch_energy") @@ -244,7 +236,6 @@ def train( end = time.time() for i, (images, target) in enumerate(train_loader): - #power_limit_optimizer.on_step_begin() # Mark the beginning of one training step. # Load data to GPU images = images.cuda(args.gpu, non_blocking=True) @@ -268,8 +259,6 @@ def train( loss.backward() optimizer.step() - #power_limit_optimizer.on_step_end() # Mark the end of one training step. - # measure elapsed time batch_time.update(time.time() - end) end = time.time() @@ -418,4 +407,5 @@ def accuracy(output, target, topk=(1,)): if __name__ == "__main__": main() + \ No newline at end of file From dea8b5df3787f3787c59961d545403a4302df61f Mon Sep 17 00:00:00 2001 From: sharonsyh Date: Fri, 10 Jan 2025 00:47:41 +0900 Subject: [PATCH 55/57] Add sync execution function to handle synchronization during monitoring window operations --- zeus/metric.py | 160 ++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 94 deletions(-) diff --git a/zeus/metric.py b/zeus/metric.py index 14f38fa6..5555dbd0 100644 --- a/zeus/metric.py +++ b/zeus/metric.py @@ -20,7 +20,7 @@ from zeus.monitor.power import PowerMonitor from zeus.monitor.energy import ZeusMonitor - +from zeus.utils.framework import sync_execution as sync_execution_fn from zeus.device.cpu import get_cpus @@ -45,7 +45,8 @@ def begin_window(self, name: str, sync_execution: bool = True) -> None: Args: name (str): Name of the measurement window. - sync_execution (bool): Whether to execute synchronously. Defaults to None. + sync_execution (bool): Whether to wait for asynchronously dispatched computations + to finish before starting the measurement window. """ pass @@ -55,7 +56,8 @@ def end_window(self, name: str, sync_execution: bool = True) -> None: Args: name (str): Name of the measurement window. - sync_execution (bool): Whether to execute synchronously. Defaults to None. + sync_execution (bool): Whether to wait for asynchronously dispatched computations + to finish before starting the measurement window. """ pass @@ -64,25 +66,13 @@ class EnergyHistogram(Metric): """Measures the energy consumption a code range and exports a histogram metrics. Tracks energy consumption for GPUs, CPUs, and DRAM as Prometheus Histogram metrics. - - Attributes: - cpu_indices: List of CPU indices to monitor. - gpu_indices: List of GPU indices to monitor. - prometheus_url: Prometheus Push Gateway URL. - job: Prometheus job name. 
- gpu_bucket_range: Histogram buckets for GPU energy. - cpu_bucket_range: Histogram buckets for CPU energy. - dram_bucket_range: Histogram buckets for DRAM energy. - gpu_histograms: A single Prometheus Histogram metric for all GPU energy consumption, indexed by window and GPU index. - cpu_histograms: A single Prometheus Histogram metric for all CPU energy consumption, indexed by window and CPU index. - dram_histograms: A single Prometheus Histogram metric for all DRAM energy consumption, indexed by window and DRAM index. """ def __init__( self, cpu_indices: list, gpu_indices: list, - prometheus_url: str, + pushgateway_url: str, job: str, gpu_bucket_range: Sequence[float] = [50.0, 100.0, 200.0, 500.0, 1000.0], cpu_bucket_range: Sequence[float] = [10.0, 50.0, 100.0, 500.0, 1000.0], @@ -96,7 +86,7 @@ def __init__( Args: cpu_indices (list): List of CPU indices to monitor. gpu_indices (list): List of GPU indices to monitor. - prometheus_url (str): URL of the Prometheus Push Gateway where metrics will be pushed. + pushgateway_url (str): URL of the Prometheus Push Gateway where metrics will be pushed. job (str): Name of the Prometheus job to associate with the energy metrics. gpu_bucket_range (list[float], optional): Bucket ranges for GPU energy histograms. Defaults to [50.0, 100.0, 200.0, 500.0, 1000.0]. @@ -113,7 +103,7 @@ def __init__( self.dram_bucket_range = dram_bucket_range self.cpu_indices = cpu_indices self.gpu_indices = gpu_indices - self.prometheus_url = prometheus_url + self.pushgateway_url = pushgateway_url self.job = job self.registry = CollectorRegistry() @@ -175,9 +165,14 @@ def begin_window(self, name: str, sync_execution: bool = True) -> None: Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. - sync_execution (bool): Whether to execute synchronously. Defaults to True. + sync_execution (bool): Whether to execute synchronously. Defaults to True. If assigned True, calls sync_execution_fn with the defined gpu """ - self.energy_monitor.begin_window(f"__EnergyHistogram_{name}", sync_execution) + if sync_execution: + sync_execution_fn(self.gpu_indices) + + self.energy_monitor.begin_window( + f"__EnergyHistogram_{name}", sync_execution=sync_execution + ) def end_window(self, name: str, sync_execution: bool = True) -> None: """End the current energy monitoring window and record the energy data. @@ -188,14 +183,12 @@ def end_window(self, name: str, sync_execution: bool = True) -> None: Args: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. sync_execution (bool): Whether to execute synchronously. Defaults to True. - - Pushes: - - GPU energy data to the Prometheus Push Gateway via the associated Histogram metric. - - CPU energy data to the Prometheus Push Gateway via the associated Histogram metric. - - DRAM energy data to the Prometheus Push Gateway via the associated Histogram metric. 
""" + if sync_execution: + sync_execution_fn(self.gpu_indices) + measurement = self.energy_monitor.end_window( - f"__EnergyHistogram_{name}", sync_execution + f"__EnergyHistogram_{name}", sync_execution=sync_execution ) if measurement.gpu_energy: @@ -242,11 +235,11 @@ def end_window(self, name: str, sync_execution: bool = True) -> None: ) if dram_energy < self.min_dram_bucket: warnings.warn( - f"CPU {dram_index} energy {dram_energy} exceeds the minimum bucket value of {self.min_dram_bucket}", + f"DRAM {dram_index} energy {dram_energy} exceeds the minimum bucket value of {self.min_dram_bucket}", stacklevel=1, ) - push_to_gateway(self.prometheus_url, job=self.job, registry=self.registry) + push_to_gateway(self.pushgateway_url, job=self.job, registry=self.registry) class EnergyCumulativeCounter(Metric): @@ -257,18 +250,6 @@ class EnergyCumulativeCounter(Metric): The cumulative nature of the Counter ensures that energy values are always incremented over time, never reset, which is ideal for tracking continuously increasing values like energy usage. - - Attributes: - energy_monitor: The ZeusMonitor instance that collects energy consumption data for the system. - update_period: The interval (in seconds) between consecutive energy data updates. - prometheus_url: The URL of the Prometheus Push Gateway where the Counter metrics will be pushed. - job: The name of the job associated with the energy monitoring in Prometheus. - gpu_counters: A single Prometheus Counter metric for all GPU energy consumption, indexed by window and GPU index. - cpu_counters: A single Prometheus Counter metric for all CPU energy consumption, indexed by window and CPU index. - dram_counters: A single Prometheus Counter metric for all DRAM energy consumption, indexed by window and DRAM index. - queue: A multiprocessing queue used to send signals to start/stop energy monitoring. - proc: A multiprocessing process that runs the energy monitoring loop. - window_state: A dictionary that maps the monitoring window names to their corresponding process state. """ def __init__( @@ -276,7 +257,7 @@ def __init__( cpu_indices: list, gpu_indices: list, update_period: int, - prometheus_url: str, + pushgateway_url: str, job: str, ) -> None: """Initialize the EnergyCumulativeCounter. @@ -285,16 +266,14 @@ def __init__( cpu_indices (list): List of CPU indices to monitor. gpu_indices (list): List of GPU indices to monitor. update_period: The time interval (in seconds) at which energy measurements are updated. - prometheus_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. + pushgateway_url: The URL for the Prometheus Push Gateway where the metrics will be pushed. job: The name of the job to be associated with the Prometheus metrics. """ self.cpu_indices = cpu_indices self.gpu_indices = gpu_indices self.update_period = update_period - self.prometheus_url = prometheus_url + self.pushgateway_url = pushgateway_url self.job = job - self.queue = None - self.proc = None self.window_state: dict[str, MonitoringProcessState] = {} def begin_window(self, name: str, sync_execution: bool = False) -> None: @@ -307,28 +286,28 @@ def begin_window(self, name: str, sync_execution: bool = False) -> None: name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'. sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False. 
""" + if sync_execution: + sync_execution_fn(self.gpu_indices) + context = mp.get_context("spawn") - self.queue = context.Queue() - self.proc = context.Process( + queue = context.Queue() + proc = context.Process( target=energy_monitoring_loop, args=( name, - self.queue, + queue, self.cpu_indices, self.gpu_indices, self.update_period, - self.prometheus_url, + self.pushgateway_url, self.job, - sync_execution, ), ) - self.proc.start() - if not self.proc.is_alive(): + proc.start() + if not proc.is_alive(): raise RuntimeError(f"Failed to start monitoring process for {name}.") - self.window_state[name] = MonitoringProcessState( - queue=self.queue, proc=self.proc - ) + self.window_state[name] = MonitoringProcessState(queue=queue, proc=proc) def end_window(self, name: str, sync_execution: bool = False) -> None: """End the energy monitoring window. @@ -340,11 +319,11 @@ def end_window(self, name: str, sync_execution: bool = False) -> None: if name not in self.window_state: raise ValueError(f"No active monitoring process found for '{name}'.") + if sync_execution: + sync_execution_fn(self.gpu_indices) + state = self.window_state.pop(name) - if self.queue is not None: - self.queue.put("stop") - else: - raise RuntimeError("Queue is not initialized") + state.queue.put("stop") state.proc.join(timeout=20) if state.proc.is_alive(): @@ -357,9 +336,8 @@ def energy_monitoring_loop( cpu_indices: list, gpu_indices: list, update_period: int, - prometheus_url: str, + pushgateway_url: str, job: str, - sync_execution: bool, ) -> None: """Runs in a separate process to collect and update energy consumption metrics (for GPUs, CPUs, and DRAM). @@ -369,7 +347,7 @@ def energy_monitoring_loop( cpu_indices (list): List of CPU indices to monitor. gpu_indices (list): List of GPU indices to monitor. update_period (int): The interval (in seconds) between consecutive energy data updates. - prometheus_url (str): The URL of the Prometheus Push Gateway where the metrics will be pushed. + pushgateway_url (str): The URL of the Prometheus Push Gateway where the metrics will be pushed. job (str): The name of the Prometheus job associated with these metrics. sync_execution (bool): Whether to execute monitoring synchronously. """ @@ -406,10 +384,12 @@ def energy_monitoring_loop( if not pipe.empty(): break # Begin and end monitoring window using sync_execution - energy_monitor.begin_window(f"__EnergyCumulativeCounter_{name}", sync_execution) + energy_monitor.begin_window( + f"__EnergyCumulativeCounter_{name}", sync_execution=False + ) time.sleep(update_period) measurement = energy_monitor.end_window( - f"__EnergyCumulativeCounter_{name}", sync_execution + f"__EnergyCumulativeCounter_{name}", sync_execution=False ) if measurement.gpu_energy: @@ -427,7 +407,7 @@ def energy_monitoring_loop( if dram_counters: dram_counters.labels(window=name, index=dram_index).inc(energy) # Push metrics to Prometheus - push_to_gateway(prometheus_url, job=job, registry=registry) + push_to_gateway(pushgateway_url, job=job, registry=registry) class PowerGauge(Metric): @@ -437,23 +417,13 @@ class PowerGauge(Metric): The Gauge metric type is suitable for tracking values that can go up and down over time, like power consumption. Power usage data is collected at regular intervals and pushed to a Prometheus Push Gateway for monitoring. - - Attributes: - gpu_indices: List of GPU indices to monitor for power consumption. - update_period: Time interval (in seconds) between consecutive power measurements. 
-        prometheus_url: URL of the Prometheus Push Gateway where Gauge metrics are pushed.
-        job: Name of the Prometheus job associated with the power metrics.
-        gpu_gauges: A single Prometheus Gauge metrics for real-time power consumption tracking.
-        queue: Queue for controlling the monitoring process.
-        proc: Process running the power monitoring loop.
-        window_state: A dictionary mapping monitoring window names to their process state.
     """
 
     def __init__(
         self,
         gpu_indices: list,
         update_period: int,
-        prometheus_url: str,
+        pushgateway_url: str,
         job: str,
     ) -> None:
         """Initialize the PowerGauge metric.
@@ -461,12 +431,12 @@ def __init__(
         Args:
             gpu_indices (list[int]): List of GPU indices to monitor for power consumption.
             update_period (int): Interval (in seconds) between consecutive power measurements.
-            prometheus_url (str): URL of the Prometheus Push Gateway where Gauge metrics are pushed.
+            pushgateway_url (str): URL of the Prometheus Push Gateway where Gauge metrics are pushed.
             job (str): Name of the Prometheus job to associate with the power metrics.
         """
         self.gpu_indices = gpu_indices
         self.update_period = update_period
-        self.prometheus_url = prometheus_url
+        self.pushgateway_url = pushgateway_url
         self.job = job
 
         self.window_state: dict[str, MonitoringProcessState] = {}
@@ -484,28 +454,29 @@ def begin_window(self, name: str, sync_execution: bool = False) -> None:
         if name in self.window_state:
             raise ValueError(f"PowerGauge metric '{name}' already exists.")
 
+        if sync_execution:
+            sync_execution_fn(self.gpu_indices)
+
         context = mp.get_context("spawn")
-        self.queue = context.Queue()
-        self.proc = context.Process(
+        queue = context.Queue()
+        proc = context.Process(
             target=power_monitoring_loop,
             args=(
                 name,
-                self.queue,
+                queue,
                 self.gpu_indices,
                 self.update_period,
-                self.prometheus_url,
+                self.pushgateway_url,
                 self.job,
             ),
         )
-        self.proc.start()
-        if not self.proc.is_alive():
+        proc.start()
+        if not proc.is_alive():
             raise RuntimeError(
                 f"Failed to start power monitoring process for '{name}'."
             )
-        self.window_state[name] = MonitoringProcessState(
-            queue=self.queue, proc=self.proc
-        )
+        self.window_state[name] = MonitoringProcessState(queue=queue, proc=proc)
 
     def end_window(self, name: str, sync_execution: bool = False) -> None:
         """End the power monitoring window.
@@ -514,12 +485,13 @@ def end_window(self, name: str, sync_execution: bool = False) -> None:
             name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'.
             sync_execution (bool, optional): Whether to execute monitoring synchronously. Defaults to False.
         """
+        if sync_execution:
+            sync_execution_fn(self.gpu_indices)
+
         state = self.window_state.pop(name)
-        if self.queue is not None:
-            self.queue.put("stop")
-        else:
-            raise RuntimeError("Queue is not initialized")
+        state.queue.put("stop")
         state.proc.join(timeout=20)
+
         if state.proc.is_alive():
             state.proc.terminate()
 
@@ -529,7 +501,7 @@ def power_monitoring_loop(
     pipe: mp.Queue,
     gpu_indices: list[int],
     update_period: int,
-    prometheus_url: str,
+    pushgateway_url: str,
     job: str,
 ) -> None:
     """Runs in a separate process and periodically collects power consumption data for each GPU and pushes the results to the Prometheus Push Gateway.
@@ -539,7 +511,7 @@ def power_monitoring_loop(
         name (str): Unique name for the monitoring window (used as a label in Prometheus metrics).
         pipe (multiprocessing.Queue): Queue to receive control signals (e.g., "stop").
         gpu_indices (list[int]): List of GPU indices to monitor for power consumption.
        update_period (int): Interval (in seconds) between consecutive power data polls.
-        prometheus_url (str): URL of the Prometheus Push Gateway where metrics are pushed.
+        pushgateway_url (str): URL of the Prometheus Push Gateway where metrics are pushed.
         job (str): Name of the Prometheus job to associate with the metrics.
     """
     power_monitor = PowerMonitor(gpu_indices=gpu_indices)
@@ -566,7 +538,7 @@ def power_monitoring_loop(
             print(f"Error during processing power measurement: {e}")
 
         try:
-            push_to_gateway(prometheus_url, job=job, registry=registry)
+            push_to_gateway(pushgateway_url, job=job, registry=registry)
         except Exception as e:
             print(f"Error pushing metrics: {e}")
 

From 9411495f2e272bb97ccc5dc954d2d6c7fde45bdc Mon Sep 17 00:00:00 2001
From: sharonsyh
Date: Fri, 10 Jan 2025 00:48:53 +0900
Subject: [PATCH 56/57] Changed variable name prometheus_url -> pushgateway_url

---
 examples/prometheus/train_single.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/prometheus/train_single.py b/examples/prometheus/train_single.py
index 2cac6893..7b07813c 100644
--- a/examples/prometheus/train_single.py
+++ b/examples/prometheus/train_single.py
@@ -179,14 +179,14 @@ def main():
     energy_histogram = EnergyHistogram(
         cpu_indices=[0,1],
         gpu_indices=[0],
-        prometheus_url='http://localhost:9091',
+        pushgateway_url='http://localhost:9091',
         job='training_energy_histogram'
     )
     # Gauge to track power consumption over time
     power_gauge = PowerGauge(
         gpu_indices=[0],
         update_period=2,
-        prometheus_url='http://localhost:9091',
+        pushgateway_url='http://localhost:9091',
         job='training_power_gauge'
     )
     # Counter to track energy consumption over time
@@ -194,7 +194,7 @@ def main():
         cpu_indices=[0,1],
         gpu_indices=[0],
         update_period=2,
-        prometheus_url='http://localhost:9091',
+        pushgateway_url='http://localhost:9091',
         job='training_energy_counter'
     )
 

From ef48d88bf5cab5f4d6e200aa82d0ec0bb7448ebc Mon Sep 17 00:00:00 2001
From: sharonsyh
Date: Fri, 10 Jan 2025 00:50:40 +0900
Subject: [PATCH 57/57] Adjust unit test functions to reflect changes in the sync_execution logic and multiprocessing window state management.
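
Each begin_window() now stores a MonitoringProcessState(queue, proc) under the window name,
and end_window() pops that state, sends "stop", and joins the process. The updated tests
assert this bookkeeping directly; a rough sketch of the pattern they check (the "counter"
variable and window name here are illustrative; the tests below use cumulative_counter and
power_gauge instances):

    counter.begin_window("test_counter")
    assert "test_counter" in counter.window_state      # queue/proc recorded per window
    counter.end_window("test_counter")
    assert "test_counter" not in counter.window_state  # popped after "stop" is sent and the process is joined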
---
 tests/test_metric.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/tests/test_metric.py b/tests/test_metric.py
index 450abd54..d0e4125a 100644
--- a/tests/test_metric.py
+++ b/tests/test_metric.py
@@ -30,7 +30,7 @@ def mock_zeus_monitor():
     mock_instance.end_window.return_value = MagicMock(
         gpu_energy={0: 50.0, 1: 100.0, 2: 200.0},
         cpu_energy={0: 40.0, 1: 50.0},
-        dram_energy={},
+        dram_energy={0: 10},
     )
     mock_instance.gpu_indices = [0, 1, 2]
     mock_instance.cpu_indices = [0, 1]
@@ -69,19 +69,19 @@ def test_energy_histogram(
     """Test EnergyHistogram class."""
     cpu_indices = [0, 1]
     gpu_indices = [0, 1, 2]
-    prometheus_url = "http://localhost:9091"
+    pushgateway_url = "http://localhost:9091"
     window_name = "test_window"
 
     # Ensure mocked CPUs have the required method
     mock_get_cpus.return_value.cpus = [
-        MagicMock(supportsGetDramEnergyConsumption=MagicMock(return_value=True)),
+        MagicMock(supportsGetDramEnergyConsumption=MagicMock(return_value=False)),
         MagicMock(supportsGetDramEnergyConsumption=MagicMock(return_value=False)),
     ]
 
     histogram_metric = EnergyHistogram(
         cpu_indices=cpu_indices,
         gpu_indices=gpu_indices,
-        prometheus_url=prometheus_url,
+        pushgateway_url=pushgateway_url,
         job="test_energy_histogram",
     )
 
@@ -110,14 +110,14 @@ def test_energy_histogram(
     histogram_metric.dram_histograms = dram_mock_histogram
 
     # Begin and end the monitoring window
-    histogram_metric.begin_window(window_name)
+    histogram_metric.begin_window(window_name, False)
     with patch("http.client.HTTPConnection", autospec=True) as mock_http:
         mock_http_instance = mock_http.return_value
         mock_http_instance.getresponse.return_value.code = 200
         mock_http_instance.getresponse.return_value.msg = "OK"
         mock_http_instance.getresponse.return_value.info = lambda: {}
         mock_http_instance.sock = MagicMock()
-        histogram_metric.end_window(window_name)
+        histogram_metric.end_window(window_name, False)
 
     # Validate GPU histogram observations
     for (
@@ -153,13 +153,15 @@ def test_energy_cumulative_counter(
    """Test EnergyCumulativeCounter with mocked subprocess behavior."""
     cpu_indices = [0, 1]
     gpu_indices = [0, 1, 2]
-    prometheus_url = "http://localhost:9091"
+    pushgateway_url = "http://localhost:9091"
 
     # Mock the context and queue
     mock_queue = MagicMock()
     mock_process = MagicMock()
     mock_mp_context.return_value.Queue.return_value = mock_queue
-    mock_mp_context.return_value.Process.return_value = mock_process
+    mock_mp_context.return_value.Process.return_value = (
+        mock_process  # Ensure Process returns mock_process
+    )
 
     # Mock the behavior of subprocess
     mock_energy_monitoring_loop.return_value = (
@@ -171,7 +173,7 @@ def test_energy_cumulative_counter(
         cpu_indices=cpu_indices,
         gpu_indices=gpu_indices,
         update_period=2,
-        prometheus_url=prometheus_url,
+        pushgateway_url=pushgateway_url,
         job="test_energy_counter",
     )
 
@@ -187,17 +189,23 @@ def test_energy_cumulative_counter(
             cpu_indices,
             gpu_indices,
             2,
-            prometheus_url,
+            pushgateway_url,
             "test_energy_counter",
-            False,
         ),
     )
     mock_process.start.assert_called_once()
 
+    # Assert the window state is updated correctly
+    assert "test_counter" in cumulative_counter.window_state
+    state = cumulative_counter.window_state["test_counter"]
+    assert state.queue == mock_queue
+    assert state.proc == mock_process
+
     # End the monitoring window
     cumulative_counter.end_window("test_counter")
     mock_queue.put.assert_called_once_with("stop")
     mock_process.join.assert_called_once()
+    assert "test_counter" not in cumulative_counter.window_state
@patch("zeus.metric.power_monitoring_loop", autospec=True) @@ -208,7 +216,7 @@ def test_power_gauge( ): """Test PowerGauge with mocked subprocess behavior.""" gpu_indices = [0, 1, 2] - prometheus_url = "http://localhost:9091" + pushgateway_url = "http://localhost:9091" # Mock the context and queue mock_queue = MagicMock() @@ -225,7 +233,7 @@ def test_power_gauge( power_gauge = PowerGauge( gpu_indices=gpu_indices, update_period=2, - prometheus_url=prometheus_url, + pushgateway_url=pushgateway_url, job="test_power_gauge", ) @@ -240,13 +248,20 @@ def test_power_gauge( mock_queue, gpu_indices, 2, - prometheus_url, + pushgateway_url, "test_power_gauge", ), ) mock_process.start.assert_called_once() + # Assert the window state is updated correctly + assert "test_power_gauge" in power_gauge.window_state + state = power_gauge.window_state["test_power_gauge"] + assert state.queue == mock_queue + assert state.proc == mock_process + # End the monitoring window power_gauge.end_window("test_power_gauge") mock_queue.put.assert_called_once_with("stop") mock_process.join.assert_called_once() + assert "test_power_gauge" not in power_gauge.window_state