From b7acfa03b5b3b0afc8f37e93db93890ad5d09364 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Fri, 5 Aug 2022 17:37:43 +0200 Subject: [PATCH 01/24] implemented prometheus based resource tracker. --- .../optuna_kubernetes_benchmark.py | 3 +- .../simple_raytune/ops/configs/prometheus.yml | 12 ++ experiments/simple_raytune/prometheus.yml | 42 ++++++ .../simple_raytune/raytune_benchmark.py | 4 +- ml_benchmark/__init__.py | 3 +- ml_benchmark/benchmark_runner.py | 11 ++ ml_benchmark/metrics.py | 54 +++++-- ml_benchmark/metrics_storage.py | 29 +++- ml_benchmark/resource_tracker.py | 136 +++++++++++++++++- 9 files changed, 276 insertions(+), 18 deletions(-) create mode 100644 experiments/simple_raytune/ops/configs/prometheus.yml create mode 100644 experiments/simple_raytune/prometheus.yml diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index e216578..0ac348a 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -208,11 +208,12 @@ def _watch_db(self): "kubernetesContext": "admin@smile", "kubernetesMasterIP": "130.149.158.143", "deleteAfterRun": False, + "prometheus_url": "http://130.149.158.143:30041", } # TODO: hyperparams. # import an use the runner runner = BenchmarkRunner( - benchmark_cls=OptunaKubernetesBenchmark, resource_definition=resource_definition) + benchmark_cls=OptunaKubernetesBenchmark, resources=resource_definition) runner.run() diff --git a/experiments/simple_raytune/ops/configs/prometheus.yml b/experiments/simple_raytune/ops/configs/prometheus.yml new file mode 100644 index 0000000..4261afe --- /dev/null +++ b/experiments/simple_raytune/ops/configs/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 1m + +scrape_configs: + - job_name: "prometheus" + scrape_interval: 1m + static_configs: + - targets: ["localhost:9090"] + + - job_name: "node" + static_configs: + - targets: ["node-exporter:9100"] \ No newline at end of file diff --git a/experiments/simple_raytune/prometheus.yml b/experiments/simple_raytune/prometheus.yml new file mode 100644 index 0000000..885089a --- /dev/null +++ b/experiments/simple_raytune/prometheus.yml @@ -0,0 +1,42 @@ +version: '3.8' + +networks: + monitoring: + driver: bridge + +services: + node-exporter: + image: prom/node-exporter:latest + container_name: node-exporter + restart: unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + expose: + - 9100 + networks: + - monitoring + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: unless-stopped + volumes: + - ./ops/configs/:/configs:ro + command: + - '--config.file=/configs/prometheus.yml' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + expose: + - 9090 + networks: + - monitoring + ports: + - "9090:9090" \ No newline at end of file diff --git a/experiments/simple_raytune/raytune_benchmark.py b/experiments/simple_raytune/raytune_benchmark.py index d5ac1ca..8000f76 100644 --- a/experiments/simple_raytune/raytune_benchmark.py +++ b/experiments/simple_raytune/raytune_benchmark.py @@ -99,7 +99,9 @@ def undeploy(self): # For benchmarking take the default value of 100 # your 
ressources the optimization should run on - resources = {"workerCpu": 12} + resources = {"workerCpu": 8, + "prometheus_url": "http://localhost:9090" # Assuming u used docker-compose up + } # Add your hyperparameter setting procedure here # your hyperparameter grid you want to search over diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index f628f98..4ab3c5f 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -2,6 +2,7 @@ install_requires = [ "scikit-learn==0.24.2", "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.4.2", - "psycopg2-binary"], + "psycopg2-binary", + "prometheus-api-client"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index d290b0b..482458b 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -9,9 +9,11 @@ import docker import numpy as np import torch +import logging from ml_benchmark.latency_tracker import Latency, LatencyTracker from ml_benchmark.metrics_storage import MetricsStorage +from ml_benchmark.resource_tracker import ResourceTracker class Benchmark(ABC): @@ -130,6 +132,11 @@ def __init__( # prepare tracker self.metrics_storage = MetricsStorage() self.latency_tracker = LatencyTracker(MetricsStorage.connection_string) + if "prometheus_url" in resources: + self.resource_tracker = ResourceTracker(resources["prometheus_url"]) + else: + logging.warning("No Prometheus URL provided. Resource Tracker will not be used.") + self.resource_tracker = None def run(self): """ @@ -147,6 +154,8 @@ def run(self): try: self.metrics_storage.start_db() + if self.resource_tracker is not None: + self.resource_tracker.start() for benchmark_fun in run_process: with Latency(benchmark_fun) as latency: benchmark_fun() @@ -155,6 +164,8 @@ def run(self): # just to be save we wait a bit before killing shit. sleep(5) + if self.resource_tracker is not None: + self.resource_tracker.stop() self.metrics_storage.stop_db() except (docker.errors.APIError, AttributeError, ValueError, RuntimeError) as e: diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index 59947a7..a02955f 100644 --- a/ml_benchmark/metrics.py +++ b/ml_benchmark/metrics.py @@ -10,9 +10,7 @@ class Metric: """ Metric Parentclass. Creates a unique identifier for every metric and gathers basic information. 
""" - process_id = os.getpid() - hostname = socket.gethostname() - metric_id = f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}" + metric_id = "" def add_to_id(self, id_addition): self.metric_id = self.metric_id + f"__{id_addition}" @@ -21,6 +19,33 @@ def to_dict(self): return asdict(self) +class NodeUsage(Metric): + def __init__(self, node_id): + super().__init__() + self.node_id = node_id + + self.add_to_id(f"{self.node_id}") + + self.timestamp = None + self.cpu_usage = None + self.memory_usage = None + self.network_usage = None + self.accelerator_usage = None + + def to_dict(self): + node_dict = dict( + metric_id=self.metric_id, + timestamp=self.timestamp, + cpu_usage=self.cpu_usage, + memory_usage=self.memory_usage, + network_usage=self.network_usage, + ) + if self.accelerator_usage: + node_dict["accelerator_usage"] = self.accelerator_usage + + return {key: _convert_datetime_to_unix(value) for key, value in node_dict.items()} + + class Latency(Metric): def __init__(self, func) -> None: @@ -41,6 +66,10 @@ def __init__(self, func) -> None: AttributeError: _description_ """ super().__init__() + process_id = os.getpid() + hostname = socket.gethostname() + self.add_to_id(f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}") + self.function_name: str = func.__name__ try: self.obj_hash = hash(func.__self__) @@ -61,7 +90,7 @@ def to_dict(self): duration_sec=self.duration_sec ) # latency_dict.update(super().to_dict()) - latency_dict = {key: self._convert_times_to_float(value) for key, value in latency_dict.items()} + latency_dict = {key: _convert_times_to_float(value) for key, value in latency_dict.items()} return latency_dict @@ -84,8 +113,15 @@ def __exit__(self, *args): def _calculate_duration(self): self.duration_sec = self.end_time - self.start_time - def _convert_times_to_float(self, value): - if isinstance(value, timedelta): - return value.total_seconds() - else: - return str(value) + +def _convert_times_to_float(value): + if isinstance(value, timedelta): + return value.total_seconds() + else: + return str(value) + +def _convert_datetime_to_unix(value): + if isinstance(value, datetime): + return value.ctime() + else: + return str(value) \ No newline at end of file diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 9de9ccc..5d2e2d7 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -1,3 +1,4 @@ +import logging import time import docker from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select @@ -15,6 +16,7 @@ class MetricsStorage: connection_string = MetricsStorageConfig.connection_string def __init__(self, connection_string: str = None) -> None: + """ The MetricsStorage serves as the representation of the databse. It sets up a postgres database in a docker container and creates tables for every recorded metric. @@ -25,15 +27,21 @@ def __init__(self, connection_string: str = None) -> None: Args: connection_string (str, optional): _description_. Defaults to None. 
""" + logging.basicConfig() + logging.getLogger('sqlalchemy').setLevel(logging.ERROR) + + self.meta = None + self.client = None + self.engine = None if connection_string: self.connection_string = connection_string self.latency = None + self.resources = None def start_db(self): self.setup_db() self.engine = create_engine(self.connection_string) self.create_metrics_table() - self.create_resource_table() return self def setup_db(self): @@ -76,7 +84,15 @@ def create_latency_table(self): ) def create_resource_table(self): - pass + self.resources = Table( + "resources", self.meta, + Column("metric_id", String, primary_key=True), + Column("timestamp", String, primary_key=True), + Column("cpu_usage", Float), + Column("memory_usage", Float), + Column("network_usage", Float), + Column("accelerator_usage", Float) + ) def create_classification_metrics_table(self): pass @@ -98,7 +114,14 @@ def get_latency_results(self): return result_list def get_resource_results(self): - pass + result_list = [] + with self.engine.connect() as conn: + stmt = select(self.resources) + cursor = conn.execute(stmt) + cursor = cursor.mappings().all() + for row in cursor: + result_list.append(dict(row)) + return result_list def get_classification_results(self): pass diff --git a/ml_benchmark/resource_tracker.py b/ml_benchmark/resource_tracker.py index fec04ce..a8318b1 100644 --- a/ml_benchmark/resource_tracker.py +++ b/ml_benchmark/resource_tracker.py @@ -1,5 +1,135 @@ +import datetime + +from prometheus_api_client import PrometheusConnect + +from ml_benchmark.config import MetricsStorageConfig +import psycopg2 +from sqlalchemy import MetaData, Table, create_engine, insert +from threading import Timer + +from ml_benchmark.metrics import NodeUsage +import logging + + +class RepeatTimer(Timer): + + def run(self): + while not self.finished.wait(self.interval): + self.function(*self.args, **self.kwargs) + + +def _sum_samples(samples): + return sum(map(lambda x: x.value, samples)) + + class ResourceTracker: - pass -# For trials open a seperate subprocess next to the objective, which polls resources every few seconds -# Handle it over a decorator as well + # update every 2 seconds ... 
maybe make this tuneable + UPDATE_INTERVAL = 2 + + def __init__(self, prometheus_url): + if prometheus_url is None: + raise ValueError("Prometheus URL is required.") + self.prometheus_url = prometheus_url + self.prm = PrometheusConnect(url=self.prometheus_url, disable_ssl=True) + + if not self.prm.check_prometheus_connection(): + raise ValueError("Could not connect to Prometheus.") + + self.engine = self._create_engine() + self.timer = RepeatTimer(self.UPDATE_INTERVAL, self.update) + + self._check_metrics() + + def _check_metrics(self): + available = set(self.prm.all_metrics()) + + #check node_exporter metrics - cpu/memory + required = {"node_memory_MemFree_bytes", "node_memory_MemTotal_bytes", "node_cpu_seconds_total"} + if not required.issubset(available): + raise ValueError("Prometheus does not provide the required metrics.") + + #check if prometheus is managing a kubernetes cluster + if "container_network_transmit_bytes_total" in available: + self.network_metric = "container_network" + elif "node_network_transmit_bytes_total" in available: + self.network_metric = "node_network" + else: + raise ValueError("Prometheus does not provide a vaild network metric.") + + if "kube_node_info" in available: + info = self.prm.get_current_metric_value("kube_node_info") + self.node_map = dict(map(lambda x: (x["internal_ip"], x["node"]), map(lambda x: x["metric"], info))) + else: + self.node_map = {} + + def _create_engine(self): + try: + engine = create_engine(MetricsStorageConfig.connection_string, echo=True) + except psycopg2.Error: + raise ConnectionError("Could not create an Engine for the Postgres DB.") + return engine + + def update(self): + try: + self.track() + except Exception as e: + logging.exception("Error while updating resource tracker. %s", e) + + def track(self): + #query prometheus for node usage + memory = 'avg by (instance) (node_memory_MemFree_bytes/node_memory_MemTotal_bytes)' + cpu = '100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[2m])*100))' + network = f'sum by (instance) (rate({self.network_metric}_receive_bytes_total[2m])+rate({self.network_metric}_transmit_bytes_total[2m]))' + + mem_result = self.prm.custom_query(memory) + cpu_result = self.prm.custom_query(cpu) + network_result = self.prm.custom_query(network) + + logging.debug("Got results from Prometheus.", mem_result, cpu_result, network_result) + + # assert len(mem_result) == len(cpu_result) == len(network_result) + + #grab the data per instance + mem_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), mem_result)) + cpu_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), cpu_result)) + network_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), network_result)) + + logging.debug("Processed Prometheus Results", mem_result, cpu_result, network_result) + + # assert mem_result.keys() == cpu_result.keys() == network_result.keys() + + #merge the data + data = [] + for instance in mem_result: + n = NodeUsage(instance) + n.timestamp = datetime.datetime.now() + n.cpu_usage = cpu_result.get(instance, 0) + n.memory_usage = mem_result.get(instance, 0) + n.network_usage = network_result.get(instance, 0) + data.append(n) + logging.debug("Added node usage for %s", instance) + + #insert the data + for n in data: + metadata = MetaData(bind=self.engine) + node_usage = Table("resources", metadata, autoload_with=self.engine) + with self.engine.connect() as conn: + stmt = 
insert(node_usage).values(n.to_dict()) + conn.execute(stmt) + + def _try_norm(self, instance: str): + if instance in self.node_map: + return self.node_map[instance] + elif instance[:instance.find(":")] in self.node_map: + return self.node_map[instance[:instance.find(":")]] + else: + return instance + + def start(self): + logging.debug("Starting resource tracker.") + self.timer.start() + + def stop(self): + logging.debug("Stopping resource tracker.") + self.timer.cancel() From 707fc72f392dd6cd0d87d1a70c31975b9f82b9f3 Mon Sep 17 00:00:00 2001 From: Michael Gebauer Date: Thu, 18 Aug 2022 14:48:12 +0200 Subject: [PATCH 02/24] added yml definitions --- experiments/optuna_minikube/dockerfile.trial | 2 +- .../optuna_minikube/hyperparameter_space.yml | 12 ++++++++ .../optuna_minikube_benchmark.py | 28 +++++++++++-------- experiments/optuna_minikube/optuna_trial.py | 12 +++++--- .../optuna_minikube/resource_definition.yml | 11 ++++++++ experiments/optuna_minikube/utils.py | 22 +++++++++++++++ ml_benchmark/utils/yml_parser.py | 4 +-- 7 files changed, 73 insertions(+), 18 deletions(-) create mode 100644 experiments/optuna_minikube/hyperparameter_space.yml create mode 100644 experiments/optuna_minikube/resource_definition.yml create mode 100644 experiments/optuna_minikube/utils.py diff --git a/experiments/optuna_minikube/dockerfile.trial b/experiments/optuna_minikube/dockerfile.trial index a863e52..ccd0321 100644 --- a/experiments/optuna_minikube/dockerfile.trial +++ b/experiments/optuna_minikube/dockerfile.trial @@ -3,7 +3,7 @@ FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime RUN pip install pip --upgrade RUN pip install optuna==2.10.1 -COPY experiments experiments +COPY experiments/optuna_minikube experiments/optuna_minikube COPY data data COPY setup.py setup.py COPY ml_benchmark ml_benchmark diff --git a/experiments/optuna_minikube/hyperparameter_space.yml b/experiments/optuna_minikube/hyperparameter_space.yml new file mode 100644 index 0000000..6dd7226 --- /dev/null +++ b/experiments/optuna_minikube/hyperparameter_space.yml @@ -0,0 +1,12 @@ +learning_rate: + start: 1e-4 + end: 1e-2 + step_size: 1e-5 +weight_decay: + start: 1e-6 + end: 1e-4 + step_size: 1e-5 +hidden_layer_config: + start: [10] + end: [100, 100, 100] + step_size: [10, 1] diff --git a/experiments/optuna_minikube/optuna_minikube_benchmark.py b/experiments/optuna_minikube/optuna_minikube_benchmark.py index 3816f97..05dbfb0 100644 --- a/experiments/optuna_minikube/optuna_minikube_benchmark.py +++ b/experiments/optuna_minikube/optuna_minikube_benchmark.py @@ -10,6 +10,7 @@ from ml_benchmark.utils.image_build_wrapper import builder_from_string from ml_benchmark.workload.mnist.mnist_task import MnistTask from ml_benchmark.utils.yaml_template_filler import YamlTemplateFiller +from ml_benchmark.utils.yml_parser import YMLParser class OptunaMinikubeBenchmark(Benchmark): @@ -198,18 +199,23 @@ def _watch_db(self): # The basic config for the workload. For testing purposes set epochs to one. 
# For benchmarking take the default value of 100 # your ressources the optimization should run on - resources = { - "workerCpu": 2, - "workerMemory": 2, - "workerCount": 4, + resources = YMLParser.parse("experiments/optuna_minikube/resource_definition.yml") + to_automate = { "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), - "kubernetesMasterIP": subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n"), - "dockerImageTag": "tawalaya/optuna-trial:latest", - "dockerImageBuilder": "minikube", - "kubernetesNamespace": "optuna-study", - "kubernetesContext": "minikube", - "deleteAfterRun": True, - } + "kubernetesMasterIP": subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n")} + resources.update(to_automate) + # resources = { + # "workerCpu": 2, + # "workerMemory": 2, + # "workerCount": 4, + # "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), + # "kubernetesMasterIP": subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n"), + # "dockerImageTag": "tawalaya/optuna-trial:latest", + # "dockerImageBuilder": "minikube", + # "kubernetesNamespace": "optuna-study", + # "kubernetesContext": "minikube", + # "deleteAfterRun": True, + # } # TODO: hyperparams. diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index afacef2..6c68b22 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -3,16 +3,18 @@ from time import sleep import optuna from ml_benchmark.workload.mnist.mnist_task import MnistTask - -#TODO: can we extract this to a point were we can use a config to drive this? +from utils import generate_search_space def optuna_trial(trial): task = MnistTask(config_init={"epochs": 5}) objective = task.create_objective() + # optuna doesnt care, these lines of code just get hyperparameters from the search space in grid search lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) - objective.set_hyperparameters({"learning_rate": lr, "weight_decay": decay}) + hidden_layer_config = trial.suggest_int("hidden_layer_config", 1, 4) + objective.set_hyperparameters( + {"learning_rate": lr, "weight_decay": decay, "hidden_layer_config": hidden_layer_config}) objective.train() validation_scores = objective.validate() return validation_scores["macro avg"]["f1-score"] @@ -22,8 +24,10 @@ def optuna_trial(trial): try: study_name = os.environ.get("STUDY_NAME") database_conn = os.environ.get("DB_CONN") + search_space = generate_search_space("hyp_space_definition.yml") study = optuna.create_study( - study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True) + study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, + sampler=optuna.samplers.GridSampler(search_space)) study.optimize(optuna_trial, n_trials=6) # TODO: add small wait to avoid missing metrics sleep(5) diff --git a/experiments/optuna_minikube/resource_definition.yml b/experiments/optuna_minikube/resource_definition.yml new file mode 100644 index 0000000..5102319 --- /dev/null +++ b/experiments/optuna_minikube/resource_definition.yml @@ -0,0 +1,11 @@ + +workerCpu: 2 +workerMemory: 2 +workerCount: 4 +metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), +kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", 
shell=True).decode("utf-8").strip("\n") +dockerImageTag: tawalaya/optuna-trial:latest +dockerImageBuilder: minikube +kubernetesNamespace: optuna-study +kubernetesContext: "minikube" +deleteAfterRun: True diff --git a/experiments/optuna_minikube/utils.py b/experiments/optuna_minikube/utils.py new file mode 100644 index 0000000..0b7b93b --- /dev/null +++ b/experiments/optuna_minikube/utils.py @@ -0,0 +1,22 @@ +import numpy as np +from ml_benchmark.utils.yml_parser import YMLParser +import itertools + +def generate_search_space(yaml_file_path): + search_space = YMLParser.parse("experiments/optuna_minikube/hyp_space_definition.yml") + modified_search_space = {} + hidden_layer_config = [] + for key, value in search_space.items(): + if isinstance(value["start"], list): + combinations = [] + numbers = range(value["start"][0], value["end"][-1], value["step_size"][0]) + for r in range(len(value["end"])): + r = r + 1 + for combination in itertools.combinations(set(numbers), r): + combinations.append(list(combination)) + + + modified_search_space[key] = combinations + else: + modified_search_space[key] = np.arange(value["start"], value["end"], value["step_size"]) + return modified_search_space diff --git a/ml_benchmark/utils/yml_parser.py b/ml_benchmark/utils/yml_parser.py index 8e57bdd..321fa46 100644 --- a/ml_benchmark/utils/yml_parser.py +++ b/ml_benchmark/utils/yml_parser.py @@ -1,4 +1,4 @@ -from yaml import load, Loader +import ruamel.yaml class YMLParser: @@ -6,7 +6,7 @@ class YMLParser: @staticmethod def parse(hyperparameter_file_path): with open(hyperparameter_file_path, "r") as f: - hyper_dict = load(f, Loader=Loader) + hyper_dict = ruamel.yaml.safe_load(f) return hyper_dict From ca8586845251a65f785c337b9efa04df4c40b117 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Mon, 22 Aug 2022 11:15:35 +0200 Subject: [PATCH 03/24] small fixes --- .gitignore | 8 +++++--- experiments/optuna_kubernetes/dockerfile.trial | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 89eb912..6d7e8bd 100644 --- a/.gitignore +++ b/.gitignore @@ -129,7 +129,9 @@ dmypy.json # Pyre type checker .pyre/ exp__* -experiments/simple_raytune/benchmark__RaytuneBenchmark -experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark +**/benchmark__** -data/ \ No newline at end of file +data/ + +#idea +.idea/ \ No newline at end of file diff --git a/experiments/optuna_kubernetes/dockerfile.trial b/experiments/optuna_kubernetes/dockerfile.trial index a863e52..ccd0321 100644 --- a/experiments/optuna_kubernetes/dockerfile.trial +++ b/experiments/optuna_kubernetes/dockerfile.trial @@ -3,7 +3,7 @@ FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime RUN pip install pip --upgrade RUN pip install optuna==2.10.1 -COPY experiments experiments +COPY experiments/optuna_minikube experiments/optuna_minikube COPY data data COPY setup.py setup.py COPY ml_benchmark ml_benchmark From 70594d41a45a55b1238e620ee8ad147befd0056e Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Mon, 22 Aug 2022 14:09:26 +0200 Subject: [PATCH 04/24] added wattage tracker per node --- ml_benchmark/metrics.py | 2 + ml_benchmark/metrics_storage.py | 3 +- ml_benchmark/resource_tracker.py | 89 +++++++++++++++---- .../test_ml_benchmark/test_resouce_tracker.py | 14 +++ 4 files changed, 88 insertions(+), 20 deletions(-) create mode 100644 test/test_ml_benchmark/test_resouce_tracker.py diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index a02955f..2b249ed 100644 --- a/ml_benchmark/metrics.py +++ 
b/ml_benchmark/metrics.py @@ -31,6 +31,7 @@ def __init__(self, node_id): self.memory_usage = None self.network_usage = None self.accelerator_usage = None + self.wattage = None def to_dict(self): node_dict = dict( @@ -39,6 +40,7 @@ def to_dict(self): cpu_usage=self.cpu_usage, memory_usage=self.memory_usage, network_usage=self.network_usage, + wattage=self.wattage, ) if self.accelerator_usage: node_dict["accelerator_usage"] = self.accelerator_usage diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 5d2e2d7..2808c04 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -91,7 +91,8 @@ def create_resource_table(self): Column("cpu_usage", Float), Column("memory_usage", Float), Column("network_usage", Float), - Column("accelerator_usage", Float) + Column("accelerator_usage", Float), + Column("wattage", Float), ) def create_classification_metrics_table(self): diff --git a/ml_benchmark/resource_tracker.py b/ml_benchmark/resource_tracker.py index a8318b1..88ad33a 100644 --- a/ml_benchmark/resource_tracker.py +++ b/ml_benchmark/resource_tracker.py @@ -1,3 +1,4 @@ +from abc import abstractmethod import datetime from prometheus_api_client import PrometheusConnect @@ -18,16 +19,61 @@ def run(self): self.function(*self.args, **self.kwargs) -def _sum_samples(samples): - return sum(map(lambda x: x.value, samples)) +class ResourceStore(object): + @abstractmethod + def setup(self, **kwargs): + """ + Setup the resource store, e.g., create a database connection. + """ + pass + + @abstractmethod + def store(self, node_usage): + """ + Store the node usage in the resource store. + """ + pass +class DBResouceStore(ResourceStore): + + def __init__(self): + self.engine = None + + def setup(self, **kwargs): + self._create_engine(kwargs.get("connection_string",MetricsStorageConfig.connection_string)) + + def _create_engine(self, connection_string): + try: + engine = create_engine(connection_string, echo=True) + except psycopg2.Error: + raise ConnectionError("Could not create an Engine for the Postgres DB.") + return engine + + def store(self, data): + metadata = MetaData(bind=self.engine) + node_usage = Table("resources", metadata, autoload_with=self.engine) + with self.engine.connect() as conn: + stmt = insert(node_usage).values(data.to_dict()) + conn.execute(stmt) + +class LoggingResouceStore(ResourceStore): + + def __init__(self): + self.log = [] + + def setup(self, **kwargs): + pass + + def store(self, data): + logging.info("Storing data: {}".format(data.to_dict())) + self.log.append(data) class ResourceTracker: # update every 2 seconds ... 
maybe make this tuneable UPDATE_INTERVAL = 2 - def __init__(self, prometheus_url): + def __init__(self, prometheus_url, resouce_store=DBResouceStore ): if prometheus_url is None: raise ValueError("Prometheus URL is required.") self.prometheus_url = prometheus_url @@ -36,7 +82,9 @@ def __init__(self, prometheus_url): if not self.prm.check_prometheus_connection(): raise ValueError("Could not connect to Prometheus.") - self.engine = self._create_engine() + self.store = resouce_store() + self.store.setup() + self.timer = RepeatTimer(self.UPDATE_INTERVAL, self.update) self._check_metrics() @@ -63,28 +111,26 @@ def _check_metrics(self): else: self.node_map = {} - def _create_engine(self): - try: - engine = create_engine(MetricsStorageConfig.connection_string, echo=True) - except psycopg2.Error: - raise ConnectionError("Could not create an Engine for the Postgres DB.") - return engine - def update(self): try: self.track() except Exception as e: logging.exception("Error while updating resource tracker. %s", e) - def track(self): - #query prometheus for node usage + def _query(self): + """ + Query Prometheus for the current resource usage. + """ + # ? is there a better way to map nodes using the node_exporter memory = 'avg by (instance) (node_memory_MemFree_bytes/node_memory_MemTotal_bytes)' cpu = '100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[2m])*100))' network = f'sum by (instance) (rate({self.network_metric}_receive_bytes_total[2m])+rate({self.network_metric}_transmit_bytes_total[2m]))' + wattage = f'sum by (node) (scaph_host_power_microwatts)' mem_result = self.prm.custom_query(memory) cpu_result = self.prm.custom_query(cpu) network_result = self.prm.custom_query(network) + wattage_result = self.prm.custom_query(wattage) logging.debug("Got results from Prometheus.", mem_result, cpu_result, network_result) @@ -94,7 +140,7 @@ def track(self): mem_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), mem_result)) cpu_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), cpu_result)) network_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), network_result)) - + wattage_result = dict(map(lambda x: (self._try_norm(x["metric"]["node"]), float(x["value"][1])), wattage_result)) logging.debug("Processed Prometheus Results", mem_result, cpu_result, network_result) # assert mem_result.keys() == cpu_result.keys() == network_result.keys() @@ -107,16 +153,21 @@ def track(self): n.cpu_usage = cpu_result.get(instance, 0) n.memory_usage = mem_result.get(instance, 0) n.network_usage = network_result.get(instance, 0) + if instance in wattage_result: + n.wattage = wattage_result[instance] + else: + n.wattage = -1 data.append(n) logging.debug("Added node usage for %s", instance) + + return data + + def track(self): + data = self._query() #insert the data for n in data: - metadata = MetaData(bind=self.engine) - node_usage = Table("resources", metadata, autoload_with=self.engine) - with self.engine.connect() as conn: - stmt = insert(node_usage).values(n.to_dict()) - conn.execute(stmt) + self.store.store(n) def _try_norm(self, instance: str): if instance in self.node_map: diff --git a/test/test_ml_benchmark/test_resouce_tracker.py b/test/test_ml_benchmark/test_resouce_tracker.py new file mode 100644 index 0000000..b157105 --- /dev/null +++ b/test/test_ml_benchmark/test_resouce_tracker.py @@ -0,0 +1,14 @@ + +import logging +from ml_benchmark.resource_tracker import 
ResourceTracker, LoggingResouceStore +def test_resouce_tracker(): + + + import time + logging.basicConfig(level=logging.DEBUG) + rt = ResourceTracker(prometheus_url="http://130.149.158.143:30041", resouce_store=LoggingResouceStore) + rt.start() + time.sleep(ResourceTracker.UPDATE_INTERVAL * 15) + rt.stop() + print(rt.store.log) + assert rt.store.log != [] \ No newline at end of file From 17454d9d2e7f9c4b0366b394f2f72c91991f7f37 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Tue, 23 Aug 2022 11:34:58 +0200 Subject: [PATCH 05/24] fixed wattage metrics measrumentes improved runner stability improved optuna_kube stability added tests for yaml serializations added integration tests for resouce_tracker removed sqlalchemy logging --- .gitignore | 6 +- .../ops/manifests/db/db-deployment.yml | 41 +---------- .../ops/manifests/trial/job.yml | 7 ++ .../optuna_kubernetes_benchmark.py | 68 +++++++++++++------ .../optuna_kubernetes/resource_definition.yml | 24 +++++++ .../optuna_minikube/hyperparameter_space.yml | 17 ++--- .../ops/manifests/trial/job.yml | 5 ++ .../optuna_minikube_benchmark.py | 21 +++--- experiments/optuna_minikube/optuna_trial.py | 10 +-- .../optuna_minikube/resource_definition.yml | 13 ++++ experiments/optuna_minikube/utils.py | 2 +- ml_benchmark/__init__.py | 3 +- ml_benchmark/benchmark_runner.py | 40 +++++++---- ml_benchmark/latency_tracker.py | 2 +- ml_benchmark/metrics.py | 13 +++- ml_benchmark/metrics_storage.py | 30 +++++--- ml_benchmark/resource_tracker.py | 54 +++++++++++---- ml_benchmark/utils/yaml_template_filler.py | 12 ++++ test/test_ml_benchmark/test.yaml | 13 ++++ .../test_ml_benchmark/test_resouce_tracker.py | 31 ++++++++- test/test_ml_benchmark/yaml_test.py | 16 +++++ 21 files changed, 299 insertions(+), 129 deletions(-) create mode 100644 experiments/optuna_kubernetes/resource_definition.yml create mode 100644 test/test_ml_benchmark/test.yaml create mode 100644 test/test_ml_benchmark/yaml_test.py diff --git a/.gitignore b/.gitignore index 6d7e8bd..f2ff438 100644 --- a/.gitignore +++ b/.gitignore @@ -134,4 +134,8 @@ exp__* data/ #idea -.idea/ \ No newline at end of file +.idea/ + +# +.envs +test/test_ml_benchmark/hyperparameter_space.yml diff --git a/experiments/optuna_kubernetes/ops/manifests/db/db-deployment.yml b/experiments/optuna_kubernetes/ops/manifests/db/db-deployment.yml index 51e7bd0..9782521 100644 --- a/experiments/optuna_kubernetes/ops/manifests/db/db-deployment.yml +++ b/experiments/optuna_kubernetes/ops/manifests/db/db-deployment.yml @@ -9,38 +9,6 @@ data: POSTGRES_DB: postgresdb POSTGRES_USER: postgresadmin POSTGRES_PASSWORD: admin123 -# --- -# kind: PersistentVolume -# apiVersion: v1 -# metadata: -# name: postgres-pv-volume -# labels: -# type: local -# app: postgres -# spec: -# # storageClassName: manual -# capacity: -# storage: 1Gi -# accessModes: -# - ReadWriteMany -# hostPath: -# path: "/mnt/data" -# reclaimPolicy: Delete -# --- -# kind: PersistentVolumeClaim -# apiVersion: v1 -# metadata: -# name: postgres-pv-claim -# labels: -# app: postgres -# spec: -# storageClassName: manual -# accessModes: -# - ReadWriteMany -# resources: -# requests: -# storage: 1Gi - --- apiVersion: apps/v1 kind: Deployment @@ -56,6 +24,8 @@ spec: labels: app: postgres spec: + nodeSelector: + scaphandre : "true" containers: - name: postgres image: postgres:10.4 @@ -65,17 +35,10 @@ spec: envFrom: - configMapRef: name: postgres-config - # volumeMounts: - # - mountPath: /var/lib/postgresql/data - # name: postgredb resources: limits: cpu: 1.0 memory: 1G - # volumes: - # - 
name: postgredb - # persistentVolumeClaim: - # claimName: postgres-pv-claim --- apiVersion: v1 kind: Service diff --git a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml index 49e8ff9..a915864 100644 --- a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml +++ b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml @@ -7,6 +7,8 @@ spec: parallelism: $worker_num template: spec: + nodeSelector: + scaphandre : "true" containers: - name: optuna-trial image: $worker_image @@ -22,5 +24,10 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" + # injects the kuberntes node name into eacah pod + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName restartPolicy: OnFailure diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index 0ac348a..7aa50ee 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -15,7 +15,7 @@ class OptunaKubernetesBenchmark(Benchmark): - def __init__(self, resources: dict) -> None: + def __init__(self, resources: dict, runner=None) -> None: """ Processes the given resources dictionary and creates class variables from it which are used in the benchmark. @@ -36,12 +36,20 @@ def __init__(self, resources: dict) -> None: self.workerCount = resources.get("workerCount", 4) self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") + self.runner = runner + self.hyperparameter = resources.get("hyperparameter") def deploy(self) -> None: """ Deploy DB """ # TODO: deal with exsiting resources... + + if self.hyperparameter: + #TODO: XXX we got to fix this dependency thing. eitehr merge minikube/kubernetes or use the same baseclass or something... + f = path.join(path.dirname(__file__),"..","optuna_minikube","hyperparameter_space.yml") + YamlTemplateFiller.as_yaml(f, self.hyperparameter) + try: resp = client.CoreV1Api().create_namespace( client.V1Namespace(metadata=client.V1ObjectMeta(name=self.namespace))) @@ -64,6 +72,10 @@ def deploy(self) -> None: self._watch_db() + # update the resoruce collector with the namespace used during the run + if self.runner and self.runner.resource_tracker: + self.runner.resource_tracker.namespace = self.namespace + @staticmethod def _is_create_conflict(e): if isinstance(e, ApiException): @@ -105,11 +117,19 @@ def run(self): if self._is_create_conflict(e): # lets remove the old one and try again client.BatchV1Api().delete_namespaced_job(name="optuna-trial", namespace=self.namespace) + #wait for that to complete + sleep(5) + # try again create_from_yaml( client.ApiClient(), yaml_objects=job_yml_objects, namespace=self.namespace, verbose=True) else: raise e - self._watch_trials() + try: + for t in range(1,5): + self._watch_trials(timeout=120*t) + except Exception as e: + #TODO deal with mitigatable errors + raise e def _getDBURL(self): postgres_sepc = client.CoreV1Api().read_namespaced_service(namespace=self.namespace, name="postgres") @@ -127,17 +147,30 @@ def collect_run_results(self): study = optuna.load_study(study_name=self.study_name, storage=self._getDBURL()) self.best_trial = study.best_trial - def _watch_trials(self): + def _watch_trials(self,timeout=120): """ Checks if Trials (Kubernetes Jobs) are completed. If not the process waits on it. 
""" w = watch.Watch() c = client.BatchV1Api() - for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=10): + + for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=timeout): if "object" in e and e["object"].status.completion_time is not None: w.stop() - return - print("Trials completed! Collecting Results") + print("Trials completed! Collecting Results") + return True + print("Watch_Trails timed out") + try: + job = client.BatchV1Api().read_namespaced_job(name="optuna-trial", namespace=self.namespace) + if job.status.failed != None and job.status.failed > 0: + raise Exception("Trials failed") + except ApiException as e: + if e.status == 404: + raise Exception("Job not created...") + raise e + return False + + def test(self): @@ -169,6 +202,7 @@ def undeploy(self): def _watch_namespace(self): try: + #TODO: XXX fix me! client.CoreV1Api().read_namespace_status(self.namespace).to_dict() sleep(2) except client.exceptions.ApiException: @@ -193,27 +227,23 @@ def _watch_db(self): if __name__ == "__main__": from ml_benchmark.benchmark_runner import BenchmarkRunner from urllib.request import urlopen - # The basic config for the workload. For testing purposes set epochs to one. - # For benchmarking take the default value of 100 - # your ressources the optimization should run on - resource_definition = { - "workerCpu": 2, - "workerMemory": 2, - "workerCount": 4, + from ml_benchmark.utils.yml_parser import YMLParser + resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml")) + + # TODO: XXX remove this hardcoded values + to_automate = { "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), - "studyName": "optuna-study", "dockerImageTag": "tawalaya/optuna-trial:latest", "dockerImageBuilder": "docker", "kubernetesNamespace": "optuna-study", "kubernetesContext": "admin@smile", "kubernetesMasterIP": "130.149.158.143", - "deleteAfterRun": False, "prometheus_url": "http://130.149.158.143:30041", + "deleteAfterRun":False, } + resources.update(to_automate) - # TODO: hyperparams. 
- - # import an use the runner runner = BenchmarkRunner( - benchmark_cls=OptunaKubernetesBenchmark, resources=resource_definition) + benchmark_cls=OptunaKubernetesBenchmark, resources=resources) runner.run() + diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml new file mode 100644 index 0000000..3f0ce28 --- /dev/null +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -0,0 +1,24 @@ + +workerCpu: 2 +workerMemory: 2 +workerCount: 2 +metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), +kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n") +dockerImageTag: tawalaya/optuna-trial:latest +dockerImageBuilder: docker +kubernetesNamespace: optuna-study +kubernetesContext: "minikube" +deleteAfterRun: True +hyperparameter: + learning_rate: + start: 1e-4 + end: 1e-2 + step_size: 1e-5 + weight_decay: + start: 1e-6 + end: 1e-4 + step_size: 1e-5 + # hidden_layer_config: + # start: [10] + # end: [100, 100, 100] + # step_size: [10, 1] diff --git a/experiments/optuna_minikube/hyperparameter_space.yml b/experiments/optuna_minikube/hyperparameter_space.yml index 6dd7226..3a76fc2 100644 --- a/experiments/optuna_minikube/hyperparameter_space.yml +++ b/experiments/optuna_minikube/hyperparameter_space.yml @@ -1,12 +1,9 @@ +# generated file - do not edit learning_rate: - start: 1e-4 - end: 1e-2 - step_size: 1e-5 + end: 0.01 + start: 0.0001 + step_size: 1.0e-05 weight_decay: - start: 1e-6 - end: 1e-4 - step_size: 1e-5 -hidden_layer_config: - start: [10] - end: [100, 100, 100] - step_size: [10, 1] + end: 0.0001 + start: 1.0e-06 + step_size: 1.0e-05 diff --git a/experiments/optuna_minikube/ops/manifests/trial/job.yml b/experiments/optuna_minikube/ops/manifests/trial/job.yml index 202fd2d..a3e111e 100644 --- a/experiments/optuna_minikube/ops/manifests/trial/job.yml +++ b/experiments/optuna_minikube/ops/manifests/trial/job.yml @@ -23,5 +23,10 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" + # injects the kuberntes node name into eacah pod + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName restartPolicy: OnFailure diff --git a/experiments/optuna_minikube/optuna_minikube_benchmark.py b/experiments/optuna_minikube/optuna_minikube_benchmark.py index 05dbfb0..a18793c 100644 --- a/experiments/optuna_minikube/optuna_minikube_benchmark.py +++ b/experiments/optuna_minikube/optuna_minikube_benchmark.py @@ -36,12 +36,21 @@ def __init__(self, resources: dict) -> None: self.workerCount = resources.get("workerCount", 4) self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") + self.hyperparameter = resources.get("hyperparameter") def deploy(self) -> None: """ Deploy DB """ + # TODO: deal with exsiting resources... + + #generate hyperparameter file from resouces def. 
+ + if self.hyperparameter: + f = path.join(path.dirname(__file__),"hyperparameter_space.yml") + YamlTemplateFiller.as_yaml(f, self.hyperparameter) + try: resp = client.CoreV1Api().create_namespace( client.V1Namespace(metadata=client.V1ObjectMeta(name=self.namespace))) @@ -204,18 +213,6 @@ def _watch_db(self): "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), "kubernetesMasterIP": subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n")} resources.update(to_automate) - # resources = { - # "workerCpu": 2, - # "workerMemory": 2, - # "workerCount": 4, - # "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), - # "kubernetesMasterIP": subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n"), - # "dockerImageTag": "tawalaya/optuna-trial:latest", - # "dockerImageBuilder": "minikube", - # "kubernetesNamespace": "optuna-study", - # "kubernetesContext": "minikube", - # "deleteAfterRun": True, - # } # TODO: hyperparams. diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 6c68b22..92cac23 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -12,9 +12,9 @@ def optuna_trial(trial): # optuna doesnt care, these lines of code just get hyperparameters from the search space in grid search lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) - hidden_layer_config = trial.suggest_int("hidden_layer_config", 1, 4) + # hidden_layer_config = trial.suggest_int("hidden_layer_config", 1, 4) objective.set_hyperparameters( - {"learning_rate": lr, "weight_decay": decay, "hidden_layer_config": hidden_layer_config}) + {"learning_rate": lr, "weight_decay": decay})#, "hidden_layer_config": hidden_layer_config}) objective.train() validation_scores = objective.validate() return validation_scores["macro avg"]["f1-score"] @@ -24,7 +24,8 @@ def optuna_trial(trial): try: study_name = os.environ.get("STUDY_NAME") database_conn = os.environ.get("DB_CONN") - search_space = generate_search_space("hyp_space_definition.yml") + search_space = generate_search_space(os.path.join(os.path.dirname(__file__),"hyperparameter_space.yml")) + print(search_space) study = optuna.create_study( study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, sampler=optuna.samplers.GridSampler(search_space)) @@ -32,5 +33,6 @@ def optuna_trial(trial): # TODO: add small wait to avoid missing metrics sleep(5) sys.exit(0) - except Exception: + except Exception as e: + print(e) sys.exit(1) diff --git a/experiments/optuna_minikube/resource_definition.yml b/experiments/optuna_minikube/resource_definition.yml index 5102319..3c8d03d 100644 --- a/experiments/optuna_minikube/resource_definition.yml +++ b/experiments/optuna_minikube/resource_definition.yml @@ -9,3 +9,16 @@ dockerImageBuilder: minikube kubernetesNamespace: optuna-study kubernetesContext: "minikube" deleteAfterRun: True +hyperparameter: + learning_rate: + start: 1e-4 + end: 1e-2 + step_size: 1e-5 + weight_decay: + start: 1e-6 + end: 1e-4 + step_size: 1e-5 + hidden_layer_config: + start: [10] + end: [100, 100, 100] + step_size: [10, 1] diff --git a/experiments/optuna_minikube/utils.py b/experiments/optuna_minikube/utils.py index 0b7b93b..d0946a5 100644 --- a/experiments/optuna_minikube/utils.py +++ b/experiments/optuna_minikube/utils.py @@ -3,7 +3,7 @@ import itertools def 
generate_search_space(yaml_file_path): - search_space = YMLParser.parse("experiments/optuna_minikube/hyp_space_definition.yml") + search_space = YMLParser.parse(yaml_file_path) modified_search_space = {} hidden_layer_config = [] for key, value in search_space.items(): diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index 4ab3c5f..10ded25 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -3,6 +3,7 @@ "scikit-learn==0.24.2", "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.4.2", "psycopg2-binary", - "prometheus-api-client"], + "prometheus-api-client", + "ruamel.yaml"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index 482458b..69b674f 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -21,9 +21,6 @@ class Benchmark(ABC): This class serves as an Interface for a benchmark. All neccessary methods have to be implemented in the subclass that is using the interface. Make sure to use the predefined static variables. Your benchmark will most likely not run properly if the variables value remains to be "None". - - Args: - ABC (_type_): Abstract Base Class """ # TODO: objective and grid are not allowed to be in the benchmark @@ -124,7 +121,7 @@ def __init__( self.create_benchmark_folder(self.benchmark_folder) # add input and output size to the benchmark. - self.benchmark = benchmark_cls(resources) + self.benchmark = benchmark_cls(resources, self) # set seeds self._set_all_seeds() @@ -146,32 +143,45 @@ def run(self): Raises: ValueError: _description_ """ - run_process = [ - self.benchmark.deploy, self.benchmark.setup, self.benchmark.run, - self.benchmark.collect_run_results, - self.benchmark.test, self.benchmark.collect_benchmark_metrics] benchmark_results = None try: - self.metrics_storage.start_db() + self.metrics_storage.start_db() + + # Deploy the SUT + with Latency(self.benchmark.deploy) as latency: + self.benchmark.deploy() + self.latency_tracker.track(latency) + + # RUN the benchmark + run_process = [ + self.benchmark.setup, self.benchmark.run, + self.benchmark.collect_run_results, + self.benchmark.test, self.benchmark.collect_benchmark_metrics] + if self.resource_tracker is not None: self.resource_tracker.start() + for benchmark_fun in run_process: with Latency(benchmark_fun) as latency: benchmark_fun() self.latency_tracker.track(latency) + + # Get the results of the benchmark benchmark_results = self.metrics_storage.get_benchmark_results() # just to be save we wait a bit before killing shit. + + except (docker.errors.APIError, AttributeError, ValueError, RuntimeError) as e: + print(e) + raise ValueError("No Results obtained, Benchmark failed.") + finally: sleep(5) if self.resource_tracker is not None: self.resource_tracker.stop() self.metrics_storage.stop_db() - except (docker.errors.APIError, AttributeError, ValueError, RuntimeError) as e: - print(e) - raise ValueError("No Results obtained, Benchmark failed.") - finally: + # Undeploy the SUT try: self.benchmark.undeploy() except Exception: @@ -181,7 +191,9 @@ def run(self): self.metrics_storage.stop_db() except Exception: pass - + + # TODO: move to finally block to ensure that results are always caputres if possible? 
+ # persist the results self.save_benchmark_results(benchmark_results) def _set_all_seeds(self): diff --git a/ml_benchmark/latency_tracker.py b/ml_benchmark/latency_tracker.py index 0f98094..05f865d 100644 --- a/ml_benchmark/latency_tracker.py +++ b/ml_benchmark/latency_tracker.py @@ -40,7 +40,7 @@ def __init__(self, connection_string: str = None) -> None: def _create_engine(self, connection_string): try: - engine = create_engine(connection_string, echo=True) + engine = create_engine(connection_string, echo=False) except psycopg2.Error: raise ConnectionError("Could not create an Engine for the Postgres DB.") return engine diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index 2b249ed..e7d3c9d 100644 --- a/ml_benchmark/metrics.py +++ b/ml_benchmark/metrics.py @@ -32,6 +32,7 @@ def __init__(self, node_id): self.network_usage = None self.accelerator_usage = None self.wattage = None + self.processes = None def to_dict(self): node_dict = dict( @@ -41,11 +42,15 @@ def to_dict(self): memory_usage=self.memory_usage, network_usage=self.network_usage, wattage=self.wattage, + processes=int(self.processes), ) if self.accelerator_usage: node_dict["accelerator_usage"] = self.accelerator_usage return {key: _convert_datetime_to_unix(value) for key, value in node_dict.items()} + + def __repr__(self): + return str(self.to_dict()) class Latency(Metric): @@ -68,9 +73,15 @@ def __init__(self, func) -> None: AttributeError: _description_ """ super().__init__() + #TODO: make each id filed also availible as a column process_id = os.getpid() - hostname = socket.gethostname() + # inject the NODE_NAME (from the environment) - should be availble in containerized environments + if os.getenv("NODE_NAME"): + hostname = f'{os.getenv("NODE_NAME")}_{socket.gethostname()}' + else: + hostname = f'BARE_{socket.gethostname()}' self.add_to_id(f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}") + self.function_name: str = func.__name__ try: diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 2808c04..421f220 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -1,7 +1,8 @@ import logging import time import docker -from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select +from docker.errors import APIError +from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select, Integer from ml_benchmark.config import MetricsStorageConfig @@ -29,6 +30,7 @@ def __init__(self, connection_string: str = None) -> None: """ logging.basicConfig() logging.getLogger('sqlalchemy').setLevel(logging.ERROR) + logging.getLogger('sqlalchemy.engine').setLevel(logging.ERROR) self.meta = None self.client = None @@ -46,13 +48,21 @@ def start_db(self): def setup_db(self): self.client = docker.from_env() - self.client.containers.run( - "postgres:14.1", detach=True, - environment=[ - f"POSTGRES_PASSWORD={self.password}", f"POSTGRES_DB={self.db}", f"POSTGRES_USER={self.user}"], - ports={f'{self.port}/tcp': self.port}, - name="postgres", - remove=True) + try: + self.client.containers.run( + "postgres:14.1", detach=True, + environment=[ + f"POSTGRES_PASSWORD={self.password}", f"POSTGRES_DB={self.db}", f"POSTGRES_USER={self.user}"], + ports={f'{self.port}/tcp': self.port}, + name="postgres", + remove=True) + except APIError as e: + if e.status_code == 409: + #TODO: we maybe want to drop the database in these cases + logging.info("Postgres is already running") + else: + raise e + container = self.client.containers.get("postgres") # 
checks if db is up while "accepting connections" not in container.exec_run("pg_isready").output.decode(): @@ -71,7 +81,8 @@ def create_metrics_table(self): self.create_latency_table() self.create_resource_table() self.create_classification_metrics_table() - self.meta.create_all(self.engine) + self.meta.create_all(self.engine,checkfirst=True) + def create_latency_table(self): self.latency = Table( @@ -93,6 +104,7 @@ def create_resource_table(self): Column("network_usage", Float), Column("accelerator_usage", Float), Column("wattage", Float), + Column("processes", Integer), ) def create_classification_metrics_table(self): diff --git a/ml_benchmark/resource_tracker.py b/ml_benchmark/resource_tracker.py index 88ad33a..22bc0f4 100644 --- a/ml_benchmark/resource_tracker.py +++ b/ml_benchmark/resource_tracker.py @@ -8,7 +8,7 @@ from sqlalchemy import MetaData, Table, create_engine, insert from threading import Timer -from ml_benchmark.metrics import NodeUsage +from ml_benchmark.metrics import Metric, NodeUsage import logging @@ -20,6 +20,9 @@ def run(self): class ResourceStore(object): + """ + Interface for swapping out different implementations of the resource store, e.g., a database, a file, etc. + """ @abstractmethod def setup(self, **kwargs): """ @@ -28,33 +31,36 @@ def setup(self, **kwargs): pass @abstractmethod - def store(self, node_usage): + def store(self, node_usage:Metric, **kwargs): """ Store the node usage in the resource store. """ pass -class DBResouceStore(ResourceStore): +class MetricsResouceStore(ResourceStore): def __init__(self): self.engine = None def setup(self, **kwargs): - self._create_engine(kwargs.get("connection_string",MetricsStorageConfig.connection_string)) + self.engine = self._create_engine(kwargs.get("connection_string",MetricsStorageConfig.connection_string)) def _create_engine(self, connection_string): try: - engine = create_engine(connection_string, echo=True) + engine = create_engine(connection_string, echo=False) except psycopg2.Error: raise ConnectionError("Could not create an Engine for the Postgres DB.") return engine - def store(self, data): - metadata = MetaData(bind=self.engine) - node_usage = Table("resources", metadata, autoload_with=self.engine) - with self.engine.connect() as conn: - stmt = insert(node_usage).values(data.to_dict()) - conn.execute(stmt) + def store(self, data:Metric, **kwargs): + try: + metadata = MetaData(bind=self.engine) + node_usage = Table(kwargs.get("table_name","resources"), metadata, autoload_with=self.engine) + with self.engine.connect() as conn: + stmt = insert(node_usage).values(data.to_dict()) + conn.execute(stmt) + except Exception as e: + logging.warn(f"Could not store the data in the Metrics DB {data} - {e}") class LoggingResouceStore(ResourceStore): @@ -73,7 +79,7 @@ class ResourceTracker: # update every 2 seconds ... 
maybe make this tuneable UPDATE_INTERVAL = 2 - def __init__(self, prometheus_url, resouce_store=DBResouceStore ): + def __init__(self, prometheus_url, resouce_store=MetricsResouceStore ): if prometheus_url is None: raise ValueError("Prometheus URL is required.") self.prometheus_url = prometheus_url @@ -89,11 +95,13 @@ def __init__(self, prometheus_url, resouce_store=DBResouceStore ): self._check_metrics() + self.namespace = None + def _check_metrics(self): available = set(self.prm.all_metrics()) #check node_exporter metrics - cpu/memory - required = {"node_memory_MemFree_bytes", "node_memory_MemTotal_bytes", "node_cpu_seconds_total"} + required = {"node_memory_MemFree_bytes", "node_memory_MemTotal_bytes", "node_cpu_seconds_total","scaph_host_power_microwatts","scaph_process_power_consumption_microwatts"} if not required.issubset(available): raise ValueError("Prometheus does not provide the required metrics.") @@ -110,6 +118,7 @@ def _check_metrics(self): self.node_map = dict(map(lambda x: (x["internal_ip"], x["node"]), map(lambda x: x["metric"], info))) else: self.node_map = {} + def update(self): try: @@ -124,13 +133,23 @@ def _query(self): # ? is there a better way to map nodes using the node_exporter memory = 'avg by (instance) (node_memory_MemFree_bytes/node_memory_MemTotal_bytes)' cpu = '100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[2m])*100))' + + ##needs mapping network = f'sum by (instance) (rate({self.network_metric}_receive_bytes_total[2m])+rate({self.network_metric}_transmit_bytes_total[2m]))' + #TODO: reduce measurments to only the ones we care about - dose currently not work with scaph_process_power_consumption_microwatts + #if we can we collect the power consumption from the scaph_host_power_microwatts metric only for the used namespace + # if self.namespace: + # wattage = f'sum by (node) (scaph_process_power_consumption_microwatts{{namespace="{self.namespace}"}})' + # processes = f'count by (node) (scaph_process_power_consumption_microwatts{{namespace="{self.namespace}"}})' + # else : wattage = f'sum by (node) (scaph_host_power_microwatts)' + processes = 'count by (node) (scaph_process_power_consumption_microwatts)' mem_result = self.prm.custom_query(memory) cpu_result = self.prm.custom_query(cpu) network_result = self.prm.custom_query(network) wattage_result = self.prm.custom_query(wattage) + processes_result = self.prm.custom_query(processes) logging.debug("Got results from Prometheus.", mem_result, cpu_result, network_result) @@ -141,7 +160,9 @@ def _query(self): cpu_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), cpu_result)) network_result = dict(map(lambda x: (self._try_norm(x["metric"]["instance"]), float(x["value"][1])), network_result)) wattage_result = dict(map(lambda x: (self._try_norm(x["metric"]["node"]), float(x["value"][1])), wattage_result)) - logging.debug("Processed Prometheus Results", mem_result, cpu_result, network_result) + processes_result = dict(map(lambda x: (self._try_norm(x["metric"]["node"]), float(x["value"][1])), processes_result)) + + logging.debug("Processed Prometheus Results", mem_result, cpu_result, network_result, wattage_result, processes_result) # assert mem_result.keys() == cpu_result.keys() == network_result.keys() @@ -155,10 +176,13 @@ def _query(self): n.network_usage = network_result.get(instance, 0) if instance in wattage_result: n.wattage = wattage_result[instance] + n.processes = processes_result[instance] else: n.wattage = -1 + n.processes = -1 + data.append(n) - 
logging.debug("Added node usage for %s", instance) + # logging.debug("Added node usage for %s", instance) return data diff --git a/ml_benchmark/utils/yaml_template_filler.py b/ml_benchmark/utils/yaml_template_filler.py index 95ec831..6bb40b2 100644 --- a/ml_benchmark/utils/yaml_template_filler.py +++ b/ml_benchmark/utils/yaml_template_filler.py @@ -22,3 +22,15 @@ def load_and_fill_yaml_template(yaml_path: str, yaml_values: dict) -> dict: with open(yaml_path, "r") as f: job_template = Template(f.read()) return yaml.safe_load_all(job_template.substitute(yaml_values)) + + @staticmethod + def as_yaml(yaml_path: str,obj : object) -> None: + """Safely writes an object to a YAML-File. + Args: + yaml_path (str): filename to write yaml to + obj (any): object to save as yaml + """ + with open(yaml_path, "w") as f: + f.write("# generated file - do not edit\n") + yaml.dump(obj, f) + \ No newline at end of file diff --git a/test/test_ml_benchmark/test.yaml b/test/test_ml_benchmark/test.yaml new file mode 100644 index 0000000..968d6d9 --- /dev/null +++ b/test/test_ml_benchmark/test.yaml @@ -0,0 +1,13 @@ +kubernetesContext: "minikube" +metricsIP: auto +kubernetesMasterIP: minikube +deleteAfterRun: true +hyperparameter: + learning_rate: + start: 1e-4 + end: 1e-2 + step_size: 1e-5 + hidden_layer_config: + start: [10] + end: [100, 100, 100] + step_size: [10, 1] diff --git a/test/test_ml_benchmark/test_resouce_tracker.py b/test/test_ml_benchmark/test_resouce_tracker.py index b157105..23500e5 100644 --- a/test/test_ml_benchmark/test_resouce_tracker.py +++ b/test/test_ml_benchmark/test_resouce_tracker.py @@ -1,12 +1,39 @@ import logging +import os +import pytest from ml_benchmark.resource_tracker import ResourceTracker, LoggingResouceStore -def test_resouce_tracker(): +import requests +@pytest.fixture +def prometeus_url(): + url = os.environ.get("PROMETHEUS_URL", "http://localhost:9090") + try: + resp = requests.get(url) + if resp.status_code != 200: + pytest.skip("Prometheus is availible") + except Exception: + pytest.skip("Could not connect to Prometheus.") + + return url + + +def test_resouce_tracker(prometeus_url): + import time + logging.basicConfig(level=logging.DEBUG) + rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingResouceStore) + rt.start() + time.sleep(ResourceTracker.UPDATE_INTERVAL * 15) + rt.stop() + print(rt.store.log) + assert rt.store.log != [] + +def test_resouce_tracker_with_namespace(prometeus_url): import time logging.basicConfig(level=logging.DEBUG) - rt = ResourceTracker(prometheus_url="http://130.149.158.143:30041", resouce_store=LoggingResouceStore) + rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingResouceStore) + rt.namespace = "optuna-study" rt.start() time.sleep(ResourceTracker.UPDATE_INTERVAL * 15) rt.stop() diff --git a/test/test_ml_benchmark/yaml_test.py b/test/test_ml_benchmark/yaml_test.py new file mode 100644 index 0000000..9e60b78 --- /dev/null +++ b/test/test_ml_benchmark/yaml_test.py @@ -0,0 +1,16 @@ +import logging +from os import path +from ml_benchmark.utils.yml_parser import YMLParser +from ml_benchmark.utils.yaml_template_filler import YamlTemplateFiller + +def test(): + resources = YMLParser.parse(path.join(path.dirname(__file__),"test.yaml")) + assert resources["deleteAfterRun"] + + print(resources["hyperparameter"]) + + YamlTemplateFiller.as_yaml(path.join(path.dirname(__file__),"hyperparameter_space.yml"), resources["hyperparameter"]) + params = 
YMLParser.parse(path.join(path.dirname(__file__),"hyperparameter_space.yml")) + assert params == resources["hyperparameter"] + + \ No newline at end of file From bc1db2abf5cf58b0145cef1b24fb3a2ac370e7c7 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Tue, 23 Aug 2022 11:59:44 +0200 Subject: [PATCH 06/24] added rnode experiment runner for k8s optuna --- .../optuna_kubernetes_rnode.py | 43 +++++++++++++++++++ .../optuna_kubernetes/resource_definition.yml | 7 ++- ml_benchmark/benchmark_runner.py | 3 +- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 experiments/optuna_kubernetes/optuna_kubernetes_rnode.py diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py new file mode 100644 index 0000000..70bfda7 --- /dev/null +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -0,0 +1,43 @@ +import logging +from os import path +from time import sleep +from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark +from ml_benchmark.benchmark_runner import BenchmarkRunner +from urllib.request import urlopen +from ml_benchmark.utils.yml_parser import YMLParser + +if __name__ == "__main__": + metricsIP = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() + + # read in base configuration + resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml")) + # TODO: XXX remove this hardcoded values + to_automate = { + "metricsIP": metricsIP, + "dockerImageTag": "tawalaya/optuna-trial:latest", + "dockerImageBuilder": "docker", + #force random namespaces to reduce conflicts + # "kubernetesNamespace": "optuna-study", + "kubernetesContext": "admin@smile", + "kubernetesMasterIP": "130.149.158.143", + "prometheus_url": "http://130.149.158.143:30041", + "deleteAfterRun":True, + } + resources.update(to_automate) + + repetions = 3 + nodes = [1,2,3,4,5,6,7,8,9,10] + for i in range(repetions): + for n in nodes: + sleep(3) + logging.info(f"Starting Run {i} with {n} nodes") + try: + resources["workerCount"] = n + resources["goal"] = f"rnode{n}-{i}" + runner = BenchmarkRunner( + benchmark_cls=OptunaKubernetesBenchmark, resources=resources) + runner.run() + sleep(7) + except Exception as e: + logging.warn(f'Failed Run {i} with {n} nodes - {e}') + \ No newline at end of file diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml index 3f0ce28..19e4f07 100644 --- a/experiments/optuna_kubernetes/resource_definition.yml +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -1,12 +1,11 @@ -workerCpu: 2 -workerMemory: 2 -workerCount: 2 +workerCpu: 1 +workerMemory: 1 +workerCount: 1 metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n") dockerImageTag: tawalaya/optuna-trial:latest dockerImageBuilder: docker -kubernetesNamespace: optuna-study kubernetesContext: "minikube" deleteAfterRun: True hyperparameter: diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index 69b674f..dda0eb9 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -117,6 +117,7 @@ def __init__( self.rundate = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") benchmark_path = os.path.abspath(os.path.dirname(inspect.getabsfile(benchmark_cls))) self.bench_name = f"{benchmark_cls.__name__}" + 
self.bench_goal = resources.get("goal", "debug") self.benchmark_folder = os.path.join(benchmark_path, f"benchmark__{self.bench_name}") self.create_benchmark_folder(self.benchmark_folder) @@ -223,7 +224,7 @@ def save_benchmark_results(self, benchmark_results): with open( os.path.join( self.benchmark_folder, - f"benchmark_results__{self.rundate}__id.json"), "w" + f"benchmark_results__{self.rundate}__{self.bench_goal}.json"), "w" ) as f: json.dump(benchmark_result_dict, f) print("Results saved!") From 62dbbf20bcf66ae47b9d4ff197e9b11afc072470 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 24 Aug 2022 10:02:22 +0200 Subject: [PATCH 07/24] minor changes to experiment code --- experiments/optuna_kubernetes/optuna_kubernetes_rnode.py | 7 ++++--- ml_benchmark/benchmark_runner.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 70bfda7..781562a 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -26,9 +26,9 @@ resources.update(to_automate) repetions = 3 - nodes = [1,2,3,4,5,6,7,8,9,10] - for i in range(repetions): - for n in nodes: + + for n in range(1,10): + for i in range(1,repetions): sleep(3) logging.info(f"Starting Run {i} with {n} nodes") try: @@ -38,6 +38,7 @@ benchmark_cls=OptunaKubernetesBenchmark, resources=resources) runner.run() sleep(7) + runner = None except Exception as e: logging.warn(f'Failed Run {i} with {n} nodes - {e}') \ No newline at end of file diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index dda0eb9..6635462 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -180,6 +180,7 @@ def run(self): sleep(5) if self.resource_tracker is not None: self.resource_tracker.stop() + self.resource_tracker = None self.metrics_storage.stop_db() # Undeploy the SUT From 666991d80c253bd741003789ae68a5603b4eadda Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 24 Aug 2022 12:48:25 +0200 Subject: [PATCH 08/24] [WIP] implemented classification tracking --- experiments/optuna_minikube/optuna_trial.py | 1 + ml_benchmark/decorators.py | 48 +++++++ ml_benchmark/latency_tracker.py | 20 --- ml_benchmark/metrics.py | 60 ++++++--- ml_benchmark/metrics_storage.py | 123 ++++++++++++++++-- ml_benchmark/resource_tracker.py | 71 +--------- ml_benchmark/results_tracker.py | 18 +++ ml_benchmark/workload/mnist/mlp_objective.py | 4 +- test/conftest.py | 21 ++- test/test_ml_benchmark/test_metrics.py | 36 +++++ .../test_ml_benchmark/test_resouce_tracker.py | 23 +--- 11 files changed, 288 insertions(+), 137 deletions(-) create mode 100644 ml_benchmark/decorators.py create mode 100644 ml_benchmark/results_tracker.py create mode 100644 test/test_ml_benchmark/test_metrics.py diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 92cac23..f0cfafd 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -3,6 +3,7 @@ from time import sleep import optuna from ml_benchmark.workload.mnist.mnist_task import MnistTask +from ml_benchmark.results_tracker import ResultsTracker from utils import generate_search_space diff --git a/ml_benchmark/decorators.py b/ml_benchmark/decorators.py new file mode 100644 index 0000000..8b4b305 --- /dev/null +++ b/ml_benchmark/decorators.py @@ -0,0 +1,48 @@ + +from 
ml_benchmark.latency_tracker import LatencyTracker +from ml_benchmark.metrics import Latency +from ml_benchmark.results_tracker import ResultTracker + + +def validation_latency_decorator(func): + """ + A Decorator to record the latency of the decorated function. Once it is recorded the LatencyTracker + writes the result into the postgres database. + + We assume that that the decorated function returns a dictionary with the following keys: + - "macro avg": the macro average of the validation with the keys: + - "f1-score": the f1-score + + """ + def result_func(*args, **kwargs): + func.__self__ = args[0] + with Latency(func) as latency: + result = func(*args, **kwargs) + latency_tracker = LatencyTracker() + tracker = ResultTracker() + + latency_tracker.track(latency) + #XXX this locks us into the f1-score, we probably want to track all callification metrics not just f1-score. MG please help :) + tracker.track(func, result["macro avg"]["f1-score"], "f1-score") + + func.__self__ = None + return result_func + +def latency_decorator(func): + """A Decorator to record the latency of the decorated function. Once it is recorded the LatencyTracker + writes the result into the postgres databse. + + Decorators overwrite a decorated function once the code is passed to the compier + + Args: + func (_type_): _description_ + """ + def latency_func(*args, **kwargs): + func.__self__ = args[0] + with Latency(func) as latency: + result = func(*args, **kwargs) + latency_tracker = LatencyTracker() + latency_tracker.track(latency) + func.__self__ = None + return result + return latency_func \ No newline at end of file diff --git a/ml_benchmark/latency_tracker.py b/ml_benchmark/latency_tracker.py index 05f865d..b23ce3d 100644 --- a/ml_benchmark/latency_tracker.py +++ b/ml_benchmark/latency_tracker.py @@ -6,7 +6,6 @@ from ml_benchmark.config import MetricsStorageConfig -from ml_benchmark.metrics import Latency class Tracker(ABC): @@ -84,25 +83,6 @@ def shape_connection_string(self, host): return f"postgresql://{user}:{password}@{host}:{port}/{db}" -def latency_decorator(func): - """A Decorator to record the latency of the decorated function. Once it is recorded the LatencyTracker - writes the result into the postgres databse. 
- - Decorators overwrite a decorated function once the code is passed to the compier - - Args: - func (_type_): _description_ - """ - def latency_func(*args, **kwargs): - func.__self__ = args[0] - with Latency(func) as latency: - result = func(*args, **kwargs) - latency_tracker = LatencyTracker() - latency_tracker.track(latency) - func.__self__ = None - return result - return latency_func - if __name__ == "__main__": diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index e7d3c9d..13a8452 100644 --- a/ml_benchmark/metrics.py +++ b/ml_benchmark/metrics.py @@ -1,3 +1,4 @@ +import logging import os from datetime import datetime, timedelta from uuid import uuid4 @@ -53,6 +54,45 @@ def __repr__(self): return str(self.to_dict()) +class Result(Metric): + def __init__(self, objective): + super().__init__() + + # add fingerprinting data to self + fp = _fingerprint(self,objective) + self.__dict__.update(fp) + self.timestamp = datetime.now().ctime() + self.value = None + self.measure = None + + def to_dict(self): + return self.__dict__ + + + +def _fingerprint(metric,func): + process_id = os.getpid() + # inject the NODE_NAME (from the environment) - should be availble in containerized environments + if os.getenv("NODE_NAME"): + hostname = f'{os.getenv("NODE_NAME")}_{socket.gethostname()}' + else: + hostname = f'BARE_{socket.gethostname()}' + metric.add_to_id(f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}") + + + try: + obj_hash = hash(func.__self__) + except AttributeError as e: + logging.warn(f"fingerprinting error {e}") + raise AttributeError("Functions need to be part of a class in order to measure their latency. {e}") + + return { + "process_id": process_id, + "hostname": hostname, + "obj_hash": obj_hash, + } + + class Latency(Metric): def __init__(self, func) -> None: @@ -74,22 +114,12 @@ def __init__(self, func) -> None: """ super().__init__() #TODO: make each id filed also availible as a column - process_id = os.getpid() - # inject the NODE_NAME (from the environment) - should be availble in containerized environments - if os.getenv("NODE_NAME"): - hostname = f'{os.getenv("NODE_NAME")}_{socket.gethostname()}' - else: - hostname = f'BARE_{socket.gethostname()}' - self.add_to_id(f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}") - + fp = _fingerprint(self,func) + obj_hash = fp["obj_hash"] + function_name: str = func.__name__ + self.function_name = function_name + self.add_to_id(f"function-name_{function_name}__objHash_{obj_hash}") - self.function_name: str = func.__name__ - try: - self.obj_hash = hash(func.__self__) - except AttributeError as e: - print(e) - raise AttributeError("Functions need to be part of a class in order to measure their latency.") - self.add_to_id(f"function-name_{self.function_name}__objHash_{self.obj_hash}") self.start_time: float = None self.end_time: float = None self.duration_sec: float = None diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 421f220..988d01d 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -1,11 +1,21 @@ +from abc import abstractmethod import logging import time import docker from docker.errors import APIError -from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select, Integer +from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select, Integer, insert +import psycopg2 +import os from ml_benchmark.config import MetricsStorageConfig +from ml_benchmark.metrics import Metric +# see metrics._fingerprint for 
more information +fingerprint_columns = [ + Column("process_id", Integer), + Column("hostname", String), + Column("obj_hash", Integer), +] class MetricsStorage: @@ -67,6 +77,7 @@ def setup_db(self): # checks if db is up while "accepting connections" not in container.exec_run("pg_isready").output.decode(): time.sleep(2) + #TODO: should have a timeout condition print("DB-Container Running") def stop_db(self): @@ -92,6 +103,7 @@ def create_latency_table(self): Column("start_time", String), Column("end_time", String), Column("duration_sec", Float) + #TODO add fingerprint ) def create_resource_table(self): @@ -108,33 +120,118 @@ def create_resource_table(self): ) def create_classification_metrics_table(self): - pass + self.classification_metrics = Table( + "classification_metrics", self.meta, + Column("metric_id", String, primary_key=True), + Column("timestamp", String, primary_key=True), + Column("value", Float), + Column("measure", String), + *fingerprint_columns + ) def get_benchmark_results(self): latency = self.get_latency_results() resources = self.get_resource_results() classification = self.get_classification_results() return dict(latency=latency, resources=resources, classification=classification) - - def get_latency_results(self): + + def _get_table_results(self,table): result_list = [] with self.engine.connect() as conn: - stmt = select(self.latency) + stmt = select(table) cursor = conn.execute(stmt) cursor = cursor.mappings().all() for row in cursor: result_list.append(dict(row)) return result_list + def get_latency_results(self): + return self._get_table_results(self.latency) + def get_resource_results(self): - result_list = [] - with self.engine.connect() as conn: - stmt = select(self.resources) - cursor = conn.execute(stmt) - cursor = cursor.mappings().all() - for row in cursor: - result_list.append(dict(row)) - return result_list + return self._get_table_results(self.resources) def get_classification_results(self): + return self._get_table_results(self.classification_metrics) + + +class StoreStrategy(object): + """ + Interface for swapping out different implementations of the resource store, e.g., a database, a file, etc. + """ + @abstractmethod + def setup(self, **kwargs): + """ + Setup the resource store, e.g., create a database connection. + """ pass + + @abstractmethod + def store(self, node_usage:Metric, **kwargs): + """ + Store the node usage in the resource store. + """ + pass + +#global store engine used as a singleton to safe +engine=None + +class MetricsStorageStrategy(StoreStrategy): + + def __init__(self): + self.engine = None + + def setup(self, **kwargs): + if self.engine: + return + + #resue the global engine if it exists + # global engine + # if engine: + # self.engine = engine + + self.engine = self._create_engine(**kwargs) + # engine = self.engine + + def _get_connection_string(self, **kwargs): + # XXX: list order is implicitly a priority + connection_string_actions_registry = [ + ("env", os.environ.get("METRICS_STORAGE_HOST", None)), + ("args",kwargs.get("connection_string",None)) + ] + for method, value in connection_string_actions_registry: + if value: + logging.debug(f"Tracker Connection String retrieved from: {method} using {value}") + return self.shape_connection_string(value) + logging.warn("No Method was succsessful. 
Setting Tracker URL to current Host.") + return MetricsStorageConfig.connection_string + + def _create_engine(self, **kwargs): + connection_string = self._get_connection_string(**kwargs) + try: + engine = create_engine(connection_string, echo=False) + except psycopg2.Error: + raise ConnectionError("Could not create an Engine for the Postgres DB.") + return engine + + def store(self, data:Metric, **kwargs): + try: + metadata = MetaData(bind=self.engine) + node_usage = Table(kwargs.get("table_name","metrics"), metadata, autoload_with=self.engine) + with self.engine.connect() as conn: + stmt = insert(node_usage).values(data.to_dict()) + conn.execute(stmt) + except Exception as e: + logging.warn(f"Could not store the data in the Metrics DB {data} - {e}") + +class LoggingStoreStrategy(StoreStrategy): + + def __init__(self): + self.log = [] + + def setup(self, **kwargs): + pass + + def store(self, data): + logging.info("Storing data: {}".format(data.to_dict())) + self.log.append(data) diff --git a/ml_benchmark/resource_tracker.py b/ml_benchmark/resource_tracker.py index 22bc0f4..f4f3fc9 100644 --- a/ml_benchmark/resource_tracker.py +++ b/ml_benchmark/resource_tracker.py @@ -1,15 +1,11 @@ -from abc import abstractmethod import datetime +import logging +from threading import Timer from prometheus_api_client import PrometheusConnect -from ml_benchmark.config import MetricsStorageConfig -import psycopg2 -from sqlalchemy import MetaData, Table, create_engine, insert -from threading import Timer - -from ml_benchmark.metrics import Metric, NodeUsage -import logging +from ml_benchmark.metrics import NodeUsage +from ml_benchmark.metrics_storage import MetricsStorageStrategy class RepeatTimer(Timer): @@ -19,67 +15,12 @@ def run(self): self.function(*self.args, **self.kwargs) -class ResourceStore(object): - """ - Interface for swapping out different implementations of the resource store, e.g., a database, a file, etc. - """ - @abstractmethod - def setup(self, **kwargs): - """ - Setup the resource store, e.g., create a database connection. - """ - pass - - @abstractmethod - def store(self, node_usage:Metric, **kwargs): - """ - Store the node usage in the resource store. - """ - pass - -class MetricsResouceStore(ResourceStore): - - def __init__(self): - self.engine = None - - def setup(self, **kwargs): - self.engine = self._create_engine(kwargs.get("connection_string",MetricsStorageConfig.connection_string)) - - def _create_engine(self, connection_string): - try: - engine = create_engine(connection_string, echo=False) - except psycopg2.Error: - raise ConnectionError("Could not create an Engine for the Postgres DB.") - return engine - - def store(self, data:Metric, **kwargs): - try: - metadata = MetaData(bind=self.engine) - node_usage = Table(kwargs.get("table_name","resources"), metadata, autoload_with=self.engine) - with self.engine.connect() as conn: - stmt = insert(node_usage).values(data.to_dict()) - conn.execute(stmt) - except Exception as e: - logging.warn(f"Could not store the data in the Metrics DB {data} - {e}") - -class LoggingResouceStore(ResourceStore): - - def __init__(self): - self.log = [] - - def setup(self, **kwargs): - pass - - def store(self, data): - logging.info("Storing data: {}".format(data.to_dict())) - self.log.append(data) - class ResourceTracker: # update every 2 seconds ... 
maybe make this tuneable UPDATE_INTERVAL = 2 - def __init__(self, prometheus_url, resouce_store=MetricsResouceStore ): + def __init__(self, prometheus_url, resouce_store=MetricsStorageStrategy ): if prometheus_url is None: raise ValueError("Prometheus URL is required.") self.prometheus_url = prometheus_url @@ -191,7 +132,7 @@ def track(self): #insert the data for n in data: - self.store.store(n) + self.store.store(n,table_name="resources") def _try_norm(self, instance: str): if instance in self.node_map: diff --git a/ml_benchmark/results_tracker.py b/ml_benchmark/results_tracker.py new file mode 100644 index 0000000..4b83b25 --- /dev/null +++ b/ml_benchmark/results_tracker.py @@ -0,0 +1,18 @@ +import logging +from ml_benchmark.latency_tracker import Tracker #TODO: move to utils +from ml_benchmark.metrics import Result +from ml_benchmark.metrics_storage import MetricsStorageStrategy + +class ResultTracker(Tracker): + def __init__(self,resouce_store=MetricsStorageStrategy): + self.store = resouce_store() + self.store.setup() + + def track(self, objective, value, measure): + r = Result(objective=objective) + r.value = value + r.measure = measure + try: + self.store.store(r,table_name="classification_metrics") + except Exception as e: + logging.warn(f"failed to store result {e}") diff --git a/ml_benchmark/workload/mnist/mlp_objective.py b/ml_benchmark/workload/mnist/mlp_objective.py index 0169b27..1217851 100644 --- a/ml_benchmark/workload/mnist/mlp_objective.py +++ b/ml_benchmark/workload/mnist/mlp_objective.py @@ -1,7 +1,7 @@ import torch import tqdm from ml_benchmark.config import MLPHyperparameter -from ml_benchmark.latency_tracker import latency_decorator +from ml_benchmark.decorators import latency_decorator, validation_latency_decorator from ml_benchmark.workload.mnist.mlp import MLP from ml_benchmark.workload.objective import Objective from sklearn.metrics import classification_report @@ -50,7 +50,7 @@ def train(self): epoch_losses.append(sum(batch_losses)/len(batch_losses)) return {"train_loss": epoch_losses} - @latency_decorator + @validation_latency_decorator def validate(self): self.model.eval() self.model = self.model.to(self.device) diff --git a/test/conftest.py b/test/conftest.py index c93523c..ea0df60 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,8 @@ import pytest -from ml_benchmark.latency_tracker import latency_decorator +import requests +import os + +from ml_benchmark.decorators import latency_decorator, validation_latency_decorator from ml_benchmark.workload.objective import Objective @@ -13,11 +16,23 @@ def __init__(self) -> None: def train(self): pass - @latency_decorator + @validation_latency_decorator def validate(self): - return 0.5 + return {"macro avg":{"f1-score":0.5}} @latency_decorator def test(self): return {"score": 0.5} return TestObjective + +@pytest.fixture +def prometeus_url(): + url = os.environ.get("PROMETHEUS_URL", "http://localhost:9090") + try: + resp = requests.get(url) + if resp.status_code != 200: + pytest.skip("Prometheus is availible") + except Exception: + pytest.skip("Could not connect to Prometheus.") + + return url \ No newline at end of file diff --git a/test/test_ml_benchmark/test_metrics.py b/test/test_ml_benchmark/test_metrics.py new file mode 100644 index 0000000..8fbeab2 --- /dev/null +++ b/test/test_ml_benchmark/test_metrics.py @@ -0,0 +1,36 @@ +import imp +import logging +from ml_benchmark.metrics_storage import MetricsStorage +from ml_benchmark.resource_tracker import ResourceTracker +from 
ml_benchmark.results_tracker import ResultTracker + +from ml_benchmark.workload.mnist.mnist_task import MnistTask +from time import sleep + +def test_metrics(prometeus_url): + task = MnistTask({"epochs": 1}) + objective = task.create_objective() + metrics_storage = MetricsStorage() + resourceTracker = ResourceTracker(prometheus_url=prometeus_url) + try: + metrics_storage.start_db() + sleep(2) + resourceTracker.start() + objective.set_hyperparameters({"learning_rate":1e-3}) + objective.train() + score = objective.validate() + objective.test() + + sleep(15) + + result = metrics_storage.get_benchmark_results() + logging.info(result) + + assert len(result["latency"]) > 0 + assert len(result["classification"]) > 0 + assert len(result["resources"]) > 0 + except Exception as e: + assert False, e + finally: + resourceTracker.stop() + metrics_storage.stop_db() diff --git a/test/test_ml_benchmark/test_resouce_tracker.py b/test/test_ml_benchmark/test_resouce_tracker.py index 23500e5..14a61f8 100644 --- a/test/test_ml_benchmark/test_resouce_tracker.py +++ b/test/test_ml_benchmark/test_resouce_tracker.py @@ -1,28 +1,13 @@ import logging -import os -import pytest -from ml_benchmark.resource_tracker import ResourceTracker, LoggingResouceStore -import requests - -@pytest.fixture -def prometeus_url(): - url = os.environ.get("PROMETHEUS_URL", "http://localhost:9090") - try: - resp = requests.get(url) - if resp.status_code != 200: - pytest.skip("Prometheus is availible") - except Exception: - pytest.skip("Could not connect to Prometheus.") - - return url - +from ml_benchmark.resource_tracker import ResourceTracker +from ml_benchmark.metrics_storage import LoggingStoreStrategy def test_resouce_tracker(prometeus_url): import time logging.basicConfig(level=logging.DEBUG) - rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingResouceStore) + rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingStoreStrategy) rt.start() time.sleep(ResourceTracker.UPDATE_INTERVAL * 15) rt.stop() @@ -32,7 +17,7 @@ def test_resouce_tracker(prometeus_url): def test_resouce_tracker_with_namespace(prometeus_url): import time logging.basicConfig(level=logging.DEBUG) - rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingResouceStore) + rt = ResourceTracker(prometheus_url=prometeus_url, resouce_store=LoggingStoreStrategy) rt.namespace = "optuna-study" rt.start() time.sleep(ResourceTracker.UPDATE_INTERVAL * 15) From 7988f7f040344e9bee92e3cca9ef44e76b86c134 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 24 Aug 2022 17:12:09 +0200 Subject: [PATCH 09/24] fixed some minor issues --- .../optuna_kubernetes_benchmark.py | 4 ++-- experiments/optuna_minikube/optuna_trial.py | 1 - ml_benchmark/benchmark_runner.py | 4 ++-- ml_benchmark/decorators.py | 3 ++- ml_benchmark/latency_tracker.py | 4 +--- ml_benchmark/metrics.py | 13 +++++++---- ml_benchmark/metrics_storage.py | 22 ++++++++++--------- ml_benchmark/results_tracker.py | 7 +++--- .../test_ml_benchmark/test_latency_tracker.py | 19 +++++++++++++++- 9 files changed, 50 insertions(+), 27 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index 7aa50ee..c399cac 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -175,8 +175,8 @@ def _watch_trials(self,timeout=120): def test(self): def optuna_trial(trial): - objective = 
MnistTask(config_init={"epochs": 1}).create_objective() - lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) + objective = MnistTask(config_init={"epochs": 5}).create_objective() + lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) objective.set_hyperparameters({"learning_rate": lr, "weight_decay": decay}) # these are the results, that can be used for the hyperparameter search diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index f0cfafd..92cac23 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -3,7 +3,6 @@ from time import sleep import optuna from ml_benchmark.workload.mnist.mnist_task import MnistTask -from ml_benchmark.results_tracker import ResultsTracker from utils import generate_search_space diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index 6635462..d695f67 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -11,10 +11,10 @@ import torch import logging -from ml_benchmark.latency_tracker import Latency, LatencyTracker +from ml_benchmark.latency_tracker import LatencyTracker from ml_benchmark.metrics_storage import MetricsStorage from ml_benchmark.resource_tracker import ResourceTracker - +from ml_benchmark.metrics import Latency class Benchmark(ABC): """ diff --git a/ml_benchmark/decorators.py b/ml_benchmark/decorators.py index 8b4b305..7444b12 100644 --- a/ml_benchmark/decorators.py +++ b/ml_benchmark/decorators.py @@ -24,8 +24,9 @@ def result_func(*args, **kwargs): latency_tracker.track(latency) #XXX this locks us into the f1-score, we probably want to track all callification metrics not just f1-score. 
MG please help :) tracker.track(func, result["macro avg"]["f1-score"], "f1-score") - func.__self__ = None + return result + return result_func def latency_decorator(func): diff --git a/ml_benchmark/latency_tracker.py b/ml_benchmark/latency_tracker.py index b23ce3d..0ad71b1 100644 --- a/ml_benchmark/latency_tracker.py +++ b/ml_benchmark/latency_tracker.py @@ -1,10 +1,8 @@ import os from abc import ABC, abstractmethod - import psycopg2 from sqlalchemy import MetaData, Table, create_engine, insert - from ml_benchmark.config import MetricsStorageConfig @@ -94,7 +92,7 @@ def shape_connection_string(self, host): storage = MetricsStorage() result = [] - + from ml_benchmark.decorators import latency_decorator class Test: metrics_storage_address = MetricsStorage.connection_string diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index 13a8452..52b18c2 100644 --- a/ml_benchmark/metrics.py +++ b/ml_benchmark/metrics.py @@ -59,14 +59,19 @@ def __init__(self, objective): super().__init__() # add fingerprinting data to self - fp = _fingerprint(self,objective) - self.__dict__.update(fp) + self.fp = _fingerprint(self,objective) self.timestamp = datetime.now().ctime() self.value = None self.measure = None def to_dict(self): - return self.__dict__ + return dict( + metric_id=self.metric_id, + timestamp=self.timestamp, + value=self.value, + measure=self.measure, + **self.fp + ) @@ -79,7 +84,7 @@ def _fingerprint(metric,func): hostname = f'BARE_{socket.gethostname()}' metric.add_to_id(f"id_{uuid4()}__pid_{process_id}__hostname_{hostname}") - + obj_hash = 0 try: obj_hash = hash(func.__self__) except AttributeError as e: diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 988d01d..0eefe28 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -3,20 +3,13 @@ import time import docker from docker.errors import APIError -from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select, Integer, insert +from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, select, Integer, insert, BigInteger import psycopg2 import os from ml_benchmark.config import MetricsStorageConfig from ml_benchmark.metrics import Metric -# see metrics._fingerprint for more information -fingerprint_columns = [ - Column("process_id", Integer), - Column("hostname", String), - Column("obj_hash", Integer), -] - class MetricsStorage: port = MetricsStorageConfig.port @@ -126,7 +119,9 @@ def create_classification_metrics_table(self): Column("timestamp", String, primary_key=True), Column("value", Float), Column("measure", String), - *fingerprint_columns + Column("process_id", Integer, nullable=True), + Column("hostname", String), + Column("obj_hash", BigInteger, nullable=True), ) def get_benchmark_results(self): @@ -205,7 +200,14 @@ def _get_connection_string(self, **kwargs): return self.shape_connection_string(value) logging.warn("No Method was succsessful. 
Setting Tracker URL to current Host.") return MetricsStorageConfig.connection_string - + + def shape_connection_string(self, host): + user = MetricsStorageConfig.user + password = MetricsStorageConfig.password + port = MetricsStorageConfig.port + db = MetricsStorageConfig.db + return f"postgresql://{user}:{password}@{host}:{port}/{db}" + def _create_engine(self, **kwargs): connection_string = self._get_connection_string(**kwargs) try: diff --git a/ml_benchmark/results_tracker.py b/ml_benchmark/results_tracker.py index 4b83b25..d8f5108 100644 --- a/ml_benchmark/results_tracker.py +++ b/ml_benchmark/results_tracker.py @@ -4,8 +4,8 @@ from ml_benchmark.metrics_storage import MetricsStorageStrategy class ResultTracker(Tracker): - def __init__(self,resouce_store=MetricsStorageStrategy): - self.store = resouce_store() + def __init__(self,store=MetricsStorageStrategy): + self.store = store() self.store.setup() def track(self, objective, value, measure): @@ -14,5 +14,6 @@ def track(self, objective, value, measure): r.measure = measure try: self.store.store(r,table_name="classification_metrics") + logging.info("Stored result") except Exception as e: - logging.warn(f"failed to store result {e}") + logging.error(f"failed to store result {e}") diff --git a/test/test_ml_benchmark/test_latency_tracker.py b/test/test_ml_benchmark/test_latency_tracker.py index 15292cb..9dbe55b 100644 --- a/test/test_ml_benchmark/test_latency_tracker.py +++ b/test/test_ml_benchmark/test_latency_tracker.py @@ -1,7 +1,7 @@ from ml_benchmark.metrics_storage import MetricsStorage import docker import json - +import os def test_latency_decorator(objective): objective = objective() @@ -18,3 +18,20 @@ def test_latency_decorator(objective): metrics_storage.stop_db() assert isinstance(json.dumps(result), str) + +def test_latency_decorator_using_env(objective): + objective = objective() + metrics_storage = MetricsStorage() + + try: + metrics_storage.start_db() + os.environ["METRICS_STORAGE_HOST"] = MetricsStorage.connection_string + objective.train() + objective.validate() + objective.test() + result = metrics_storage.get_benchmark_results() + metrics_storage.stop_db() + except docker.errors.APIError: + metrics_storage.stop_db() + + assert isinstance(json.dumps(result), str)ð \ No newline at end of file From 56d5eba4e8d4c996186198fadc80625a89f01dba Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Thu, 25 Aug 2022 12:56:11 +0200 Subject: [PATCH 10/24] added trail modifcations --- .../optuna_kubernetes/optuna_kubernetes_benchmark.py | 6 +++++- experiments/optuna_kubernetes/resource_definition.yml | 6 ++++-- experiments/optuna_minikube/ops/manifests/trial/job.yml | 4 ++++ experiments/optuna_minikube/optuna_trial.py | 7 ++++--- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index c399cac..dec4577 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -37,6 +37,8 @@ def __init__(self, resources: dict, runner=None) -> None: self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") self.runner = runner + self.trails = resources.get("trails", 6) + self.epochs = resources.get("epochs", 5) self.hyperparameter = resources.get("hyperparameter") def deploy(self) -> None: @@ -107,6 +109,8 @@ def run(self): "worker_image": self.trial_tag, "study_name": self.study_name, 
"metrics_ip": self.metrics_ip, + "trails": self.trails, + "epochs": self.epochs, } job_yml_objects = YamlTemplateFiller.load_and_fill_yaml_template( path.join(path.dirname(__file__), "ops/manifests/trial/job.yml"), job_definition) @@ -175,7 +179,7 @@ def _watch_trials(self,timeout=120): def test(self): def optuna_trial(trial): - objective = MnistTask(config_init={"epochs": 5}).create_objective() + objective = MnistTask(config_init={"epochs": self.epochs}).create_objective() lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) objective.set_hyperparameters({"learning_rate": lr, "weight_decay": decay}) diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml index 19e4f07..ff91c1b 100644 --- a/experiments/optuna_kubernetes/resource_definition.yml +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -1,7 +1,9 @@ workerCpu: 1 -workerMemory: 1 -workerCount: 1 +workerMemory: 1.5 +workerCount: 4 +trails: 6 +epochs: 5 metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n") dockerImageTag: tawalaya/optuna-trial:latest diff --git a/experiments/optuna_minikube/ops/manifests/trial/job.yml b/experiments/optuna_minikube/ops/manifests/trial/job.yml index a3e111e..7b0ff8f 100644 --- a/experiments/optuna_minikube/ops/manifests/trial/job.yml +++ b/experiments/optuna_minikube/ops/manifests/trial/job.yml @@ -23,6 +23,10 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" + - name: "N_TRAILS": + value: "$n_trials" + - name: "EPOCHs": + value: "$epochs" # injects the kuberntes node name into eacah pod - name: NODE_NAME valueFrom: diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 92cac23..79e4b72 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -5,9 +5,9 @@ from ml_benchmark.workload.mnist.mnist_task import MnistTask from utils import generate_search_space - def optuna_trial(trial): - task = MnistTask(config_init={"epochs": 5}) + epochs = int(os.environ.get("EPOCHS",5)) + task = MnistTask(config_init={"epochs": epochs}) objective = task.create_objective() # optuna doesnt care, these lines of code just get hyperparameters from the search space in grid search lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) @@ -24,12 +24,13 @@ def optuna_trial(trial): try: study_name = os.environ.get("STUDY_NAME") database_conn = os.environ.get("DB_CONN") + n_trails = int(os.environ.get("N_TRAILS",6)) search_space = generate_search_space(os.path.join(os.path.dirname(__file__),"hyperparameter_space.yml")) print(search_space) study = optuna.create_study( study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, sampler=optuna.samplers.GridSampler(search_space)) - study.optimize(optuna_trial, n_trials=6) + study.optimize(optuna_trial, n_trials=n_trails,n_jobs=-1) ##TODO:XXX We need to make this a configurable parameter!!! 
# TODO: add small wait to avoid missing metrics sleep(5) sys.exit(0) From 95cfb7c993058587686b982fc35684546c67a37b Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Thu, 25 Aug 2022 16:17:26 +0200 Subject: [PATCH 11/24] fixed eproch/trails insertion --- .../ops/manifests/trial/job.yml | 4 +++ experiments/optuna_minikube/optuna_trial.py | 13 ++++++--- experiments/optuna_minikube/test_trail.py | 28 +++++++++++++++++++ ml_benchmark/metrics_storage.py | 2 +- .../test_ml_benchmark/test_latency_tracker.py | 4 +-- 5 files changed, 44 insertions(+), 7 deletions(-) create mode 100644 experiments/optuna_minikube/test_trail.py diff --git a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml index a915864..ef9d78c 100644 --- a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml +++ b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml @@ -24,6 +24,10 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" + - name: "N_TRAILS": + value: "$n_trials" + - name: "EPOCHS": + value: "$epochs" # injects the kuberntes node name into eacah pod - name: NODE_NAME valueFrom: diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 79e4b72..0681531 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -19,8 +19,7 @@ def optuna_trial(trial): validation_scores = objective.validate() return validation_scores["macro avg"]["f1-score"] - -if __name__ == "__main__": +def main(): try: study_name = os.environ.get("STUDY_NAME") database_conn = os.environ.get("DB_CONN") @@ -33,7 +32,13 @@ def optuna_trial(trial): study.optimize(optuna_trial, n_trials=n_trails,n_jobs=-1) ##TODO:XXX We need to make this a configurable parameter!!! # TODO: add small wait to avoid missing metrics sleep(5) - sys.exit(0) + return True except Exception as e: print(e) - sys.exit(1) + return False + +if __name__ == "__main__": + if main(): + sys.exit(0) + else: + sys.exit(1) \ No newline at end of file diff --git a/experiments/optuna_minikube/test_trail.py b/experiments/optuna_minikube/test_trail.py new file mode 100644 index 0000000..af966c2 --- /dev/null +++ b/experiments/optuna_minikube/test_trail.py @@ -0,0 +1,28 @@ + +from distutils import file_util +import os +from time import sleep +from experiments.optuna_minikube.optuna_trial import main +from ml_benchmark.metrics_storage import MetricsStorage + + +def test_trail(): + metrics_storage = MetricsStorage() + try: + metrics_storage.start_db() + sleep(5) + os.environ["METRICS_STORAGE_HOST"] = MetricsStorage.host + os.environ["DB_CONN"] = MetricsStorage.connection_string + os.environ["N_TRAILS"] = "10" + os.environ["EPOCHS"] = "2" + + f = main() + assert f + + lats = metrics_storage.get_latency_results() + assert len(lats) >= int(os.environ["N_TRAILS"])*2 #(validate+train) + finally: + metrics_storage.stop_db() + +#TODO: do the same for the container .... 
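# Minimal standalone sketch (not part of the patch, assumptions only) of the grid search
# that optuna_trial.main() above configures: GridSampler walks the cartesian product of
# the search space, so the trial budget should cover the number of grid points.
import optuna

search_space = {"learning_rate": [1e-3, 1e-2, 1e-1], "weight_decay": [1e-6, 1e-5]}

def dummy_trial(trial):
    lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    return lr - decay  # stand-in for the real validation f1-score

study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.GridSampler(search_space))
study.optimize(dummy_trial, n_trials=6)  # 3 x 2 grid points
print(study.best_params)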
+# def test_trail_container(): \ No newline at end of file diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 0eefe28..8c9cb6a 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -234,6 +234,6 @@ def __init__(self): def setup(self, **kwargs): pass - def store(self, data): + def store(self, data,**kwargs): logging.info("Storing data: {}".format(data.to_dict())) self.log.append(data) diff --git a/test/test_ml_benchmark/test_latency_tracker.py b/test/test_ml_benchmark/test_latency_tracker.py index 9dbe55b..58f8855 100644 --- a/test/test_ml_benchmark/test_latency_tracker.py +++ b/test/test_ml_benchmark/test_latency_tracker.py @@ -25,7 +25,7 @@ def test_latency_decorator_using_env(objective): try: metrics_storage.start_db() - os.environ["METRICS_STORAGE_HOST"] = MetricsStorage.connection_string + os.environ["METRICS_STORAGE_HOST"] = MetricsStorage.host objective.train() objective.validate() objective.test() @@ -34,4 +34,4 @@ def test_latency_decorator_using_env(objective): except docker.errors.APIError: metrics_storage.stop_db() - assert isinstance(json.dumps(result), str)ð \ No newline at end of file + assert isinstance(json.dumps(result), str) \ No newline at end of file From 5aaa0bba6f95e8282576f6fd3cb1f100d9553a0f Mon Sep 17 00:00:00 2001 From: Sebastian Werner Date: Mon, 29 Aug 2022 08:47:52 +0200 Subject: [PATCH 12/24] fixed minor experiment issues --- .dockerignore | 1 + .../ops/manifests/trial/job.yml | 6 +-- .../optuna_kubernetes_benchmark.py | 4 +- .../optuna_kubernetes_rcpu.py | 44 +++++++++++++++++++ .../optuna_kubernetes_rnode.py | 34 +++++++------- .../optuna_kubernetes/resource_definition.yml | 6 +-- .../ops/manifests/trial/job.yml | 4 -- experiments/optuna_minikube/optuna_trial.py | 2 +- 8 files changed, 71 insertions(+), 30 deletions(-) create mode 100644 .dockerignore create mode 100644 experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..b694934 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.venv \ No newline at end of file diff --git a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml index ef9d78c..410ea4d 100644 --- a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml +++ b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml @@ -24,9 +24,9 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" - - name: "N_TRAILS": - value: "$n_trials" - - name: "EPOCHS": + - name: "N_TRAILS" + value: "$trails" + - name: "EPOCHS" value: "$epochs" # injects the kuberntes node name into eacah pod - name: NODE_NAME diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index dec4577..11074e7 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -129,7 +129,7 @@ def run(self): else: raise e try: - for t in range(1,5): + for t in range(1,14): self._watch_trials(timeout=120*t) except Exception as e: #TODO deal with mitigatable errors @@ -202,7 +202,7 @@ def undeploy(self): if self.delete_after_run: client.CoreV1Api().delete_namespace(self.namespace) self._watch_namespace() - self.image_builder.cleanup(self.trial_tag) + # self.image_builder.cleanup(self.trial_tag) def _watch_namespace(self): try: diff --git 
a/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py new file mode 100644 index 0000000..471485e --- /dev/null +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py @@ -0,0 +1,44 @@ +import logging +from os import path +from time import sleep +from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark +from ml_benchmark.benchmark_runner import BenchmarkRunner +from urllib.request import urlopen +from ml_benchmark.utils.yml_parser import YMLParser + +if __name__ == "__main__": + metricsIP = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() + + # read in base configuration + resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml")) + # TODO: XXX remove this hardcoded values + to_automate = { + "metricsIP": metricsIP, + "dockerImageTag": "tawalaya/optuna-trial:latest", + "dockerImageBuilder": "docker", + #force random namespaces to reduce conflicts + # "kubernetesNamespace": "optuna-study", + "kubernetesContext": "admin@smile", + "kubernetesMasterIP": "130.149.158.143", + "prometheus_url": "http://130.149.158.143:30041", + "deleteAfterRun":True, + "epochs":25, + } + resources.update(to_automate) + + repetions = 2 + for trails in [6,12,18]: + for cpu in range(2,8): + for i in range(1,repetions+1): + sleep(3) + logging.info(f"Starting Run {i} with 3x{cpu} vCPUs with n_trails {trails}") + try: + resources["trails"] = trails + resources["workerCpu"] = (cpu/2.0) + resources["goal"] = f"rcpu{cpu}-{trails}-{i}" + runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) + runner.run() + sleep(7) + runner = None + except Exception as e: + logging.warning(f'Failed Run {i} with 3x{cpu} vCPUs with n_trails {trails} - {e}') diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 781562a..dac8e2c 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -22,23 +22,23 @@ "kubernetesMasterIP": "130.149.158.143", "prometheus_url": "http://130.149.158.143:30041", "deleteAfterRun":True, + "epochs":25, } resources.update(to_automate) - repetions = 3 - - for n in range(1,10): - for i in range(1,repetions): - sleep(3) - logging.info(f"Starting Run {i} with {n} nodes") - try: - resources["workerCount"] = n - resources["goal"] = f"rnode{n}-{i}" - runner = BenchmarkRunner( - benchmark_cls=OptunaKubernetesBenchmark, resources=resources) - runner.run() - sleep(7) - runner = None - except Exception as e: - logging.warn(f'Failed Run {i} with {n} nodes - {e}') - \ No newline at end of file + repetions = 2 + for t in [6,12,18]: + for n in range(1,11): + for i in range(1,repetions+1): + sleep(3) + logging.info(f"Starting Run {i} with {n} nodes with n_trails {t}") + try: + resources["trails"] = t + resources["workerCount"] = n + resources["goal"] = f"rnode{n}-{t}-{i}" + runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) + runner.run() + sleep(7) + runner = None + except Exception as e: + logging.warning(f'Failed Run {i} with {n} nodes and n_trails {t} - {e}') diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml index ff91c1b..064126a 100644 --- a/experiments/optuna_kubernetes/resource_definition.yml +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -1,7 +1,7 @@ 
-workerCpu: 1 -workerMemory: 1.5 -workerCount: 4 +workerCpu: 2 +workerMemory: 2 +workerCount: 1 trails: 6 epochs: 5 metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), diff --git a/experiments/optuna_minikube/ops/manifests/trial/job.yml b/experiments/optuna_minikube/ops/manifests/trial/job.yml index 7b0ff8f..a3e111e 100644 --- a/experiments/optuna_minikube/ops/manifests/trial/job.yml +++ b/experiments/optuna_minikube/ops/manifests/trial/job.yml @@ -23,10 +23,6 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" - - name: "N_TRAILS": - value: "$n_trials" - - name: "EPOCHs": - value: "$epochs" # injects the kuberntes node name into eacah pod - name: NODE_NAME valueFrom: diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 0681531..4bec663 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -29,7 +29,7 @@ def main(): study = optuna.create_study( study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, sampler=optuna.samplers.GridSampler(search_space)) - study.optimize(optuna_trial, n_trials=n_trails,n_jobs=-1) ##TODO:XXX We need to make this a configurable parameter!!! + study.optimize(optuna_trial, n_trials=n_trails) ##TODO:XXX We need to make this a configurable parameter!!! # TODO: add small wait to avoid missing metrics sleep(5) return True From 6e3ea588477bdc87878c82b90059b0e5970cf998 Mon Sep 17 00:00:00 2001 From: Michael Gebauer | TU Date: Mon, 29 Aug 2022 15:37:07 +0200 Subject: [PATCH 13/24] added trial per worker calc --- .../optuna_kubernetes_benchmark.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index 11074e7..991fb0e 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -1,6 +1,7 @@ import random from os import path from time import sleep +from math import ceil import optuna from kubernetes import client, config, watch @@ -37,10 +38,18 @@ def __init__(self, resources: dict, runner=None) -> None: self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") self.runner = runner - self.trails = resources.get("trails", 6) + self.trails = self._calculate_trial_number(resources.get("trials", 6)) self.epochs = resources.get("epochs", 5) self.hyperparameter = resources.get("hyperparameter") + def _calculate_trial_number(self, n_trials): + new_n_trials = None + if n_trials < self.workerCount: + new_n_trials = self.workerCount + else: + new_n_trials = ceil(n_trials/self.workerCount) + return new_n_trials + def deploy(self) -> None: """ Deploy DB @@ -173,8 +182,8 @@ def _watch_trials(self,timeout=120): raise Exception("Job not created...") raise e return False - - + + def test(self): @@ -233,7 +242,7 @@ def _watch_db(self): from urllib.request import urlopen from ml_benchmark.utils.yml_parser import YMLParser resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml")) - + # TODO: XXX remove this hardcoded values to_automate = { "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), @@ -250,4 +259,4 @@ def _watch_db(self): runner = BenchmarkRunner( 
benchmark_cls=OptunaKubernetesBenchmark, resources=resources) runner.run() - + From cd735a8b66ed3bb1efef5eb07fb1a0a0c4afb93e Mon Sep 17 00:00:00 2001 From: Sebastian Werner Date: Mon, 29 Aug 2022 16:19:14 +0200 Subject: [PATCH 14/24] removed runner reference --- .../optuna_kubernetes/optuna_kubernetes_benchmark.py | 7 +------ experiments/optuna_kubernetes/optuna_kubernetes_rnode.py | 2 +- ml_benchmark/benchmark_runner.py | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index 991fb0e..f54d035 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -16,7 +16,7 @@ class OptunaKubernetesBenchmark(Benchmark): - def __init__(self, resources: dict, runner=None) -> None: + def __init__(self, resources: dict) -> None: """ Processes the given resources dictionary and creates class variables from it which are used in the benchmark. @@ -37,7 +37,6 @@ def __init__(self, resources: dict, runner=None) -> None: self.workerCount = resources.get("workerCount", 4) self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") - self.runner = runner self.trails = self._calculate_trial_number(resources.get("trials", 6)) self.epochs = resources.get("epochs", 5) self.hyperparameter = resources.get("hyperparameter") @@ -83,10 +82,6 @@ def deploy(self) -> None: self._watch_db() - # update the resoruce collector with the namespace used during the run - if self.runner and self.runner.resource_tracker: - self.runner.resource_tracker.namespace = self.namespace - @staticmethod def _is_create_conflict(e): if isinstance(e, ApiException): diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index dac8e2c..5272bf3 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -22,7 +22,7 @@ "kubernetesMasterIP": "130.149.158.143", "prometheus_url": "http://130.149.158.143:30041", "deleteAfterRun":True, - "epochs":25, + "epochs":5, } resources.update(to_automate) diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index d695f67..360e59e 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -122,7 +122,7 @@ def __init__( self.create_benchmark_folder(self.benchmark_folder) # add input and output size to the benchmark. 
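# Worked example (not part of the patch) for _calculate_trial_number introduced above:
# the requested trial budget is split across the worker jobs, and each job then runs
# that many trials against the shared study.
from math import ceil

n_trials, worker_count = 6, 4
per_worker = worker_count if n_trials < worker_count else ceil(n_trials / worker_count)
print(per_worker)      # 2 -> four jobs run up to 2 trials each (up to 8 in total)
print(ceil(18 / 4))    # 5 -> 18 requested trials spread over 4 workers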
- self.benchmark = benchmark_cls(resources, self) + self.benchmark = benchmark_cls(resources) # set seeds self._set_all_seeds() From 65e8ce1dd688ad5175d291f2ddb302563ef0d6b3 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Mon, 29 Aug 2022 17:09:21 +0200 Subject: [PATCH 15/24] fixed typo --- .../optuna_kubernetes/ops/manifests/trial/job.yml | 4 ++-- .../optuna_kubernetes/optuna_kubernetes_benchmark.py | 4 ++-- .../optuna_kubernetes/optuna_kubernetes_rcpu.py | 10 +++++----- .../optuna_kubernetes/optuna_kubernetes_rnode.py | 2 +- experiments/optuna_minikube/optuna_trial.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml index 410ea4d..4fa93fd 100644 --- a/experiments/optuna_kubernetes/ops/manifests/trial/job.yml +++ b/experiments/optuna_kubernetes/ops/manifests/trial/job.yml @@ -24,8 +24,8 @@ spec: value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" value: "$metrics_ip" - - name: "N_TRAILS" - value: "$trails" + - name: "N_TRIALS" + value: "$trials" - name: "EPOCHS" value: "$epochs" # injects the kuberntes node name into eacah pod diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index f54d035..eeb27bd 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -37,7 +37,7 @@ def __init__(self, resources: dict) -> None: self.workerCount = resources.get("workerCount", 4) self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") - self.trails = self._calculate_trial_number(resources.get("trials", 6)) + self.trials = self._calculate_trial_number(resources.get("trials", 6)) self.epochs = resources.get("epochs", 5) self.hyperparameter = resources.get("hyperparameter") @@ -113,7 +113,7 @@ def run(self): "worker_image": self.trial_tag, "study_name": self.study_name, "metrics_ip": self.metrics_ip, - "trails": self.trails, + "trials": self.trials, "epochs": self.epochs, } job_yml_objects = YamlTemplateFiller.load_and_fill_yaml_template( diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py index 471485e..3989021 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py @@ -27,18 +27,18 @@ resources.update(to_automate) repetions = 2 - for trails in [6,12,18]: + for trials in [6,12,18]: for cpu in range(2,8): for i in range(1,repetions+1): sleep(3) - logging.info(f"Starting Run {i} with 3x{cpu} vCPUs with n_trails {trails}") + logging.info(f"Starting Run {i} with 3x{cpu} vCPUs with n_trails {trials}") try: - resources["trails"] = trails + resources["trials"] = trials resources["workerCpu"] = (cpu/2.0) - resources["goal"] = f"rcpu{cpu}-{trails}-{i}" + resources["goal"] = f"rcpu{cpu}-{trials}-{i}" runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) runner.run() sleep(7) runner = None except Exception as e: - logging.warning(f'Failed Run {i} with 3x{cpu} vCPUs with n_trails {trails} - {e}') + logging.warning(f'Failed Run {i} with 3x{cpu} vCPUs with n_trails {trials} - {e}') diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 
5272bf3..703608a 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -33,7 +33,7 @@ sleep(3) logging.info(f"Starting Run {i} with {n} nodes with n_trails {t}") try: - resources["trails"] = t + resources["trials"] = t resources["workerCount"] = n resources["goal"] = f"rnode{n}-{t}-{i}" runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 4bec663..6b7baf0 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -23,13 +23,13 @@ def main(): try: study_name = os.environ.get("STUDY_NAME") database_conn = os.environ.get("DB_CONN") - n_trails = int(os.environ.get("N_TRAILS",6)) + n_trials = int(os.environ.get("N_TRIALS",6)) search_space = generate_search_space(os.path.join(os.path.dirname(__file__),"hyperparameter_space.yml")) print(search_space) study = optuna.create_study( study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, sampler=optuna.samplers.GridSampler(search_space)) - study.optimize(optuna_trial, n_trials=n_trails) ##TODO:XXX We need to make this a configurable parameter!!! + study.optimize(optuna_trial, n_trials=n_trials) ##TODO:XXX We need to make this a configurable parameter!!! # TODO: add small wait to avoid missing metrics sleep(5) return True From 1dcac212ff0e3a9d58cd229eb4f166179f12b409 Mon Sep 17 00:00:00 2001 From: Michael Gebauer | TU Date: Mon, 29 Aug 2022 17:12:23 +0200 Subject: [PATCH 16/24] added config to benchmark dict --- ml_benchmark/__init__.py | 4 ++-- ml_benchmark/benchmark_runner.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index 10ded25..813097c 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -3,7 +3,7 @@ "scikit-learn==0.24.2", "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.4.2", "psycopg2-binary", - "prometheus-api-client", - "ruamel.yaml"], + "prometheus-api-client==0.5.1", + "ruamel.yaml==0.17.21"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index 360e59e..27a117c 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -120,6 +120,7 @@ def __init__( self.bench_goal = resources.get("goal", "debug") self.benchmark_folder = os.path.join(benchmark_path, f"benchmark__{self.bench_name}") self.create_benchmark_folder(self.benchmark_folder) + self.resources = resources # add input and output size to the benchmark. 
self.benchmark = benchmark_cls(resources) @@ -147,13 +148,13 @@ def run(self): benchmark_results = None try: - self.metrics_storage.start_db() - + self.metrics_storage.start_db() + # Deploy the SUT with Latency(self.benchmark.deploy) as latency: self.benchmark.deploy() self.latency_tracker.track(latency) - + # RUN the benchmark run_process = [ self.benchmark.setup, self.benchmark.run, @@ -162,17 +163,17 @@ def run(self): if self.resource_tracker is not None: self.resource_tracker.start() - + for benchmark_fun in run_process: with Latency(benchmark_fun) as latency: benchmark_fun() self.latency_tracker.track(latency) - + # Get the results of the benchmark benchmark_results = self.metrics_storage.get_benchmark_results() # just to be save we wait a bit before killing shit. - + except (docker.errors.APIError, AttributeError, ValueError, RuntimeError) as e: print(e) raise ValueError("No Results obtained, Benchmark failed.") @@ -193,7 +194,7 @@ def run(self): self.metrics_storage.stop_db() except Exception: pass - + # TODO: move to finally block to ensure that results are always caputres if possible? # persist the results self.save_benchmark_results(benchmark_results) @@ -216,7 +217,7 @@ def save_benchmark_results(self, benchmark_results): benchmark_results (_type_): _description_ """ benchmark_config_dict = dict( - resources=self.benchmark.resources, + resources=self.resources, ) benchmark_result_dict = dict( benchmark_metrics=benchmark_results, From 060845c2dbc18ed2754a06bfecea534bec35805e Mon Sep 17 00:00:00 2001 From: Michael Gebauer | TU Date: Mon, 29 Aug 2022 17:36:21 +0200 Subject: [PATCH 17/24] added description, fixed timeout error for mini --- experiments/optuna_minikube/optuna_minikube_benchmark.py | 8 ++++---- experiments/optuna_minikube/optuna_trial.py | 4 ++-- ml_benchmark/__init__.py | 4 ++++ setup.py | 3 ++- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/experiments/optuna_minikube/optuna_minikube_benchmark.py b/experiments/optuna_minikube/optuna_minikube_benchmark.py index a18793c..33efbfb 100644 --- a/experiments/optuna_minikube/optuna_minikube_benchmark.py +++ b/experiments/optuna_minikube/optuna_minikube_benchmark.py @@ -42,15 +42,15 @@ def deploy(self) -> None: """ Deploy DB """ - + # TODO: deal with exsiting resources... #generate hyperparameter file from resouces def. 
- + if self.hyperparameter: f = path.join(path.dirname(__file__),"hyperparameter_space.yml") YamlTemplateFiller.as_yaml(f, self.hyperparameter) - + try: resp = client.CoreV1Api().create_namespace( client.V1Namespace(metadata=client.V1ObjectMeta(name=self.namespace))) @@ -142,7 +142,7 @@ def _watch_trials(self): """ w = watch.Watch() c = client.BatchV1Api() - for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=10): + for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=100): if "object" in e and e["object"].status.completion_time is not None: w.stop() return diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 6b7baf0..c735f81 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -10,7 +10,7 @@ def optuna_trial(trial): task = MnistTask(config_init={"epochs": epochs}) objective = task.create_objective() # optuna doesnt care, these lines of code just get hyperparameters from the search space in grid search - lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) + lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) # hidden_layer_config = trial.suggest_int("hidden_layer_config", 1, 4) objective.set_hyperparameters( @@ -41,4 +41,4 @@ def main(): if main(): sys.exit(0) else: - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index 813097c..9e6220b 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -1,3 +1,5 @@ +from pathlib import Path + __version__ = "develop" install_requires = [ "scikit-learn==0.24.2", @@ -7,3 +9,5 @@ "ruamel.yaml==0.17.21"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" +this_directory = Path(__file__).parent.parent +long_description = (this_directory / "README.md").read_text() diff --git a/setup.py b/setup.py index e8b67f0..37c485d 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,8 @@ def main(): python_requires=">=3.6", include_package_data=True, extras_require={"test": package.test_install_requires}, - long_description="/README.md" + long_description=package.long_description, + long_description_content_type="text/markdown" ) From f99b09529a6f6a846170da3d83e7f5f7ba23f5e4 Mon Sep 17 00:00:00 2001 From: Michael Gebauer | TU Date: Mon, 29 Aug 2022 18:36:33 +0200 Subject: [PATCH 18/24] fixed typos --- experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py | 2 +- experiments/optuna_kubernetes/resource_definition.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index eeb27bd..07805e7 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -167,7 +167,7 @@ def _watch_trials(self,timeout=120): w.stop() print("Trials completed! 
Collecting Results") return True - print("Watch_Trails timed out") + print("Watch_Trials timed out") try: job = client.BatchV1Api().read_namespaced_job(name="optuna-trial", namespace=self.namespace) if job.status.failed != None and job.status.failed > 0: diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml index 064126a..9ddd456 100644 --- a/experiments/optuna_kubernetes/resource_definition.yml +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -2,7 +2,7 @@ workerCpu: 2 workerMemory: 2 workerCount: 1 -trails: 6 +trials: 6 epochs: 5 metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n") @@ -14,7 +14,7 @@ hyperparameter: learning_rate: start: 1e-4 end: 1e-2 - step_size: 1e-5 + step_size: 1e-3 weight_decay: start: 1e-6 end: 1e-4 From 166137ecaca1c6ffe6441a6bdff0c611fc431f9c Mon Sep 17 00:00:00 2001 From: Sebastian Werner Date: Mon, 29 Aug 2022 18:40:03 +0200 Subject: [PATCH 19/24] migrated to optuna.study.MaxTrials --- .../optuna_kubernetes_benchmark.py | 2 +- .../optuna_kubernetes_rnode.py | 29 +++++++++---------- experiments/optuna_minikube/optuna_trial.py | 7 +++-- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py index eeb27bd..320654e 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py @@ -37,7 +37,7 @@ def __init__(self, resources: dict) -> None: self.workerCount = resources.get("workerCount", 4) self.delete_after_run = resources.get("deleteAfterRun", True) self.metrics_ip = resources.get("metricsIP") - self.trials = self._calculate_trial_number(resources.get("trials", 6)) + self.trials = resources.get("trials", 10) #self._calculate_trial_number(resources.get("trials", 6)) self.epochs = resources.get("epochs", 5) self.hyperparameter = resources.get("hyperparameter") diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 703608a..20b6b42 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -27,18 +27,17 @@ resources.update(to_automate) repetions = 2 - for t in [6,12,18]: - for n in range(1,11): - for i in range(1,repetions+1): - sleep(3) - logging.info(f"Starting Run {i} with {n} nodes with n_trails {t}") - try: - resources["trials"] = t - resources["workerCount"] = n - resources["goal"] = f"rnode{n}-{t}-{i}" - runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) - runner.run() - sleep(7) - runner = None - except Exception as e: - logging.warning(f'Failed Run {i} with {n} nodes and n_trails {t} - {e}') + for n in range(1,11): + for i in range(1,repetions+1): + sleep(3) + logging.info(f"Starting Run {i} with {n} nodes with n_trails 100") + try: + resources["trials"] = 100 + resources["workerCount"] = n + resources["goal"] = f"rnode{n}-100-{i}" + runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources) + runner.run() + sleep(7) + runner = None + except Exception as e: + logging.warning(f'Failed Run {i} with {n} nodes and n_trails 100 - {e}') diff --git a/experiments/optuna_minikube/optuna_trial.py 
b/experiments/optuna_minikube/optuna_trial.py index 6b7baf0..692bc36 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -4,7 +4,8 @@ import optuna from ml_benchmark.workload.mnist.mnist_task import MnistTask from utils import generate_search_space - +from optuna.study import MaxTrialsCallback +from optuna.trial import TrialState def optuna_trial(trial): epochs = int(os.environ.get("EPOCHS",5)) task = MnistTask(config_init={"epochs": epochs}) @@ -29,7 +30,9 @@ def main(): study = optuna.create_study( study_name=study_name, storage=database_conn, direction="maximize", load_if_exists=True, sampler=optuna.samplers.GridSampler(search_space)) - study.optimize(optuna_trial, n_trials=n_trials) ##TODO:XXX We need to make this a configurable parameter!!! + study.optimize(optuna_trial, + callbacks=[MaxTrialsCallback(n_trials, states=(TrialState.COMPLETE,))], + ) ##TODO:XXX We need to make this a configurable parameter!!! # TODO: add small wait to avoid missing metrics sleep(5) return True From 9154920efd26a73afd3a65caf35178f40dca230f Mon Sep 17 00:00:00 2001 From: Sebastian Werner Date: Tue, 30 Aug 2022 14:17:22 +0200 Subject: [PATCH 20/24] [WIP] bench run --- experiments/optuna_kubernetes/optuna_kubernetes_rnode.py | 6 +++--- experiments/optuna_kubernetes/resource_definition.yml | 4 ++-- experiments/optuna_minikube/hyperparameter_space.yml | 2 +- ml_benchmark/__init__.py | 7 ++++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 20b6b42..4989202 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -22,13 +22,13 @@ "kubernetesMasterIP": "130.149.158.143", "prometheus_url": "http://130.149.158.143:30041", "deleteAfterRun":True, - "epochs":5, + "epochs":100, } resources.update(to_automate) repetions = 2 - for n in range(1,11): - for i in range(1,repetions+1): + for i in range(1,repetions+1): + for n in range(1,7): sleep(3) logging.info(f"Starting Run {i} with {n} nodes with n_trails 100") try: diff --git a/experiments/optuna_kubernetes/resource_definition.yml b/experiments/optuna_kubernetes/resource_definition.yml index 9ddd456..542f7f8 100644 --- a/experiments/optuna_kubernetes/resource_definition.yml +++ b/experiments/optuna_kubernetes/resource_definition.yml @@ -1,6 +1,6 @@ -workerCpu: 2 -workerMemory: 2 +workerCpu: 3.25 +workerMemory: 6 workerCount: 1 trials: 6 epochs: 5 diff --git a/experiments/optuna_minikube/hyperparameter_space.yml b/experiments/optuna_minikube/hyperparameter_space.yml index 3a76fc2..1e72da6 100644 --- a/experiments/optuna_minikube/hyperparameter_space.yml +++ b/experiments/optuna_minikube/hyperparameter_space.yml @@ -2,7 +2,7 @@ learning_rate: end: 0.01 start: 0.0001 - step_size: 1.0e-05 + step_size: 0.001 weight_decay: end: 0.0001 start: 1.0e-06 diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index 9e6220b..c2bddfe 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -1,4 +1,4 @@ -from pathlib import Path +# from pathlib import Path __version__ = "develop" install_requires = [ @@ -9,5 +9,6 @@ "ruamel.yaml==0.17.21"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" -this_directory = Path(__file__).parent.parent -long_description = (this_directory / "README.md").read_text() +# this_directory = 
Path(__file__).parent.parent +# long_description = (this_directory / "README.md").read_text() +long_description = "FIXME" From 071d6aa3fa4c40a7dbeb6e671f364122f67eb0ea Mon Sep 17 00:00:00 2001 From: Michael Gebauer | TU Date: Tue, 30 Aug 2022 16:19:10 +0200 Subject: [PATCH 21/24] adjusted workload --- experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py | 2 +- experiments/optuna_kubernetes/optuna_kubernetes_rnode.py | 2 +- experiments/optuna_minikube/optuna_trial.py | 9 ++++++--- ml_benchmark/config.py | 2 +- ml_benchmark/workload/mnist/mlp.py | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py index 3989021..14518f2 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py @@ -22,7 +22,7 @@ "kubernetesMasterIP": "130.149.158.143", "prometheus_url": "http://130.149.158.143:30041", "deleteAfterRun":True, - "epochs":25, + "epochs": 50, } resources.update(to_automate) diff --git a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py index 4989202..f771655 100644 --- a/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py +++ b/experiments/optuna_kubernetes/optuna_kubernetes_rnode.py @@ -22,7 +22,7 @@ "kubernetesMasterIP": "130.149.158.143", "prometheus_url": "http://130.149.158.143:30041", "deleteAfterRun":True, - "epochs":100, + "epochs": 50, } resources.update(to_automate) diff --git a/experiments/optuna_minikube/optuna_trial.py b/experiments/optuna_minikube/optuna_trial.py index 76c506b..5453677 100644 --- a/experiments/optuna_minikube/optuna_trial.py +++ b/experiments/optuna_minikube/optuna_trial.py @@ -6,12 +6,15 @@ from utils import generate_search_space from optuna.study import MaxTrialsCallback from optuna.trial import TrialState + + + def optuna_trial(trial): - epochs = int(os.environ.get("EPOCHS",5)) + epochs = int(os.environ.get("EPOCHS", 5)) task = MnistTask(config_init={"epochs": epochs}) objective = task.create_objective() # optuna doesnt care, these lines of code just get hyperparameters from the search space in grid search - lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) + lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True) decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) # hidden_layer_config = trial.suggest_int("hidden_layer_config", 1, 4) objective.set_hyperparameters( @@ -24,7 +27,7 @@ def main(): try: study_name = os.environ.get("STUDY_NAME", "Test-Study") database_conn = os.environ.get("DB_CONN") - n_trials = int(os.environ.get("N_TRIALS",6)) + n_trials = int(os.environ.get("N_TRIALS", 2)) search_space = generate_search_space(os.path.join(os.path.dirname(__file__),"hyperparameter_space.yml")) print(search_space) study = optuna.create_study( diff --git a/ml_benchmark/config.py b/ml_benchmark/config.py index c45fc9d..6582d1f 100644 --- a/ml_benchmark/config.py +++ b/ml_benchmark/config.py @@ -27,7 +27,7 @@ def to_dict(self): class MLPHyperparameter: learning_rate: float = 1e-3 weight_decay: float = 1e-6 - hidden_layer_config: list = field(default_factory=lambda: [50, 20]) + hidden_layer_config: list = field(default_factory=lambda: [15]) def to_dict(self): return asdict(self) diff --git a/ml_benchmark/workload/mnist/mlp.py b/ml_benchmark/workload/mnist/mlp.py index cb703b0..4d1c972 100644 --- a/ml_benchmark/workload/mnist/mlp.py +++ 
b/ml_benchmark/workload/mnist/mlp.py @@ -42,7 +42,7 @@ def _construct_layer(self, input_size, hidden_layer_config, output_size): def forward(self, x): for layer in self.layers[:-1]: - x = self.relu(layer(x)) + x = layer(x) x = self.layers[-1](x) return x From d05ed8459ced0b96b77d55e6ed94ce4dd4bc2134 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 31 Aug 2022 11:46:45 +0200 Subject: [PATCH 22/24] v6 results --- .vscode/launch.json | 15 +++++++++++++++ .vscode/settings.json | 10 ++++++++++ 2 files changed, 25 insertions(+) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..b89c2e2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Module", + "type": "python", + "request": "launch", + "module": "enter-your-module-name", + "justMyCode": true + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a679442 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "python.testing.pytestArgs": [ + ".", + "-s" + ], + "python.globalModuleInstallation": true, + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.envFile": "${workspaceFolder}/.envs" +} \ No newline at end of file From de02c8456b0fa6eda35e335b3c7eac836a2ac346 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 31 Aug 2022 17:17:14 +0200 Subject: [PATCH 23/24] added classfication and hyperparams to result tracker --- experiments/optuna_minikube/test_trail.py | 6 +++--- ml_benchmark/decorators.py | 2 +- ml_benchmark/metrics.py | 6 ++++++ ml_benchmark/metrics_storage.py | 2 ++ ml_benchmark/results_tracker.py | 13 ++++++++---- ml_benchmark/workload/mnist/mlp_objective.py | 4 ++++ ml_benchmark/workload/objective.py | 21 +++++++++++++++++++- test/conftest.py | 6 ++++++ 8 files changed, 51 insertions(+), 9 deletions(-) diff --git a/experiments/optuna_minikube/test_trail.py b/experiments/optuna_minikube/test_trail.py index af966c2..41e9b07 100644 --- a/experiments/optuna_minikube/test_trail.py +++ b/experiments/optuna_minikube/test_trail.py @@ -13,14 +13,14 @@ def test_trail(): sleep(5) os.environ["METRICS_STORAGE_HOST"] = MetricsStorage.host os.environ["DB_CONN"] = MetricsStorage.connection_string - os.environ["N_TRAILS"] = "10" - os.environ["EPOCHS"] = "2" + os.environ["N_TRIALS"] = "10" + os.environ["EPOCHS"] = "1" f = main() assert f lats = metrics_storage.get_latency_results() - assert len(lats) >= int(os.environ["N_TRAILS"])*2 #(validate+train) + assert len(lats) >= int(os.environ["N_TRIALS"])*2 #(validate+train) finally: metrics_storage.stop_db() diff --git a/ml_benchmark/decorators.py b/ml_benchmark/decorators.py index 7444b12..a9f1e08 100644 --- a/ml_benchmark/decorators.py +++ b/ml_benchmark/decorators.py @@ -23,7 +23,7 @@ def result_func(*args, **kwargs): latency_tracker.track(latency) #XXX this locks us into the f1-score, we probably want to track all callification metrics not just f1-score. 
MG please help :) - tracker.track(func, result["macro avg"]["f1-score"], "f1-score") + tracker.track(func, result) func.__self__ = None return result diff --git a/ml_benchmark/metrics.py b/ml_benchmark/metrics.py index 52b18c2..6adea31 100644 --- a/ml_benchmark/metrics.py +++ b/ml_benchmark/metrics.py @@ -1,3 +1,4 @@ +import json import logging import os from datetime import datetime, timedelta @@ -64,11 +65,16 @@ def __init__(self, objective): self.value = None self.measure = None + self.hyperparameters = None + self.classification_metrics = None + def to_dict(self): return dict( metric_id=self.metric_id, timestamp=self.timestamp, value=self.value, + hyperparameters=json.dumps(self.hyperparameters, indent=None), + classification_metrics=json.dumps(self.classification_metrics,indent=None), measure=self.measure, **self.fp ) diff --git a/ml_benchmark/metrics_storage.py b/ml_benchmark/metrics_storage.py index 8c9cb6a..1e08056 100644 --- a/ml_benchmark/metrics_storage.py +++ b/ml_benchmark/metrics_storage.py @@ -119,6 +119,8 @@ def create_classification_metrics_table(self): Column("timestamp", String, primary_key=True), Column("value", Float), Column("measure", String), + Column("hyperparameters", String), + Column("classification_metrics", String), Column("process_id", Integer, nullable=True), Column("hostname", String), Column("obj_hash", BigInteger, nullable=True), diff --git a/ml_benchmark/results_tracker.py b/ml_benchmark/results_tracker.py index d8f5108..2d670b2 100644 --- a/ml_benchmark/results_tracker.py +++ b/ml_benchmark/results_tracker.py @@ -8,10 +8,15 @@ def __init__(self,store=MetricsStorageStrategy): self.store = store() self.store.setup() - def track(self, objective, value, measure): - r = Result(objective=objective) - r.value = value - r.measure = measure + def track(self, objective_function, result): + r = Result(objective=objective_function) + + r.value = result["macro avg"]["f1-score"] + r.measure = "f1-score" + + r.hyperparameters = objective_function.__self__.get_hyperparameters() + r.classification_metrics = result + try: self.store.store(r,table_name="classification_metrics") logging.info("Stored result") diff --git a/ml_benchmark/workload/mnist/mlp_objective.py b/ml_benchmark/workload/mnist/mlp_objective.py index 1217851..169ffc0 100644 --- a/ml_benchmark/workload/mnist/mlp_objective.py +++ b/ml_benchmark/workload/mnist/mlp_objective.py @@ -11,6 +11,7 @@ class MLPObjective(Objective): def __init__(self, epochs, train_loader, val_loader, test_loader, input_size, output_size) -> None: + super().__init__() self.train_loader = train_loader self.val_laoder = val_loader self.test_loader = test_loader @@ -26,6 +27,9 @@ def set_hyperparameters(self, hyperparameters: dict): print(self.hyperparameters) self.hyperparameters.update(hyperparameters) print(self.hyperparameters) + + def get_hyperparameters(self) -> dict: + return self.hyperparameters def set_device(self, device_str: str = None): if device_str: diff --git a/ml_benchmark/workload/objective.py b/ml_benchmark/workload/objective.py index 709d275..a441741 100644 --- a/ml_benchmark/workload/objective.py +++ b/ml_benchmark/workload/objective.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod -import os +from numpy import random +from datetime import datetime from ml_benchmark.config import MetricsStorageConfig @@ -10,6 +11,24 @@ class Objective(ABC): Interface for a training, validation and test procedure of a model. 
""" + def __init__(self) -> None: + self._unique_id = random.randint(0, 1000000) + self._created_at = datetime.now() + + @abstractmethod + def set_hyperparameters(self, hyperparameters: dict): + """ + Set the hyperparameters of the objective. + """ + pass + + @abstractmethod + def get_hyperparameters(self) -> dict: + """ + Get the hyperparameters of the objective. + """ + pass + @abstractmethod def train(self): pass diff --git a/test/conftest.py b/test/conftest.py index ea0df60..5aab816 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -16,6 +16,12 @@ def __init__(self) -> None: def train(self): pass + def get_hyperparameters(self) -> dict: + return {"test":True} + + def set_hyperparameters(self, hyperparameters: dict): + pass + @validation_latency_decorator def validate(self): return {"macro avg":{"f1-score":0.5}} From a9b5ee5f800ba1123ea8bd9388ee214e396b59b5 Mon Sep 17 00:00:00 2001 From: "basti.werner" Date: Wed, 31 Aug 2022 17:18:10 +0200 Subject: [PATCH 24/24] Revert "v6 results" This reverts commit d05ed8459ced0b96b77d55e6ed94ce4dd4bc2134. --- .vscode/launch.json | 15 --------------- .vscode/settings.json | 10 ---------- 2 files changed, 25 deletions(-) delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index b89c2e2..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Module", - "type": "python", - "request": "launch", - "module": "enter-your-module-name", - "justMyCode": true - } - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index a679442..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "python.testing.pytestArgs": [ - ".", - "-s" - ], - "python.globalModuleInstallation": true, - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.envFile": "${workspaceFolder}/.envs" -} \ No newline at end of file