-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/optuna kube bench #26
Changes from 18 commits
b7acfa0
707fc72
ca85868
445afde
70594d4
17454d9
bc1db2a
62dbbf2
666991d
7988f7f
56d5eba
95cfb7c
5aaa0bb
6e3ea58
cd735a8
65e8ce1
1dcac21
060845c
f99b095
166137e
713e476
9154920
ff7e436
071d6aa
d05ed84
c96907b
de02c84
a9b5ee5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.venv |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import random | ||
from os import path | ||
from time import sleep | ||
from math import ceil | ||
|
||
import optuna | ||
from kubernetes import client, config, watch | ||
|
@@ -36,12 +37,29 @@ def __init__(self, resources: dict) -> None: | |
self.workerCount = resources.get("workerCount", 4) | ||
self.delete_after_run = resources.get("deleteAfterRun", True) | ||
self.metrics_ip = resources.get("metricsIP") | ||
self.trials = self._calculate_trial_number(resources.get("trials", 6)) | ||
self.epochs = resources.get("epochs", 5) | ||
self.hyperparameter = resources.get("hyperparameter") | ||
|
||
def _calculate_trial_number(self, n_trials): | ||
new_n_trials = None | ||
if n_trials < self.workerCount: | ||
new_n_trials = self.workerCount | ||
else: | ||
new_n_trials = ceil(n_trials/self.workerCount) | ||
return new_n_trials | ||
|
||
def deploy(self) -> None: | ||
""" | ||
Deploy DB | ||
""" | ||
# TODO: deal with existing resources... | ||
|
||
if self.hyperparameter: | ||
#TODO: XXX we have to fix this dependency thing: either merge minikube/kubernetes or use the same base class or something... | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added as an issue in #22. |
||
f = path.join(path.dirname(__file__),"..","optuna_minikube","hyperparameter_space.yml") | ||
YamlTemplateFiller.as_yaml(f, self.hyperparameter) | ||
|
||
try: | ||
resp = client.CoreV1Api().create_namespace( | ||
client.V1Namespace(metadata=client.V1ObjectMeta(name=self.namespace))) | ||
|
@@ -95,6 +113,8 @@ def run(self): | |
"worker_image": self.trial_tag, | ||
"study_name": self.study_name, | ||
"metrics_ip": self.metrics_ip, | ||
"trials": self.trials, | ||
"epochs": self.epochs, | ||
} | ||
job_yml_objects = YamlTemplateFiller.load_and_fill_yaml_template( | ||
path.join(path.dirname(__file__), "ops/manifests/trial/job.yml"), job_definition) | ||
|
@@ -105,11 +125,19 @@ def run(self): | |
if self._is_create_conflict(e): | ||
# lets remove the old one and try again | ||
client.BatchV1Api().delete_namespaced_job(name="optuna-trial", namespace=self.namespace) | ||
#wait for that to complete | ||
sleep(5) | ||
# try again | ||
create_from_yaml( | ||
client.ApiClient(), yaml_objects=job_yml_objects, namespace=self.namespace, verbose=True) | ||
else: | ||
raise e | ||
self._watch_trials() | ||
try: | ||
for t in range(1,14): | ||
self._watch_trials(timeout=120*t) | ||
except Exception as e: | ||
#TODO deal with mitigatable errors | ||
raise e | ||
|
||
def _getDBURL(self): | ||
postgres_sepc = client.CoreV1Api().read_namespaced_service(namespace=self.namespace, name="postgres") | ||
|
@@ -127,23 +155,36 @@ def collect_run_results(self): | |
study = optuna.load_study(study_name=self.study_name, storage=self._getDBURL()) | ||
self.best_trial = study.best_trial | ||
|
||
def _watch_trials(self, timeout=120):
    """
    Checks if Trials (Kubernetes Jobs) are completed. If not the process waits on it.

    Job events of the benchmark namespace are streamed with ``timeout`` as
    the watch's ``timeout_seconds``. If the stream ends without a completed
    job, the job "optuna-trial" is read once to find out whether it failed.

    :param timeout: value passed as ``timeout_seconds`` to the job watch.
    :return: True if an event carried a job with a completion time, False if
        the watch ended first and the job reports no failures.
    :raises Exception: if the job reports failures ("Trials failed") or does
        not exist ("Job not created...").
    :raises client.exceptions.ApiException: for any other API error while
        reading the job.
    """
    w = watch.Watch()
    c = client.BatchV1Api()

    for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=timeout):
        if "object" in e and e["object"].status.completion_time is not None:
            w.stop()
            print("Trials completed! Collecting Results")
            return True
    print("Watch_Trails timed out")

    # The watch ended without a completion event: inspect the job directly.
    try:
        job = c.read_namespaced_job(name="optuna-trial", namespace=self.namespace)
    except client.exceptions.ApiException as e:
        if e.status == 404:
            # Keep the API error as the cause so the 404 details stay visible.
            raise Exception("Job not created...") from e
        raise
    if job.status.failed is not None and job.status.failed > 0:
        raise Exception("Trials failed")
    return False
|
||
|
||
|
||
def test(self): | ||
|
||
def optuna_trial(trial): | ||
objective = MnistTask(config_init={"epochs": 1}).create_objective() | ||
lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True) | ||
objective = MnistTask(config_init={"epochs": self.epochs}).create_objective() | ||
lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) | ||
decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True) | ||
objective.set_hyperparameters({"learning_rate": lr, "weight_decay": decay}) | ||
# these are the results, that can be used for the hyperparameter search | ||
|
@@ -165,10 +206,11 @@ def undeploy(self): | |
if self.delete_after_run: | ||
client.CoreV1Api().delete_namespace(self.namespace) | ||
self._watch_namespace() | ||
self.image_builder.cleanup(self.trial_tag) | ||
# self.image_builder.cleanup(self.trial_tag) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The image builder causes issues because images have to be forcefully deleted, at least in minikube. Added as an issue in #27. |
||
|
||
def _watch_namespace(self): | ||
try: | ||
#TODO: XXX fix me! | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A while loop is needed here. Added as an issue in #28. |
||
client.CoreV1Api().read_namespace_status(self.namespace).to_dict() | ||
sleep(2) | ||
except client.exceptions.ApiException: | ||
|
@@ -193,26 +235,23 @@ def _watch_db(self): | |
if __name__ == "__main__": | ||
from ml_benchmark.benchmark_runner import BenchmarkRunner | ||
from urllib.request import urlopen | ||
# The basic config for the workload. For testing purposes set epochs to one. | ||
# For benchmarking take the default value of 100 | ||
# your ressources the optimization should run on | ||
resource_definition = { | ||
"workerCpu": 2, | ||
"workerMemory": 2, | ||
"workerCount": 4, | ||
from ml_benchmark.utils.yml_parser import YMLParser | ||
resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml")) | ||
|
||
# TODO: XXX remove these hardcoded values | ||
to_automate = { | ||
"metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), | ||
"studyName": "optuna-study", | ||
"dockerImageTag": "tawalaya/optuna-trial:latest", | ||
"dockerImageBuilder": "docker", | ||
"kubernetesNamespace": "optuna-study", | ||
"kubernetesContext": "admin@smile", | ||
"kubernetesMasterIP": "130.149.158.143", | ||
"deleteAfterRun": False, | ||
"prometheus_url": "http://130.149.158.143:30041", | ||
"deleteAfterRun":False, | ||
} | ||
resources.update(to_automate) | ||
|
||
# TODO: hyperparams. | ||
|
||
# import and use the runner | ||
runner = BenchmarkRunner( | ||
benchmark_cls=OptunaKubernetesBenchmark, resource_definition=resource_definition) | ||
benchmark_cls=OptunaKubernetesBenchmark, resources=resources) | ||
runner.run() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import logging | ||
from os import path | ||
from time import sleep | ||
from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark | ||
from ml_benchmark.benchmark_runner import BenchmarkRunner | ||
from urllib.request import urlopen | ||
from ml_benchmark.utils.yml_parser import YMLParser | ||
|
||
if __name__ == "__main__":
    # Sweep over trial counts and worker CPU settings; every combination is
    # run `repetitions` times and each run gets its own "goal" label.
    public_ip = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()

    # read in base configuration
    config_file = path.join(path.dirname(__file__), "resource_definition.yml")
    resources = YMLParser.parse(config_file)

    # TODO: XXX remove these hardcoded values
    resources.update({
        "metricsIP": public_ip,
        "dockerImageTag": "tawalaya/optuna-trial:latest",
        "dockerImageBuilder": "docker",
        # force random namespaces to reduce conflicts
        # "kubernetesNamespace": "optuna-study",
        "kubernetesContext": "admin@smile",
        "kubernetesMasterIP": "130.149.158.143",
        "prometheus_url": "http://130.149.158.143:30041",
        "deleteAfterRun": True,
        "epochs": 25,
    })

    repetitions = 2
    for trials in (6, 12, 18):
        for cpu in range(2, 8):
            for i in range(1, repetitions + 1):
                # short pause before every run
                sleep(3)
                logging.info(f"Starting Run {i} with 3x{cpu} vCPUs with n_trails {trials}")
                try:
                    run_settings = {
                        "trials": trials,
                        # the loop counter is halved to get the CPU value
                        "workerCpu": (cpu / 2.0),
                        "goal": f"rcpu{cpu}-{trials}-{i}",
                    }
                    resources.update(run_settings)
                    runner = BenchmarkRunner(
                        benchmark_cls=OptunaKubernetesBenchmark, resources=resources)
                    runner.run()
                    sleep(7)
                    # drop the reference to the finished runner
                    runner = None
                except Exception as e:
                    # a failed run is logged and the sweep continues
                    logging.warning(f'Failed Run {i} with 3x{cpu} vCPUs with n_trails {trials} - {e}')
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import logging | ||
from os import path | ||
from time import sleep | ||
from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark | ||
from ml_benchmark.benchmark_runner import BenchmarkRunner | ||
from urllib.request import urlopen | ||
from ml_benchmark.utils.yml_parser import YMLParser | ||
|
||
if __name__ == "__main__":
    # Sweep over trial counts and worker counts; every combination is run
    # `repetitions` times and each run gets its own "goal" label.
    public_ip = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()

    # read in base configuration
    config_file = path.join(path.dirname(__file__), "resource_definition.yml")
    resources = YMLParser.parse(config_file)

    # TODO: XXX remove these hardcoded values
    resources.update({
        "metricsIP": public_ip,
        "dockerImageTag": "tawalaya/optuna-trial:latest",
        "dockerImageBuilder": "docker",
        # force random namespaces to reduce conflicts
        # "kubernetesNamespace": "optuna-study",
        "kubernetesContext": "admin@smile",
        "kubernetesMasterIP": "130.149.158.143",
        "prometheus_url": "http://130.149.158.143:30041",
        "deleteAfterRun": True,
        "epochs": 5,
    })

    repetitions = 2
    for t in (6, 12, 18):
        for n in range(1, 11):
            for i in range(1, repetitions + 1):
                # short pause before every run
                sleep(3)
                logging.info(f"Starting Run {i} with {n} nodes with n_trails {t}")
                try:
                    run_settings = {
                        "trials": t,
                        "workerCount": n,
                        "goal": f"rnode{n}-{t}-{i}",
                    }
                    resources.update(run_settings)
                    runner = BenchmarkRunner(
                        benchmark_cls=OptunaKubernetesBenchmark, resources=resources)
                    runner.run()
                    sleep(7)
                    # drop the reference to the finished runner
                    runner = None
                except Exception as e:
                    # a failed run is logged and the sweep continues
                    logging.warning(f'Failed Run {i} with {n} nodes and n_trails {t} - {e}')
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
|
||
# Base resource definition for the Optuna benchmark runs.
workerCpu: 2
workerMemory: 2
workerCount: 1
# The key must be spelled `trials`: the benchmark reads resources.get("trials", 6).
trials: 6
epochs: 5
metricsIP: auto ##urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
kubernetesMasterIP: minikube ##subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n")
dockerImageTag: tawalaya/optuna-trial:latest
dockerImageBuilder: docker
kubernetesContext: "minikube"
deleteAfterRun: True
# Search space per hyperparameter, given as start / end / step_size.
# NOTE(review): if this file is loaded with PyYAML (YAML 1.1), exponents
# written without a decimal point (e.g. 1e-4) are loaded as strings, not
# floats -- confirm the consumer converts them, or write 1.0e-4.
hyperparameter:
  learning_rate:
    start: 1e-4
    end: 1e-2
    step_size: 1e-5
  weight_decay:
    start: 1e-6
    end: 1e-4
    step_size: 1e-5
#  hidden_layer_config:
#    start: [10]
#    end: [100, 100, 100]
#    step_size: [10, 1]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
delete if unused