Merge pull request #26 from ISE-Sustainable-AI/feature/optuna-kube-bench

Feature/optuna kube bench

gebauerm authored Sep 2, 2022
2 parents 81a96fc + a9b5ee5 commit ce9735f
Showing 39 changed files with 1,090 additions and 171 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -0,0 +1 @@
.venv
12 changes: 9 additions & 3 deletions .gitignore
@@ -129,7 +129,13 @@ dmypy.json
# Pyre type checker
.pyre/
exp__*
experiments/simple_raytune/benchmark__RaytuneBenchmark
experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
**/benchmark__**

data/
data/

#idea
.idea/

#
.envs
test/test_ml_benchmark/hyperparameter_space.yml
41 changes: 2 additions & 39 deletions experiments/optuna_kubernetes/ops/manifests/db/db-deployment.yml
@@ -9,38 +9,6 @@ data:
POSTGRES_DB: postgresdb
POSTGRES_USER: postgresadmin
POSTGRES_PASSWORD: admin123
# ---
# kind: PersistentVolume
# apiVersion: v1
# metadata:
# name: postgres-pv-volume
# labels:
# type: local
# app: postgres
# spec:
# # storageClassName: manual
# capacity:
# storage: 1Gi
# accessModes:
# - ReadWriteMany
# hostPath:
# path: "/mnt/data"
# reclaimPolicy: Delete
# ---
# kind: PersistentVolumeClaim
# apiVersion: v1
# metadata:
# name: postgres-pv-claim
# labels:
# app: postgres
# spec:
# storageClassName: manual
# accessModes:
# - ReadWriteMany
# resources:
# requests:
# storage: 1Gi

---
apiVersion: apps/v1
kind: Deployment
@@ -56,6 +24,8 @@ spec:
labels:
app: postgres
spec:
nodeSelector:
scaphandre: "true"
containers:
- name: postgres
image: postgres:10.4
@@ -65,17 +35,10 @@ spec:
envFrom:
- configMapRef:
name: postgres-config
# volumeMounts:
# - mountPath: /var/lib/postgresql/data
# name: postgredb
resources:
limits:
cpu: 1.0
memory: 1G
# volumes:
# - name: postgredb
# persistentVolumeClaim:
# claimName: postgres-pv-claim
---
apiVersion: v1
kind: Service
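
For reference: the nodeSelector added above pins the Postgres pod (and, in the next file, the trial pods) to nodes labeled scaphandre=true, i.e. nodes running the Scaphandre power-measurement agent. The selector only matches nodes that already carry that label; a minimal sketch of applying it with the Kubernetes Python client (the node name is a placeholder):

from kubernetes import client, config

config.load_kube_config()  # uses the current kubectl context, e.g. admin@smile
# Label a node so that pods with nodeSelector scaphandre=true can be scheduled on it.
client.CoreV1Api().patch_node(
    "worker-node-1",  # hypothetical node name
    {"metadata": {"labels": {"scaphandre": "true"}}},
)
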
11 changes: 11 additions & 0 deletions experiments/optuna_kubernetes/ops/manifests/trial/job.yml
@@ -7,6 +7,8 @@ spec:
parallelism: $worker_num
template:
spec:
nodeSelector:
scaphandre: "true"
containers:
- name: optuna-trial
image: $worker_image
@@ -22,5 +24,14 @@ spec:
value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
- name: "METRICS_STORAGE_HOST"
value: "$metrics_ip"
- name: "N_TRIALS"
value: "$trials"
- name: "EPOCHS"
value: "$epochs"
# injects the Kubernetes node name into each pod
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
restartPolicy: OnFailure
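
The NODE_NAME entry uses the Kubernetes downward API (fieldRef: spec.nodeName) to expose each pod's scheduling node as a plain environment variable, alongside the templated N_TRIALS and EPOCHS values. A minimal sketch of how a trial entrypoint could consume them (the defaults and the DB variable name are assumptions):

import os

node_name = os.environ.get("NODE_NAME", "unknown")  # injected via the downward API
n_trials = int(os.environ.get("N_TRIALS", "10"))    # filled in from $trials
epochs = int(os.environ.get("EPOCHS", "5"))         # filled in from $epochs
db_url = os.environ.get("DB_CONN", "")              # variable name assumed; holds the postgres URL
print(f"running {n_trials} trials x {epochs} epochs on node {node_name}")
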

81 changes: 60 additions & 21 deletions experiments/optuna_kubernetes/optuna_kubernetes_benchmark.py
@@ -1,6 +1,7 @@
import random
from os import path
from time import sleep
from math import ceil

import optuna
from kubernetes import client, config, watch
@@ -36,12 +37,29 @@ def __init__(self, resources: dict) -> None:
self.workerCount = resources.get("workerCount", 4)
self.delete_after_run = resources.get("deleteAfterRun", True)
self.metrics_ip = resources.get("metricsIP")
self.trials = resources.get("trials", 10) #self._calculate_trial_number(resources.get("trials", 6))
self.epochs = resources.get("epochs", 5)
self.hyperparameter = resources.get("hyperparameter")

def _calculate_trial_number(self, n_trials):
new_n_trials = None
if n_trials < self.workerCount:
new_n_trials = self.workerCount
else:
new_n_trials = ceil(n_trials/self.workerCount)
return new_n_trials

def deploy(self) -> None:
"""
Deploy DB
"""
# TODO: deal with existing resources...

if self.hyperparameter:
# TODO: XXX we've got to fix this dependency; either merge minikube/kubernetes or use the same base class or something...
f = path.join(path.dirname(__file__),"..","optuna_minikube","hyperparameter_space.yml")
YamlTemplateFiller.as_yaml(f, self.hyperparameter)

try:
resp = client.CoreV1Api().create_namespace(
client.V1Namespace(metadata=client.V1ObjectMeta(name=self.namespace)))
@@ -95,6 +113,8 @@ def run(self):
"worker_image": self.trial_tag,
"study_name": self.study_name,
"metrics_ip": self.metrics_ip,
"trials": self.trials,
"epochs": self.epochs,
}
job_yml_objects = YamlTemplateFiller.load_and_fill_yaml_template(
path.join(path.dirname(__file__), "ops/manifests/trial/job.yml"), job_definition)
@@ -105,11 +125,19 @@ def run(self):
if self._is_create_conflict(e):
# lets remove the old one and try again
client.BatchV1Api().delete_namespaced_job(name="optuna-trial", namespace=self.namespace)
#wait for that to complete
sleep(5)
# try again
create_from_yaml(
client.ApiClient(), yaml_objects=job_yml_objects, namespace=self.namespace, verbose=True)
else:
raise e
self._watch_trials()
try:
for t in range(1,14):
# retry with a growing watch timeout; stop as soon as the job reports completion
if self._watch_trials(timeout=120*t):
break
except Exception as e:
# TODO: deal with mitigable errors
raise e

def _getDBURL(self):
postgres_sepc = client.CoreV1Api().read_namespaced_service(namespace=self.namespace, name="postgres")
@@ -127,23 +155,36 @@ def collect_run_results(self):
study = optuna.load_study(study_name=self.study_name, storage=self._getDBURL())
self.best_trial = study.best_trial

def _watch_trials(self):
def _watch_trials(self,timeout=120):
"""
Checks whether the trials (Kubernetes Jobs) have completed; if not, the process waits for them.
"""
w = watch.Watch()
c = client.BatchV1Api()
for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=10):

for e in w.stream(c.list_namespaced_job, namespace=self.namespace, timeout_seconds=timeout):
if "object" in e and e["object"].status.completion_time is not None:
w.stop()
return
print("Trials completed! Collecting Results")
print("Trials completed! Collecting Results")
return True
print("Watch_Trials timed out")
try:
job = client.BatchV1Api().read_namespaced_job(name="optuna-trial", namespace=self.namespace)
if job.status.failed is not None and job.status.failed > 0:
raise Exception("Trials failed")
except ApiException as e:
if e.status == 404:
raise Exception("Job not created...")
raise e
return False



def test(self):

def optuna_trial(trial):
objective = MnistTask(config_init={"epochs": 1}).create_objective()
lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
objective = MnistTask(config_init={"epochs": self.epochs}).create_objective()
lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
objective.set_hyperparameters({"learning_rate": lr, "weight_decay": decay})
# these are the results, that can be used for the hyperparameter search
@@ -165,10 +206,11 @@ def undeploy(self):
if self.delete_after_run:
client.CoreV1Api().delete_namespace(self.namespace)
self._watch_namespace()
self.image_builder.cleanup(self.trial_tag)
# self.image_builder.cleanup(self.trial_tag)

def _watch_namespace(self):
try:
#TODO: XXX fix me!
client.CoreV1Api().read_namespace_status(self.namespace).to_dict()
sleep(2)
except client.exceptions.ApiException:
@@ -193,26 +235,23 @@ def _watch_db(self):
if __name__ == "__main__":
from ml_benchmark.benchmark_runner import BenchmarkRunner
from urllib.request import urlopen
# The basic config for the workload. For testing purposes set epochs to one.
# For benchmarking take the default value of 100
# the resources the optimization should run on
resource_definition = {
"workerCpu": 2,
"workerMemory": 2,
"workerCount": 4,
from ml_benchmark.utils.yml_parser import YMLParser
resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml"))

# TODO: XXX remove these hardcoded values
to_automate = {
"metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
"studyName": "optuna-study",
"dockerImageTag": "tawalaya/optuna-trial:latest",
"dockerImageBuilder": "docker",
"kubernetesNamespace": "optuna-study",
"kubernetesContext": "admin@smile",
"kubernetesMasterIP": "130.149.158.143",
"deleteAfterRun": False,
"prometheus_url": "http://130.149.158.143:30041",
"deleteAfterRun":False,
}
resources.update(to_automate)

# TODO: hyperparams.

# import and use the runner
runner = BenchmarkRunner(
benchmark_cls=OptunaKubernetesBenchmark, resource_definition=resource_definition)
benchmark_cls=OptunaKubernetesBenchmark, resources=resources)
runner.run()
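
Two details of the run loop above, for reference. The retry loop escalates the watch timeout (120 s, 240 s, ... 1560 s), bounding the total wait at 120*(1+2+...+13) = 10,920 s, roughly three hours. And the (currently unused) _calculate_trial_number helper spreads the requested trial budget across workers; a small sketch of its arithmetic under the default workerCount of 4:

from math import ceil

worker_count = 4
for requested in (3, 10, 100):
    # below one trial per worker, round up to the worker count;
    # otherwise split the budget evenly and round up
    per_worker = worker_count if requested < worker_count else ceil(requested / worker_count)
    print(f"{requested} requested -> {per_worker} per worker")  # 3 -> 4, 10 -> 3, 100 -> 25
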

44 changes: 44 additions & 0 deletions experiments/optuna_kubernetes/optuna_kubernetes_rcpu.py
@@ -0,0 +1,44 @@
import logging
from os import path
from time import sleep
from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark
from ml_benchmark.benchmark_runner import BenchmarkRunner
from urllib.request import urlopen
from ml_benchmark.utils.yml_parser import YMLParser

if __name__ == "__main__":
metricsIP = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()

# read in base configuration
resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml"))
# TODO: XXX remove these hardcoded values
to_automate = {
"metricsIP": metricsIP,
"dockerImageTag": "tawalaya/optuna-trial:latest",
"dockerImageBuilder": "docker",
#force random namespaces to reduce conflicts
# "kubernetesNamespace": "optuna-study",
"kubernetesContext": "admin@smile",
"kubernetesMasterIP": "130.149.158.143",
"prometheus_url": "http://130.149.158.143:30041",
"deleteAfterRun":True,
"epochs": 50,
}
resources.update(to_automate)

repetitions = 2
for trials in [6,12,18]:
for cpu in range(2,8):
for i in range(1,repetitions+1):
sleep(3)
logging.info(f"Starting Run {i} with 3x{cpu} vCPUs with n_trails {trials}")
try:
resources["trials"] = trials
resources["workerCpu"] = (cpu/2.0)
resources["goal"] = f"rcpu{cpu}-{trials}-{i}"
runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources)
runner.run()
sleep(7)
runner = None
except Exception as e:
logging.warning(f'Failed Run {i} with 3x{cpu} vCPUs with n_trials {trials} - {e}')
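
For scale: the nested sweep above enumerates 3 trial budgets x 6 CPU settings x 2 repetitions = 36 runs, with workerCpu stepping from 1.0 to 3.5 vCPUs in 0.5 increments. A quick sketch of the same grid:

from itertools import product

grid = [(trials, cpu / 2.0, rep)
        for trials, cpu, rep in product([6, 12, 18], range(2, 8), range(1, 3))]
print(len(grid))  # 36 runs
print(grid[0])    # (6, 1.0, 1) -> n_trials, workerCpu, repetition
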
43 changes: 43 additions & 0 deletions experiments/optuna_kubernetes/optuna_kubernetes_rnode.py
@@ -0,0 +1,43 @@
import logging
from os import path
from time import sleep
from experiments.optuna_kubernetes.optuna_kubernetes_benchmark import OptunaKubernetesBenchmark
from ml_benchmark.benchmark_runner import BenchmarkRunner
from urllib.request import urlopen
from ml_benchmark.utils.yml_parser import YMLParser

if __name__ == "__main__":
metricsIP = urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()

# read in base configuration
resources = YMLParser.parse(path.join(path.dirname(__file__),"resource_definition.yml"))
# TODO: XXX remove these hardcoded values
to_automate = {
"metricsIP": metricsIP,
"dockerImageTag": "tawalaya/optuna-trial:latest",
"dockerImageBuilder": "docker",
#force random namespaces to reduce conflicts
# "kubernetesNamespace": "optuna-study",
"kubernetesContext": "admin@smile",
"kubernetesMasterIP": "130.149.158.143",
"prometheus_url": "http://130.149.158.143:30041",
"deleteAfterRun":True,
"epochs": 50,
}
resources.update(to_automate)

repetitions = 2
for i in range(1,repetitions+1):
for n in range(1,7):
sleep(3)
logging.info(f"Starting Run {i} with {n} nodes with n_trails 100")
try:
resources["trials"] = 100
resources["workerCount"] = n
resources["goal"] = f"rnode{n}-100-{i}"
runner = BenchmarkRunner(benchmark_cls=OptunaKubernetesBenchmark, resources=resources)
runner.run()
sleep(7)
runner = None
except Exception as e:
logging.warning(f'Failed Run {i} with {n} nodes and n_trials 100 - {e}')
25 changes: 25 additions & 0 deletions experiments/optuna_kubernetes/resource_definition.yml
@@ -0,0 +1,25 @@

workerCpu: 3.25
workerMemory: 6
workerCount: 1
trials: 6
epochs: 5
metricsIP: auto  # urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
kubernetesMasterIP: minikube  # subprocess.check_output("minikube ip", shell=True).decode("utf-8").strip("\n")
dockerImageTag: tawalaya/optuna-trial:latest
dockerImageBuilder: docker
kubernetesContext: "minikube"
deleteAfterRun: True
hyperparameter:
learning_rate:
start: 1e-4
end: 1e-2
step_size: 1e-3
weight_decay:
start: 1e-6
end: 1e-4
step_size: 1e-5
# hidden_layer_config:
# start: [10]
# end: [100, 100, 100]
# step_size: [10, 1]
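
One pitfall worth noting when parsing this file: PyYAML's implicit float resolver requires a dot in the mantissa, so bare exponents like 1e-4 load as strings, which is presumably why the generated hyperparameter_space.yml below writes 1.0e-06. A minimal sketch, assuming YMLParser wraps PyYAML and the path is as in this commit:

import yaml

with open("experiments/optuna_kubernetes/resource_definition.yml") as f:
    resources = yaml.safe_load(f)

lr = resources["hyperparameter"]["learning_rate"]
# float() casts guard against "1e-4" arriving as a string (no dot in the mantissa)
steps = (float(lr["end"]) - float(lr["start"])) / float(lr["step_size"])
print(resources["workerCount"], resources["trials"], round(steps))  # 1 6 10
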
9 changes: 9 additions & 0 deletions experiments/optuna_minikube/hyperparameter_space.yml
@@ -0,0 +1,9 @@
# generated file - do not edit
learning_rate:
end: 0.01
start: 0.0001
step_size: 0.001
weight_decay:
end: 0.0001
start: 1.0e-06
step_size: 1.0e-05