gzip config improvements

simplymathematics · Aug 4, 2024 · ee6472c
1 parent 60a37df
commit ee6472c
Show file tree

Hide file tree

Showing 8 changed files with 16,289 additions and 460 deletions.
diff --git a/examples/gzip/batchMixin.py b/examples/gzip/batchMixin.py
@@ -174,37 +174,3 @@ def wrapper(method, **kwargs):
 
         return wrapper
 
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    big_X = []
-    big_y = []
-    for i in range(100):
-        X, y = make_classification(
-            n_samples=100,
-            n_features=20,
-            n_informative=19,
-            n_redundant=1,
-            n_classes=2,
-            random_state=42 + i,
-        )
-        big_X.extend(X.tolist())
-        big_y.extend(y.tolist())
-    big_X = np.array(big_X)
-    big_y = np.array(big_y)
-    logger.info(f"Shape of big_X: {big_X.shape}")
-    i = 42
-    X, y = make_classification(
-        n_samples=10000,
-        n_features=20,
-        n_informative=19,
-        n_redundant=1,
-        n_classes=2,
-        random_state=42 + i,
-    )
-    X_train, X_test, y_train, y_test = train_test_split(
-        X,
-        y,
-        test_size=0.2,
-        random_state=42,
-    )
diff --git a/examples/gzip/conf/condense_knn.yaml b/examples/gzip/conf/condense_knn.yaml
@@ -37,30 +37,26 @@ hydra:
       consider_prior: true
       prior_weight: 1.0
       consider_magic_clip: true
-      consider_endpoints: false
-      n_startup_trials: 10
-      n_ei_candidates: 24
+      consider_endpoints: true
+      n_startup_trials: 32
+      n_ei_candidates: 32
       multivariate: true
     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
     direction: ${direction}
     storage: sqlite:///optuna.db
-    study_name: ${dataset}_${model_name}_condense
-    n_trials: 2
-    n_jobs: 2
+    study_name: ???
+    n_trials: ${oc.env:DECKARD_TRIALS, 128}
+    n_jobs: ${oc.env:DECKARD_JOBS, 8}
     max_failure_rate: 1.0
     params:
       model.init.k : 1,3,5,7,11
       +model.init.weights : uniform,distance
       +model.init.algorithm : brute
-      model.init.symmetric : True
-      model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
-      model_name : ${model_name}
-      data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
-      model.init.m: tag(log, interval(.1, 1))
+      model.init.symmetric : True,False
       +model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
   launcher:
     _target_: hydra_plugins.hydra_joblib_launcher.joblib_launcher.JoblibLauncher
-    n_jobs: 8
+    n_jobs: -1
     prefer : processes
     verbose: 1
     timeout: null

diff --git a/examples/gzip/conf/condense_logistic.yaml b/examples/gzip/conf/condense_logistic.yaml
@@ -37,33 +37,29 @@ hydra:
       consider_prior: true
       prior_weight: 1.0
       consider_magic_clip: true
-      consider_endpoints: false
-      n_startup_trials: 10
-      n_ei_candidates: 24
+      consider_endpoints: true
+      n_startup_trials: 32
+      n_ei_candidates: 32
       multivariate: true
     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
-    study_name: ${dataset}_${model_name}_condense
+    study_name: ???
     storage: sqlite:///optuna.db
-    n_jobs: 1
-    n_trials : 1
+    n_jobs: ${oc.env:DECKARD_JOBS, 8}
+    n_trials : ${oc.env:DECKARD_TRIALS, 128}
     params:
       +model.init.solver: saga
-      +model.init.penalty : l2,l1,l2,none
-      +model.init.tol : 1e-4,1e-3,1e-2
-      +model.init.C : 1e-2,1e-1,1e0,1e1,1e2
+      +model.init.penalty : l2,l1
+      +model.init.tol : tag(log, interval(1e-5, 1e-1))
+      +model.init.C : tag(log, interval(1e-3, 1e3))
       +model.init.fit_intercept : True,False
       +model.init.class_weight : balanced,None
-      model.init.symmetric : True
-      model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
-      model_name : ${model_name}
-      data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
-      model.init.m: tag(log, interval(.1, 1))
+      model.init.symmetric : True,False
       +model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
     direction: ${direction}
     max_failure_rate: 1.0
   launcher:
     _target_: hydra_plugins.hydra_joblib_launcher.joblib_launcher.JoblibLauncher
-    n_jobs: 8
+    n_jobs: -1
     prefer : processes
     verbose: 1
     timeout: null

diff --git a/examples/gzip/conf/condense_svc.yaml b/examples/gzip/conf/condense_svc.yaml
@@ -27,10 +27,8 @@ hydra:
       _target_ : database.OptunaStudyDumpCallback
       storage : ${hydra.sweeper.storage}
       study_name : ${hydra.sweeper.study_name}
-      directions : 
-        - maximize
-      metric_names : 
-        - accuracy
+      directions : ${direction}
+      metric_names : ${optimizers}
       output_file : ${dataset}/logs/${model_name}/${data.sample.train_size}/study.csv
   sweeper:
     sampler:
@@ -41,23 +39,19 @@ hydra:
       consider_magic_clip: true
       consider_endpoints: false
       n_startup_trials: 10
-      n_ei_candidates: 24
+      n_ei_candidates: 256
       multivariate: true
     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
-    study_name: ${dataset}_${model_name}_condense
+    study_name: ???
     storage: sqlite:///optuna.db
-    n_jobs: 2
-    n_trials : 2
+    n_jobs: ${oc.env:DECKARD_JOBS, 8}
+    n_trials : ${oc.env:DECKARD_TRIALS, 128}
     params:
       +model.init.kernel : rbf,precomputed
-      +model.init.C : 1e-2,1e-1,1e0,1e1,1e2
+      +model.init.C : tag(log, interval(1e-3, 1e3))
       +model.init.gamma : scale,auto
       +model.init.class_weight : balanced,null
-      model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
-      model.init.symmetric : True
-      model_name : ${model_name}
-      data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
-      model.init.m: tag(log, interval(.1, 1))
+      model.init.symmetric : True,False
       +model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
     direction: ${direction}
     max_failure_rate: 1.0

diff --git a/examples/gzip/database.py b/examples/gzip/database.py
@@ -6,24 +6,22 @@
 from pathlib import Path
 from hydra.experimental.callback import Callback
 import argparse
-
+from typing import Union
 storage = "sqlite:///optuna.db"
 study_name = "gzip_knn_20-0"
 metric_names = ["accuracy"]
 directions = ["maximize"]
 output_file = "optuna.csv"
 
-
 @dataclass
 class OptunaStudyDumpCallback(Callback):
     def __init__(
         self,
         storage: str,
         study_name: str,
-        metric_names: list,
-        directions: list,
+        metric_names: Union[str, ListConfig, list],
+        directions: Union[str, ListConfig, list],
         output_file: str,
-        seed=42,
     ):
         self.storage = storage
         self.study_name = study_name
@@ -70,6 +68,7 @@ def on_multirun_end(self, *args, **kwargs) -> None:
             metric_names = [f"values_{metric}" for metric in self.metric_names]
             df = df.sort_values(metric_names, ascending=False)
         suffix = Path(self.output_file).suffix
+        Path(self.output_file).parent.mkdir(parents=True, exist_ok=True)
         if suffix in [".csv"]:
             df.to_csv(self.output_file, index=False)
         elif suffix in [".json"]: