Skip to content

Commit

Permalink
gzip config improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
simplymathematics committed Aug 4, 2024
1 parent 60a37df commit ee6472c
Show file tree
Hide file tree
Showing 8 changed files with 16,289 additions and 460 deletions.
34 changes: 0 additions & 34 deletions examples/gzip/batchMixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,37 +174,3 @@ def wrapper(method, **kwargs):

return wrapper


if __name__ == "__main__":
    # Ad-hoc demo: build synthetic classification data and split it.
    logging.basicConfig(level=logging.INFO)

    # Generator settings shared by every make_classification call below.
    shared_kwargs = dict(
        n_features=20,
        n_informative=19,
        n_redundant=1,
        n_classes=2,
    )

    # Stack 100 independently seeded batches of 100 samples each.
    feature_rows, label_rows = [], []
    for seed_offset in range(100):
        X, y = make_classification(
            n_samples=100,
            random_state=42 + seed_offset,
            **shared_kwargs,
        )
        feature_rows += X.tolist()
        label_rows += y.tolist()
    big_X = np.array(feature_rows)
    big_y = np.array(label_rows)
    logger.info(f"Shape of big_X: {big_X.shape}")

    # One large dataset generated in a single call (seed 42 + 42).
    i = 42
    X, y = make_classification(
        n_samples=10000,
        random_state=42 + i,
        **shared_kwargs,
    )

    # Hold out 20% of the samples for testing, with a fixed split seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )
20 changes: 8 additions & 12 deletions examples/gzip/conf/condense_knn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,26 @@ hydra:
consider_prior: true
prior_weight: 1.0
consider_magic_clip: true
consider_endpoints: false
n_startup_trials: 10
n_ei_candidates: 24
consider_endpoints: true
n_startup_trials: 32
n_ei_candidates: 32
multivariate: true
_target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
direction: ${direction}
storage: sqlite:///optuna.db
study_name: ${dataset}_${model_name}_condense
n_trials: 2
n_jobs: 2
study_name: ???
n_trials: ${oc.env:DECKARD_TRIALS, 128}
n_jobs: ${oc.env:DECKARD_JOBS, 8}
max_failure_rate: 1.0
params:
model.init.k : 1,3,5,7,11
+model.init.weights : uniform,distance
+model.init.algorithm : brute
model.init.symmetric : True
model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
model_name : ${model_name}
data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
model.init.m: tag(log, interval(.1, 1))
model.init.symmetric : True,False
+model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
launcher:
_target_: hydra_plugins.hydra_joblib_launcher.joblib_launcher.JoblibLauncher
n_jobs: 8
n_jobs: -1
prefer : processes
verbose: 1
timeout: null
Expand Down
26 changes: 11 additions & 15 deletions examples/gzip/conf/condense_logistic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,29 @@ hydra:
consider_prior: true
prior_weight: 1.0
consider_magic_clip: true
consider_endpoints: false
n_startup_trials: 10
n_ei_candidates: 24
consider_endpoints: true
n_startup_trials: 32
n_ei_candidates: 32
multivariate: true
_target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
study_name: ${dataset}_${model_name}_condense
study_name: ???
storage: sqlite:///optuna.db
n_jobs: 1
n_trials : 1
n_jobs: ${oc.env:DECKARD_JOBS, 8}
n_trials : ${oc.env:DECKARD_TRIALS, 128}
params:
+model.init.solver: saga
+model.init.penalty : l2,l1,l2,none
+model.init.tol : 1e-4,1e-3,1e-2
+model.init.C : 1e-2,1e-1,1e0,1e1,1e2
+model.init.penalty : l2,l1
+model.init.tol : tag(log, interval(1e-5, 1e-1))
+model.init.C : tag(log, interval(1e-3, 1e3))
+model.init.fit_intercept : True,False
# `null` is Hydra's null literal; a bare `None` is swept as the string "None".
+model.init.class_weight : balanced,null
model.init.symmetric : True
model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
model_name : ${model_name}
data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
model.init.m: tag(log, interval(.1, 1))
model.init.symmetric : True,False
+model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
direction: ${direction}
max_failure_rate: 1.0
launcher:
_target_: hydra_plugins.hydra_joblib_launcher.joblib_launcher.JoblibLauncher
n_jobs: 8
n_jobs: -1
prefer : processes
verbose: 1
timeout: null
Expand Down
22 changes: 8 additions & 14 deletions examples/gzip/conf/condense_svc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,8 @@ hydra:
_target_ : database.OptunaStudyDumpCallback
storage : ${hydra.sweeper.storage}
study_name : ${hydra.sweeper.study_name}
directions :
- maximize
metric_names :
- accuracy
directions : ${direction}
metric_names : ${optimizers}
output_file : ${dataset}/logs/${model_name}/${data.sample.train_size}/study.csv
sweeper:
sampler:
Expand All @@ -41,23 +39,19 @@ hydra:
consider_magic_clip: true
consider_endpoints: false
n_startup_trials: 10
n_ei_candidates: 24
n_ei_candidates: 256
multivariate: true
_target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
study_name: ${dataset}_${model_name}_condense
study_name: ???
storage: sqlite:///optuna.db
n_jobs: 2
n_trials : 2
n_jobs: ${oc.env:DECKARD_JOBS, 8}
n_trials : ${oc.env:DECKARD_TRIALS, 128}
params:
+model.init.kernel : rbf,precomputed
+model.init.C : 1e-2,1e-1,1e0,1e1,1e2
+model.init.C : tag(log, interval(1e-3, 1e3))
+model.init.gamma : scale,auto
+model.init.class_weight : balanced,null
model.init.metric : gzip,lzma,bz2,pkl,zstd,levenshtein,ratio,hamming,jaro,jaro_winkler,seqratio
model.init.symmetric : True
model_name : ${model_name}
data.sample.random_state: 0,1,2,3,4,5,6,7,8,9
model.init.m: tag(log, interval(.1, 1))
model.init.symmetric : True,False
+model.init.sampling_method: medoid,sum,svc,random,hardness,nearmiss,knn
direction: ${direction}
max_failure_rate: 1.0
Expand Down
9 changes: 4 additions & 5 deletions examples/gzip/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,22 @@
from pathlib import Path
from hydra.experimental.callback import Callback
import argparse

from typing import Union
storage = "sqlite:///optuna.db"
study_name = "gzip_knn_20-0"
metric_names = ["accuracy"]
directions = ["maximize"]
output_file = "optuna.csv"


@dataclass
class OptunaStudyDumpCallback(Callback):
def __init__(
self,
storage: str,
study_name: str,
metric_names: list,
directions: list,
metric_names: Union[str, ListConfig, list],
directions: Union[str, ListConfig, list],
output_file: str,
seed=42,
):
self.storage = storage
self.study_name = study_name
Expand Down Expand Up @@ -70,6 +68,7 @@ def on_multirun_end(self, *args, **kwargs) -> None:
metric_names = [f"values_{metric}" for metric in self.metric_names]
df = df.sort_values(metric_names, ascending=False)
suffix = Path(self.output_file).suffix
Path(self.output_file).parent.mkdir(parents=True, exist_ok=True)
if suffix in [".csv"]:
df.to_csv(self.output_file, index=False)
elif suffix in [".json"]:
Expand Down
Loading

0 comments on commit ee6472c

Please sign in to comment.