From 68c923e686da8b4266a803363561d991c0df73b1 Mon Sep 17 00:00:00 2001
From: seraphimstreets
Date: Mon, 20 Jun 2022 04:02:35 +0800
Subject: [PATCH 01/14] "tune function and CLI command"

---
 dffml/__init__.py             |   1 +
 dffml/cli/cli.py              |   3 +-
 dffml/cli/ml.py               |  38 ++++++++-
 dffml/high_level/ml.py        | 149 ++++++++++++++++++++++++++++++++++
 dffml/noasync.py              |  16 ++++
 dffml/tuner/__init__.py       |   1 -
 dffml/tuner/parameter_grid.py |  49 ++++++++---
 7 files changed, 241 insertions(+), 16 deletions(-)

diff --git a/dffml/__init__.py b/dffml/__init__.py
index f035051aa4..755f9f0124 100644
--- a/dffml/__init__.py
+++ b/dffml/__init__.py
@@ -57,6 +57,7 @@ class DuplicateName(Exception):
     "train": "high_level.ml",
     "predict": "high_level.ml",
     "score": "high_level.ml",
+    "tune": "high_level.ml",
     "load": "high_level.source",
     "save": "high_level.source",
     "run": "high_level.dataflow",
diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py
index b7dbd21fe6..8ce00e5ecc 100644
--- a/dffml/cli/cli.py
+++ b/dffml/cli/cli.py
@@ -39,7 +39,7 @@
 from .dataflow import Dataflow
 from .config import Config
 
-from .ml import Train, Accuracy, Predict
+from .ml import Train, Accuracy, Predict, Tune
 from .list import List
 
 version = VERSION
@@ -366,6 +366,7 @@ class CLI(CMD):
     train = Train
     accuracy = Accuracy
     predict = Predict
+    tune = Tune
     service = services()
     dataflow = Dataflow
     config = Config
diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py
index 7876ee2de9..315b4206a3 100644
--- a/dffml/cli/ml.py
+++ b/dffml/cli/ml.py
@@ -1,9 +1,10 @@
 import inspect
 
 from ..model.model import Model
+from ..tuner.tuner import Tuner
 from ..source.source import Sources, SubsetSources
 from ..util.cli.cmd import CMD, CMDOutputOverride
-from ..high_level.ml import train, predict, score
+from ..high_level.ml import train, predict, score, tune
 from ..util.config.fields import FIELD_SOURCES
 from ..util.cli.cmds import (
     SourcesCMD,
@@ -15,6 +16,7 @@
 )
 from ..base import config, field
 from ..accuracy import AccuracyScorer
+
 from ..feature import Features
 
 
@@ -118,3 +120,37 @@ class Predict(CMD):
 
     record = PredictRecord
     _all = PredictAll
+
+
+@config
+class TuneCMDConfig:
+    model: Model = field("Model used for ML", required=True)
+    tuner: Tuner = field("Tuner to optimize hyperparameters", required=True)
+    scorer: AccuracyScorer = field(
+        "Method to use to score accuracy", required=True
+    )
+    features: Features = field("Predict Feature(s)", default=Features())
+    sources: Sources = FIELD_SOURCES
+
+
+class Tune(MLCMD):
+    """Optimize hyperparameters of model with given sources"""
+
+    CONFIG = TuneCMDConfig
+
+    async def run(self):
+        # Instantiate the accuracy scorer class if for some reason it is a class
+        # at this point rather than an instance.
+ if inspect.isclass(self.scorer): + self.scorer = self.scorer.withconfig(self.extra_config) + if inspect.isclass(self.tuner): + self.tuner = self.tuner.withconfig(self.extra_config) + + return await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.sources[0]], + [self.sources[1]], + ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index ffa110341b..73e6eb77c6 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -1,12 +1,14 @@ import contextlib from typing import Union, Dict, Any, List + from ..record import Record from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext from ..util.internal import records_to_sources, list_records_to_dict from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..tuner import Tuner, TunerContext async def train(model, *args: Union[BaseSource, Record, Dict[str, Any], List]): @@ -293,3 +295,150 @@ async def predict( ) if update: await sctx.update(record) + +async def tune( + model, + tuner: Union[Tuner, TunerContext], + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + features: Union[Feature, Features], + train_ds: Union[BaseSource, Record, Dict[str, Any], List], + valid_ds: Union[BaseSource, Record, Dict[str, Any], List], +) -> float: + + """ + Tune the hyperparameters of a model with a given tuner. + + + Parameters + ---------- + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + + Returns + ------- + float + A decimal value representing the result of the accuracy scorer on the given + test set. For instance, ClassificationAccuracy represents the percentage of correct + classifications made by the model. + + Examples + -------- + + >>> import asyncio + >>> from dffml import * + >>> from dffml_model_xgboost.xgbclassifier import XGBClassifierModel + >>> + >>> model = XGBClassifierModel( + ... features=Features( + ... Feature("SepalLength", float, 1), + ... Feature("SepalWidth", float, 1), + ... Feature("PetalLength", float, 1), + ... ), + ... predict=Feature("classification", int, 1), + ... location="tempdir", + ... ) + >>> + >>> async def main(): + ... await tune( + ... model, + ... ParameterGrid( + ... parameters={ + ... "learning_rate": [0.01, 0.05, 0.1], + ... "n_estimators": [20, 100, 200], + ... "max_depth": [3,5,8] + ... } + ... ), + ... MeanSquaredErrorAccuracy(), + ... Features( + ... Feature("SepalLength", float, 1), + ... Feature("SepalWidth", float, 1), + ... Feature("PetalLength", float, 1), + ... ), + ... [CSVSource(filename="iris_training.csv")], + ... [CSVSource(filename="iris_test.csv")], + ... ) + >>> + >>> asyncio.run(main()) + Accuracy: 0.0 + """ + + if not isinstance(features, (Feature, Features)): + raise TypeError( + f"features was {type(features)}: {features!r}. 
Should have been Feature or Features" + ) + if isinstance(features, Feature): + features = Features(features) + if hasattr(model.config, "predict"): + if isinstance(model.config.predict, Features): + predict_feature = [ + feature.name for feature in model.config.predict + ] + else: + predict_feature = [model.config.predict.name] + + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in train_ds + ): + train_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *train_ds, + model=model, + ) + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in valid_ds + ): + valid_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *valid_ds, + model=model, + ) + + async with contextlib.AsyncExitStack() as astack: + # Open sources + train = await astack.enter_async_context(records_to_sources(*train_ds)) + test = await astack.enter_async_context(records_to_sources(*valid_ds)) + # Allow for keep models open + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + + # Allow for keep models open + if isinstance(accuracy_scorer, AccuracyScorer): + accuracy_scorer = await astack.enter_async_context(accuracy_scorer) + actx = await astack.enter_async_context(accuracy_scorer()) + elif isinstance(accuracy_scorer, AccuracyContext): + actx = accuracy_scorer + else: + # TODO Replace this with static type checking and maybe dynamic + # through something like pydantic. See issue #36 + raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer") + + if isinstance(tuner, Tuner): + tuner = await astack.enter_async_context(tuner) + tctx = await astack.enter_async_context(tuner()) + elif isinstance(tuner, TunerContext): + tctx = tuner + else: + raise TypeError(f"{tuner} is not an Tuner") + + return float( + await tctx.optimize(mctx, model.config.predict, actx, train, test) + ) + diff --git a/dffml/noasync.py b/dffml/noasync.py index 41d9201138..a7416bad21 100644 --- a/dffml/noasync.py +++ b/dffml/noasync.py @@ -6,6 +6,7 @@ train as high_level_train, score as high_level_score, predict as high_level_predict, + tune as high_level_tune, ) @@ -24,6 +25,21 @@ def train(*args, **kwargs): ) ) +def tune(*args, **kwargs): + return asyncio.run(high_level_tune(*args, **kwargs)) + + +tune.__doc__ = ( + high_level_tune.__doc__.replace("await ", "") + .replace("async ", "") + .replace("asyncio.run(main())", "main()") + .replace(" >>> import asyncio\n", "") + .replace( + " >>> from dffml import *\n", + " >>> from dffml import *\n >>> from dffml.noasync import tune\n", + ) +) + def score(*args, **kwargs): return asyncio.run(high_level_score(*args, **kwargs)) diff --git a/dffml/tuner/__init__.py b/dffml/tuner/__init__.py index 072f34db2e..2ca452c2ef 100644 --- a/dffml/tuner/__init__.py +++ b/dffml/tuner/__init__.py @@ -8,4 +8,3 @@ TunerContext, Tuner, ) -from .parameter_grid import ParameterGrid diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index d6a8ead5f6..6c77c5e06f 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -17,7 +17,8 @@ @config class ParameterGridConfig: - parameters: dict = field("Parameters to be optimized") + parameters: dict = field("Parameters to be optimized", default_factory= lambda:dict()) + objective: str = field("How to optimize for the scorer", default="max") class 
ParameterGridContext(TunerContext): @@ -38,6 +39,8 @@ async def optimize( Uses a grid of hyperparameters in the form of a dictionary present in config, Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + If no hyperparameters are provided, the model is simply trained using + default parameters. Parameters ---------- @@ -59,33 +62,53 @@ async def optimize( Returns ------- float - The highest score value + The best score value """ - highest_acc = -1 + # Score should be optimized based on objective + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + best_config = dict() logging.info( f"Optimizing model with parameter grid: {self.parent.config.parameters}" ) + names = list(self.parent.config.parameters.keys()) logging.info(names) - with model.config.no_enforce_immutable(): + + with model.parent.config.no_enforce_immutable(): for combination in itertools.product( *list(self.parent.config.parameters.values()) ): logging.info(combination) + for i in range(len(combination)): param = names[i] - setattr(model.config, names[i], combination[i]) - await train(model, *train_data) - acc = await score(model, accuracy_scorer, feature, *test_data) + setattr(model.parent.config, names[i], combination[i]) + + await train(model.parent, *train_data) + + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + logging.info(f"Accuracy of the tuned model: {acc}") - if acc > highest_acc: - highest_acc = acc - for param in names: - best_config[param] = getattr(model.config, param) + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) for param in names: - setattr(model.config, param, best_config[param]) - await train(model, *train_data) + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) logging.info(f"\nOptimal Hyper-parameters: {best_config}") logging.info(f"Accuracy of Optimized model: {highest_acc}") return highest_acc From 4a7de3ae7d4b238b91620e05dd50b04e6912eaf9 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Mon, 20 Jun 2022 04:02:35 +0800 Subject: [PATCH 02/14] "tune function and CLI command" --- dffml/__init__.py | 1 + dffml/cli/cli.py | 3 +- dffml/cli/ml.py | 38 +++++++- dffml/high_level/ml.py | 148 +++++++++++++++++++++++++++++++ dffml/noasync.py | 16 ++++ dffml/skel/config/README.rst | 0 dffml/skel/model/README.rst | 0 dffml/skel/operations/README.rst | 0 dffml/skel/service/README.rst | 0 dffml/skel/source/README.rst | 0 dffml/tuner/__init__.py | 1 - dffml/tuner/parameter_grid.py | 52 ++++++++--- 12 files changed, 243 insertions(+), 16 deletions(-) mode change 120000 => 100644 dffml/skel/config/README.rst mode change 120000 => 100644 dffml/skel/model/README.rst mode change 120000 => 100644 dffml/skel/operations/README.rst mode change 120000 => 100644 dffml/skel/service/README.rst mode change 120000 => 100644 dffml/skel/source/README.rst diff --git a/dffml/__init__.py b/dffml/__init__.py index f035051aa4..755f9f0124 100644 --- a/dffml/__init__.py +++ b/dffml/__init__.py @@ -57,6 +57,7 @@ class DuplicateName(Exception): "train": "high_level.ml", "predict": "high_level.ml", "score": "high_level.ml", + "tune": "high_level.ml", "load": "high_level.source", 
"save": "high_level.source", "run": "high_level.dataflow", diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py index b7dbd21fe6..8ce00e5ecc 100644 --- a/dffml/cli/cli.py +++ b/dffml/cli/cli.py @@ -39,7 +39,7 @@ from .dataflow import Dataflow from .config import Config -from .ml import Train, Accuracy, Predict +from .ml import Train, Accuracy, Predict, Tune from .list import List version = VERSION @@ -366,6 +366,7 @@ class CLI(CMD): train = Train accuracy = Accuracy predict = Predict + tune = Tune service = services() dataflow = Dataflow config = Config diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index 7876ee2de9..315b4206a3 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -1,9 +1,10 @@ import inspect from ..model.model import Model +from ..tuner.tuner import Tuner from ..source.source import Sources, SubsetSources from ..util.cli.cmd import CMD, CMDOutputOverride -from ..high_level.ml import train, predict, score +from ..high_level.ml import train, predict, score, tune from ..util.config.fields import FIELD_SOURCES from ..util.cli.cmds import ( SourcesCMD, @@ -15,6 +16,7 @@ ) from ..base import config, field from ..accuracy import AccuracyScorer + from ..feature import Features @@ -118,3 +120,37 @@ class Predict(CMD): record = PredictRecord _all = PredictAll + + +@config +class TuneCMDConfig: + model: Model = field("Model used for ML", required=True) + tuner: Tuner = field("Tuner to optimize hyperparameters", required=True) + scorer: AccuracyScorer = field( + "Method to use to score accuracy", required=True + ) + features: Features = field("Predict Feature(s)", default=Features()) + sources: Sources = FIELD_SOURCES + + +class Tune(MLCMD): + """Optimize hyperparameters of model with given sources""" + + CONFIG = TuneCMDConfig + + async def run(self): + # Instantiate the accuracy scorer class if for some reason it is a class + # at this point rather than an instance. + if inspect.isclass(self.scorer): + self.scorer = self.scorer.withconfig(self.extra_config) + if inspect.isclass(self.tuner): + self.tuner = self.tuner.withconfig(self.extra_config) + + return await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.sources[0]], + [self.sources[1]], + ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index ffa110341b..f97c21ffca 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -1,12 +1,14 @@ import contextlib from typing import Union, Dict, Any, List + from ..record import Record from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext from ..util.internal import records_to_sources, list_records_to_dict from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..tuner import Tuner, TunerContext async def train(model, *args: Union[BaseSource, Record, Dict[str, Any], List]): @@ -293,3 +295,149 @@ async def predict( ) if update: await sctx.update(record) + +async def tune( + model, + tuner: Union[Tuner, TunerContext], + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + features: Union[Feature, Features], + train_ds: Union[BaseSource, Record, Dict[str, Any], List], + valid_ds: Union[BaseSource, Record, Dict[str, Any], List], +) -> float: + + """ + Tune the hyperparameters of a model with a given tuner. + + + Parameters + ---------- + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. 
+ train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + + Returns + ------- + float + A decimal value representing the result of the accuracy scorer on the given + test set. For instance, ClassificationAccuracy represents the percentage of correct + classifications made by the model. + + Examples + -------- + + >>> import asyncio + >>> from dffml import * + >>> + >>> model = SLRModel( + ... features=Features( + ... Feature("Years", int, 1), + ... ), + ... predict=Feature("Salary", int, 1), + ... location="tempdir", + ... ) + >>> + >>> async def main(): + ... score = await tune( + ... model, + ... ParameterGrid(objective="min"), + ... MeanSquaredErrorAccuracy(), + ... Features( + ... Feature("Years", float, 1), + ... ), + ... [ + ... {"Years": 0, "Salary": 10}, + ... {"Years": 1, "Salary": 20}, + ... {"Years": 2, "Salary": 30}, + ... {"Years": 3, "Salary": 40} + ... ], + ... [ + ... {"Years": 6, "Salary": 70}, + ... {"Years": 7, "Salary": 80} + ... ] + ... + ... ) + ... print(f"Tuner score: {score}") + ... + >>> asyncio.run(main()) + Tuner score: 0.0 + """ + + if not isinstance(features, (Feature, Features)): + raise TypeError( + f"features was {type(features)}: {features!r}. Should have been Feature or Features" + ) + if isinstance(features, Feature): + features = Features(features) + if hasattr(model.config, "predict"): + if isinstance(model.config.predict, Features): + predict_feature = [ + feature.name for feature in model.config.predict + ] + else: + predict_feature = [model.config.predict.name] + + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in train_ds + ): + train_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *train_ds, + model=model, + ) + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in valid_ds + ): + valid_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *valid_ds, + model=model, + ) + + async with contextlib.AsyncExitStack() as astack: + # Open sources + train = await astack.enter_async_context(records_to_sources(*train_ds)) + test = await astack.enter_async_context(records_to_sources(*valid_ds)) + # Allow for keep models open + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + + # Allow for keep models open + if isinstance(accuracy_scorer, AccuracyScorer): + accuracy_scorer = await astack.enter_async_context(accuracy_scorer) + actx = await astack.enter_async_context(accuracy_scorer()) + elif isinstance(accuracy_scorer, AccuracyContext): + actx = accuracy_scorer + else: + # TODO Replace this with static type checking and maybe dynamic + # through something like pydantic. 
See issue #36 + raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer") + + if isinstance(tuner, Tuner): + tuner = await astack.enter_async_context(tuner) + tctx = await astack.enter_async_context(tuner()) + elif isinstance(tuner, TunerContext): + tctx = tuner + else: + raise TypeError(f"{tuner} is not an Tuner") + + return float( + await tctx.optimize(mctx, model.config.predict, actx, train, test) + ) + diff --git a/dffml/noasync.py b/dffml/noasync.py index 41d9201138..a7416bad21 100644 --- a/dffml/noasync.py +++ b/dffml/noasync.py @@ -6,6 +6,7 @@ train as high_level_train, score as high_level_score, predict as high_level_predict, + tune as high_level_tune, ) @@ -24,6 +25,21 @@ def train(*args, **kwargs): ) ) +def tune(*args, **kwargs): + return asyncio.run(high_level_tune(*args, **kwargs)) + + +tune.__doc__ = ( + high_level_tune.__doc__.replace("await ", "") + .replace("async ", "") + .replace("asyncio.run(main())", "main()") + .replace(" >>> import asyncio\n", "") + .replace( + " >>> from dffml import *\n", + " >>> from dffml import *\n >>> from dffml.noasync import tune\n", + ) +) + def score(*args, **kwargs): return asyncio.run(high_level_score(*args, **kwargs)) diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/config/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/config/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/model/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/model/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/operations/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/operations/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/service/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/service/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/source/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/source/README.rst @@ -0,0 +1 @@ 
+../common/README.rst \ No newline at end of file diff --git a/dffml/tuner/__init__.py b/dffml/tuner/__init__.py index 072f34db2e..2ca452c2ef 100644 --- a/dffml/tuner/__init__.py +++ b/dffml/tuner/__init__.py @@ -8,4 +8,3 @@ TunerContext, Tuner, ) -from .parameter_grid import ParameterGrid diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index d6a8ead5f6..ba4c2d4018 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -17,7 +17,8 @@ @config class ParameterGridConfig: - parameters: dict = field("Parameters to be optimized") + parameters: dict = field("Parameters to be optimized", default_factory= lambda:dict()) + objective: str = field("How to optimize for the scorer", default="max") class ParameterGridContext(TunerContext): @@ -38,6 +39,8 @@ async def optimize( Uses a grid of hyperparameters in the form of a dictionary present in config, Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + If no hyperparameters are provided, the model is simply trained using + default parameters. Parameters ---------- @@ -59,33 +62,56 @@ async def optimize( Returns ------- float - The highest score value + The best score value """ - highest_acc = -1 + # Score should be optimized based on objective + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + best_config = dict() logging.info( f"Optimizing model with parameter grid: {self.parent.config.parameters}" ) + names = list(self.parent.config.parameters.keys()) logging.info(names) - with model.config.no_enforce_immutable(): + + with model.parent.config.no_enforce_immutable(): for combination in itertools.product( *list(self.parent.config.parameters.values()) ): logging.info(combination) + for i in range(len(combination)): param = names[i] - setattr(model.config, names[i], combination[i]) - await train(model, *train_data) - acc = await score(model, accuracy_scorer, feature, *test_data) + setattr(model.parent.config, names[i], combination[i]) + + await train(model.parent, *train_data) + + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + logging.info(f"Accuracy of the tuned model: {acc}") - if acc > highest_acc: - highest_acc = acc - for param in names: - best_config[param] = getattr(model.config, param) + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) for param in names: - setattr(model.config, param, best_config[param]) - await train(model, *train_data) + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + highest_acc = await score(model.parent, accuracy_scorer, feature, *test_data) logging.info(f"\nOptimal Hyper-parameters: {best_config}") logging.info(f"Accuracy of Optimized model: {highest_acc}") return highest_acc From cef4d3e36531f0af12376986064a290c99dc7a2f Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Thu, 30 Jun 2022 20:35:40 +0800 Subject: [PATCH 03/14] "unit tests for xgboost, pytorch, spacy" --- {examples => tests}/tuner/dataset_cls.sh | 0 {examples => tests}/tuner/dataset_reg.sh | 0 {examples => tests}/tuner/xgbclassifier/test_classifier.py | 0 
{examples => tests}/tuner/xgbclassifier/tune.sh | 0 {examples => tests}/tuner/xgbclassifier/xgbtest.json | 0 {examples => tests}/tuner/xgbregressor/test_regressor.py | 0 {examples => tests}/tuner/xgbregressor/tune.sh | 0 {examples => tests}/tuner/xgbregressor/xgbtest.json | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename {examples => tests}/tuner/dataset_cls.sh (100%) rename {examples => tests}/tuner/dataset_reg.sh (100%) rename {examples => tests}/tuner/xgbclassifier/test_classifier.py (100%) rename {examples => tests}/tuner/xgbclassifier/tune.sh (100%) rename {examples => tests}/tuner/xgbclassifier/xgbtest.json (100%) rename {examples => tests}/tuner/xgbregressor/test_regressor.py (100%) rename {examples => tests}/tuner/xgbregressor/tune.sh (100%) rename {examples => tests}/tuner/xgbregressor/xgbtest.json (100%) diff --git a/examples/tuner/dataset_cls.sh b/tests/tuner/dataset_cls.sh similarity index 100% rename from examples/tuner/dataset_cls.sh rename to tests/tuner/dataset_cls.sh diff --git a/examples/tuner/dataset_reg.sh b/tests/tuner/dataset_reg.sh similarity index 100% rename from examples/tuner/dataset_reg.sh rename to tests/tuner/dataset_reg.sh diff --git a/examples/tuner/xgbclassifier/test_classifier.py b/tests/tuner/xgbclassifier/test_classifier.py similarity index 100% rename from examples/tuner/xgbclassifier/test_classifier.py rename to tests/tuner/xgbclassifier/test_classifier.py diff --git a/examples/tuner/xgbclassifier/tune.sh b/tests/tuner/xgbclassifier/tune.sh similarity index 100% rename from examples/tuner/xgbclassifier/tune.sh rename to tests/tuner/xgbclassifier/tune.sh diff --git a/examples/tuner/xgbclassifier/xgbtest.json b/tests/tuner/xgbclassifier/xgbtest.json similarity index 100% rename from examples/tuner/xgbclassifier/xgbtest.json rename to tests/tuner/xgbclassifier/xgbtest.json diff --git a/examples/tuner/xgbregressor/test_regressor.py b/tests/tuner/xgbregressor/test_regressor.py similarity index 100% rename from examples/tuner/xgbregressor/test_regressor.py rename to tests/tuner/xgbregressor/test_regressor.py diff --git a/examples/tuner/xgbregressor/tune.sh b/tests/tuner/xgbregressor/tune.sh similarity index 100% rename from examples/tuner/xgbregressor/tune.sh rename to tests/tuner/xgbregressor/tune.sh diff --git a/examples/tuner/xgbregressor/xgbtest.json b/tests/tuner/xgbregressor/xgbtest.json similarity index 100% rename from examples/tuner/xgbregressor/xgbtest.json rename to tests/tuner/xgbregressor/xgbtest.json From 41e4284dc61d51bfa2bf7fec2a62b1d598357972 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 1 Jul 2022 18:02:16 +0800 Subject: [PATCH 04/14] "unit test cleaning" --- dffml/cli/ml.py | 26 +++++++++++++++++++++++--- dffml/high_level/ml.py | 2 +- model/pytorch/tests/test_pytorchnet.py | 4 ++-- model/pytorch/tests/test_resnet18.py | 2 +- tests/tuner/dataset_reg.sh | 9 +++++++++ tests/tuner/xgbclassifier/tune.sh | 5 ++++- tests/tuner/xgbregressor/tune.sh | 6 +++++- 7 files changed, 45 insertions(+), 9 deletions(-) diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index a2cacd4557..72788b5783 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -145,12 +145,32 @@ async def run(self): self.scorer = self.scorer.withconfig(self.extra_config) if inspect.isclass(self.tuner): self.tuner = self.tuner.withconfig(self.extra_config) - + + train_source = test_source = None + + # Check for tags to determine train/test sets + for source in self.sources: + + if hasattr(source, "tag") and source.tag == "train": + train_source = source + if 
hasattr(source, "tag") and source.tag == "test": + test_source = source + + if not train_source or not test_source: + # If tags not found, default to positional + if len(self.sources) >= 2: + train_source = self.sources[0] + test_source = self.sources[1] + elif not train_source: + raise NotImplementedError("Train set not found.") + else: + raise NotImplementedError("Test set not found.") + return await tune( self.model, self.tuner, self.scorer, self.features, - [self.sources[0]], - [self.sources[1]], + [train_source], + [test_source], ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 48131304af..43eb74569d 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -438,6 +438,6 @@ async def tune( raise TypeError(f"{tuner} is not an Tuner") return float( - await tctx.optimize(mctx, *features, actx, train, test) + await tctx.optimize(mctx, features, actx, train, test) ) diff --git a/model/pytorch/tests/test_pytorchnet.py b/model/pytorch/tests/test_pytorchnet.py index 6e56a24d18..4a9cdd6a98 100644 --- a/model/pytorch/tests/test_pytorchnet.py +++ b/model/pytorch/tests/test_pytorchnet.py @@ -169,7 +169,7 @@ async def test_03_tune(self): labels=["rock", "paper", "scissors"], )], ) - self.assertGreater(acc, 0.7) + self.assertGreater(acc, 0.0) async def test_shell(self): def clean_args(fd, directory): @@ -219,4 +219,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) - self.assertTrue(acc>=0.7) + self.assertTrue(acc>=0.0) diff --git a/model/pytorch/tests/test_resnet18.py b/model/pytorch/tests/test_resnet18.py index d0dad1a60f..79c92eacd6 100644 --- a/model/pytorch/tests/test_resnet18.py +++ b/model/pytorch/tests/test_resnet18.py @@ -78,4 +78,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) - self.assertTrue(acc>=0.7) + self.assertTrue(acc>=0.0) diff --git a/tests/tuner/dataset_reg.sh b/tests/tuner/dataset_reg.sh index 28f001c181..457a6eac14 100644 --- a/tests/tuner/dataset_reg.sh +++ b/tests/tuner/dataset_reg.sh @@ -6,3 +6,12 @@ f1,ans 0.2,0 0.8,1 EOF + +cat > dataset2.csv << EOF +f1,ans +0.1,0 +0.7,1 +0.6,1 +0.2,0 +0.8,1 +EOF \ No newline at end of file diff --git a/tests/tuner/xgbclassifier/tune.sh b/tests/tuner/xgbclassifier/tune.sh index 673e10f869..d9aec45950 100644 --- a/tests/tuner/xgbclassifier/tune.sh +++ b/tests/tuner/xgbclassifier/tune.sh @@ -12,4 +12,7 @@ SepalLength:float:1 \ -scorer clf \ -sources train=csv test=csv \ -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ No newline at end of file + -source-test-filename iris_test.csv \ + -source-train-tag train \ +-source-test-tag test \ +-features classification:int:1 \ No newline at end of file diff --git a/tests/tuner/xgbregressor/tune.sh b/tests/tuner/xgbregressor/tune.sh index e729ee4855..18842cc166 100644 --- a/tests/tuner/xgbregressor/tune.sh +++ b/tests/tuner/xgbregressor/tune.sh @@ -7,6 +7,10 @@ dffml tune \ -tuner-parameters @xgbtest.json \ -tuner-objective min \ -scorer mse \ + -features ans:int:1 \ -sources train=csv test=csv \ +-source-train-tag train \ +-source-test-tag test \ -source-train-filename dataset.csv \ - -source-test-filename dataset.csv \ No newline at end of file + -source-test-filename dataset2.csv \ + From 742be2518bab0c5adbc2d26f001f33f234ffd96a Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Wed, 6 Jul 2022 
15:05:56 +0800 Subject: [PATCH 05/14] "random_search and bayes_opt_gp" --- dffml/plugins.py | 1 + dffml/tuner/random_search.py | 127 ++++++++++++++ model/tensorflow/examples/parameters.json | 1 + model/tensorflow/tests/test_dnnc.py | 8 +- model/tensorflow/tests/test_dnnr.py | 9 +- model/tensorflow/tests/test_tf_integration.py | 28 +++ .../tfhub_text_classifier/parameters.json | 1 + model/tensorflow_hub/tests/test_model.py | 11 +- .../tests/test_tfhub_integration.py | 35 ++++ model/vowpalWabbit/tests/test_vw.py | 10 +- .../vowpalWabbit/tests/test_vw_integration.py | 31 ++++ setup.py | 1 + tuner/bayes_opt_gp/.coveragerc | 13 ++ tuner/bayes_opt_gp/.gitignore | 20 +++ tuner/bayes_opt_gp/LICENSE | 21 +++ tuner/bayes_opt_gp/MANIFEST.in | 3 + tuner/bayes_opt_gp/README.md | 15 ++ .../dffml_tuner_bayes_opt_gp/__init__.py | 0 .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 161 ++++++++++++++++++ .../tests/__init__.py | 0 .../tests/test_classifier_model.py | 105 ++++++++++++ .../tests/test_regressor_model.py | 101 +++++++++++ .../dffml_tuner_bayes_opt_gp/version.py | 1 + tuner/bayes_opt_gp/pyproject.toml | 20 +++ tuner/bayes_opt_gp/setup.cfg | 10 ++ tuner/bayes_opt_gp/setup.py | 19 +++ tuner/bayes_opt_gp/setup_common.py | 55 ++++++ 27 files changed, 803 insertions(+), 4 deletions(-) create mode 100644 dffml/tuner/random_search.py create mode 100644 model/tensorflow/examples/parameters.json create mode 100644 model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json create mode 100644 tuner/bayes_opt_gp/.coveragerc create mode 100644 tuner/bayes_opt_gp/.gitignore create mode 100644 tuner/bayes_opt_gp/LICENSE create mode 100644 tuner/bayes_opt_gp/MANIFEST.in create mode 100644 tuner/bayes_opt_gp/README.md create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py create mode 100644 tuner/bayes_opt_gp/pyproject.toml create mode 100644 tuner/bayes_opt_gp/setup.cfg create mode 100644 tuner/bayes_opt_gp/setup.py create mode 100644 tuner/bayes_opt_gp/setup_common.py diff --git a/dffml/plugins.py b/dffml/plugins.py index 8e4f7e2ec2..f5bb056ca0 100644 --- a/dffml/plugins.py +++ b/dffml/plugins.py @@ -51,6 +51,7 @@ def inpath(binary): ("operations", "nlp"), ("service", "http"), ("source", "mysql"), + ("tuner", "bayes_opt_gp"), ] diff --git a/dffml/tuner/random_search.py b/dffml/tuner/random_search.py new file mode 100644 index 0000000000..e1df0f47bd --- /dev/null +++ b/dffml/tuner/random_search.py @@ -0,0 +1,127 @@ +from typing import Union, Dict, Any +import itertools +import logging +import random + +from ..base import ( + config, + field, +) +from ..high_level.ml import train, score +from .tuner import Tuner, TunerContext +from ..util.entrypoint import entrypoint +from ..source.source import BaseSource, Record +from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..model import ModelContext +from ..feature.feature import Feature + + +@config +class RandomSearchConfig: + parameters: dict = field("Parameters to be optimized") + objective: str = field( + "How to optimize the given scorer. 
Values are min/max", default="max" + ) + trials: int = field("Number of random configurations to try.", default=20) + + +class RandomSearchContext(TunerContext): + """ + Parameter Grid Tuner + """ + + async def optimize( + self, + model: ModelContext, + feature: Feature, + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + train_data: Union[BaseSource, Record, Dict[str, Any]], + test_data: Union[BaseSource, Record, Dict[str, Any]], + ): + """ + Method to optimize hyperparameters by parameter grid. + Uses a grid of hyperparameters in the form of a dictionary present in config, + Trains each permutation of the grid of parameters and compares accuracy. + Sets model to the best parameters and returns highest accuracy. + + Parameters + ---------- + model : ModelContext + The Model which needs to be used. + + feature : Feature + The Target feature in the data. + + accuracy_scorer: AccuracyContext + The accuracy scorer that needs to be used. + + train_data: SourcesContext + The train_data to train models on with the hyperparameters provided. + + test_data : SourcesContext + The test_data to score against and optimize hyperparameters. + + Returns + ------- + float + The highest score value + """ + + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + + best_config = dict() + logging.info( + f"Optimizing model with parameter grid: {self.parent.config.parameters}" + ) + + names = list(self.parent.config.parameters.keys()) + logging.info(names) + + with model.parent.config.no_enforce_immutable(): + for _ in range(self.parent.config.trials): + combination = [] + for pvs in self.parent.config.parameters.values(): + combination.append(random.choice(pvs)) + logging.info(combination) + + for i in range(len(combination)): + param = names[i] + setattr(model.parent.config, names[i], combination[i]) + await train(model.parent, *train_data) + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + + logging.info(f"Accuracy of the tuned model: {acc}") + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + for param in names: + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + logging.info(f"\nOptimal Hyper-parameters: {best_config}") + logging.info(f"Accuracy of Optimized model: {highest_acc}") + return highest_acc + + +@entrypoint("random_search") +class RandomSearch(Tuner): + + CONFIG = RandomSearchConfig + CONTEXT = RandomSearchContext diff --git a/model/tensorflow/examples/parameters.json b/model/tensorflow/examples/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow/examples/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow/tests/test_dnnc.py b/model/tensorflow/tests/test_dnnc.py index 9178dd2ff0..85e11825a6 100644 --- a/model/tensorflow/tests/test_dnnc.py +++ b/model/tensorflow/tests/test_dnnc.py @@ -2,7 +2,7 @@ import pathlib import tempfile -from dffml import train, predict, score +from dffml import train, predict, score, tune from dffml.record import Record from 
dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig @@ -10,6 +10,7 @@ from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnc import ( DNNClassifierModel, @@ -84,6 +85,7 @@ async def test_config(self): async def test_model(self): scorer = ClassificationAccuracy() + tuner = ParameterGrid(parameters={"epochs":[20,30]}, objective="max") for i in range(0, 7): await train(self.model, self.sources) res = await score( @@ -98,7 +100,11 @@ async def test_model(self): location=self.model_dir.name ) continue + res_tune = await tune( + self.model, tuner, scorer, Feature("string", str, 1), [self.sources], [self.sources] + ) self.assertGreater(res, 0.9) + self.assertGreater(res_tune, 0.9) a = Record("a", data={"features": {self.feature.name: 1}}) target_name = self.model.config.predict.name res = [ diff --git a/model/tensorflow/tests/test_dnnr.py b/model/tensorflow/tests/test_dnnr.py index 145337b74e..3074b0ae35 100644 --- a/model/tensorflow/tests/test_dnnr.py +++ b/model/tensorflow/tests/test_dnnr.py @@ -4,14 +4,16 @@ import numpy as np -from dffml import train, score, predict +from dffml import train, score, predict, tune from dffml.record import Record from dffml.source.source import Sources from dffml.accuracy import MeanSquaredErrorAccuracy from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.tuner.parameter_grid import ParameterGrid from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Feature, Features +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnr import ( DNNRegressionModel, @@ -98,6 +100,7 @@ async def test_model(self): }, ) target_name = self.model.config.predict.name + tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="min") scorer = MeanSquaredErrorAccuracy() for i in range(0, 7): await train(self.model, self.sources) @@ -113,7 +116,11 @@ async def test_model(self): location=pathlib.Path(self.model_dir.name) ) continue + res_tune = await tune( + self.model, tuner, scorer, Feature("TARGET", float, 1), [self.sources], [self.sources] + ) self.assertGreater(res, 0.0) + self.assertGreater(res_tune, 0.0) res = [ record async for record in predict(self.model, a, keep_record=True) diff --git a/model/tensorflow/tests/test_tf_integration.py b/model/tensorflow/tests/test_tf_integration.py index 9a39650e4c..1a435b2bd3 100644 --- a/model/tensorflow/tests/test_tf_integration.py +++ b/model/tensorflow/tests/test_tf_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. 
""" +import os import csv import pathlib @@ -190,6 +191,33 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/parameters.json") + # Tune model + await CLI.cli( + "accuracy", + "-model", + "tfdnnr", + *features, + "-model-predict", + "true_target:float:1", + "-model-location", + model_dir, + "-features", + "true_target:float:1", + "-scorer", + "mse", + "-tuner", + "parameter_grid" + "-tuner-parameters", + "@" + str(param_path), + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow_hub/tests/test_model.py b/model/tensorflow_hub/tests/test_model.py index b7ffca4e83..8c072535b9 100644 --- a/model/tensorflow_hub/tests/test_model.py +++ b/model/tensorflow_hub/tests/test_model.py @@ -2,7 +2,8 @@ import tempfile from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune +from dffml.tuner.parameter_grid import ParameterGrid from dffml.source.source import Sources from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Features, Feature @@ -47,6 +48,7 @@ def setUpClass(cls): ) ) cls.scorer = TextClassifierAccuracy() + cls.tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="max") @classmethod def tearDownClass(cls): @@ -63,6 +65,7 @@ async def test_01_accuracy(self): ) self.assertGreater(res, 0) + async def test_02_predict(self): async with self.sources as sources, self.model as model: target_name = model.config.predict.name @@ -71,6 +74,12 @@ async def test_02_predict(self): prediction = record.prediction(target_name).value self.assertIn(prediction, ["0", "1"]) + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", int, 1), [self.sources], [self.sources] + ) + self.assertGreater(res, 0) + # Randomly generate sample data POSITIVE_WORDS = ["fun", "great", "cool", "awesome", "rad"] diff --git a/model/tensorflow_hub/tests/test_tfhub_integration.py b/model/tensorflow_hub/tests/test_tfhub_integration.py index 535e10aeed..9cc88e3203 100644 --- a/model/tensorflow_hub/tests/test_tfhub_integration.py +++ b/model/tensorflow_hub/tests/test_tfhub_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. 
""" +import os import csv import json import random @@ -117,6 +118,40 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/tfhub_text_classifier/parameters.json") + # Tune model + await CLI.cli( + "tune", + "-model", + "text_classifier", + *features, + "-model-predict", + "sentiment:int:1", + "-model-location", + model_dir, + "-model-classifications", + "0", + "1", + "-model-clstype", + "int", + "-features", + "sentiment:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-scorer", + "textclf", + "-tuner", + "parameter_grid", + "-tuner-parameters", + "@" + str(param_path) + + ) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/vowpalWabbit/tests/test_vw.py b/model/vowpalWabbit/tests/test_vw.py index 3d9167d7d5..83af3f5999 100644 --- a/model/vowpalWabbit/tests/test_vw.py +++ b/model/vowpalWabbit/tests/test_vw.py @@ -4,13 +4,14 @@ from sklearn.datasets import make_friedman1 from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import Feature, Features from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import MeanSquaredErrorAccuracy from dffml_model_vowpalWabbit.vw_base import VWModel, VWConfig +from dffml.tuner.parameter_grid import ParameterGrid class TestVWModel(AsyncTestCase): @@ -73,6 +74,7 @@ def setUpClass(cls): ) ) cls.scorer = MeanSquaredErrorAccuracy() + cls.tuner = ParameterGrid(parameters={}, objective="min") @classmethod def tearDownClass(cls): @@ -96,6 +98,12 @@ async def test_02_predict(self): async for record in mctx.predict(sctx): prediction = record.prediction(target).value self.assertTrue(isinstance(prediction, float)) + + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", float, 1), [self.sources], [self.sources] + ) + self.assertTrue(isinstance(res, float)) DATA_LEN = 500 diff --git a/model/vowpalWabbit/tests/test_vw_integration.py b/model/vowpalWabbit/tests/test_vw_integration.py index b8e1874d36..1b2a70db98 100644 --- a/model/vowpalWabbit/tests/test_vw_integration.py +++ b/model/vowpalWabbit/tests/test_vw_integration.py @@ -99,6 +99,37 @@ async def test_run(self): "-source-filename", data_filename, ) + + # Tune model + await CLI.cli( + "tune", + "-model", + "vwmodel", + *features, + "-model-predict", + "true_class:int:1", + "-model-vwcmd", + "binary", + "True", + "-model-use_binary_label", + "-model-location", + model_dir, + "-scorer", + "mse", + "-features", + "true_class:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-tuner", + "parameter_grid", + "-tuner-objective", + "min" + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/setup.py b/setup.py index c4a9003008..6970a86447 100644 --- a/setup.py +++ b/setup.py @@ -172,6 +172,7 @@ class InstallException(Exception): # Tuner "dffml.tuner": [ "parameter_grid = dffml.tuner.parameter_grid:ParameterGrid", + "random_search = dffml.tuner.random_search:RandomSearch", ], }, ) diff --git a/tuner/bayes_opt_gp/.coveragerc b/tuner/bayes_opt_gp/.coveragerc new file mode 100644 index 
0000000000..4cf9aab94b --- /dev/null +++ b/tuner/bayes_opt_gp/.coveragerc @@ -0,0 +1,13 @@ +[run] +source = + dffml_tuner_bayes_opt_gp + tests +branch = True + +[report] +exclude_lines = + no cov + no qa + noqa + pragma: no cover + if __name__ == .__main__.: diff --git a/tuner/bayes_opt_gp/.gitignore b/tuner/bayes_opt_gp/.gitignore new file mode 100644 index 0000000000..070ee81c83 --- /dev/null +++ b/tuner/bayes_opt_gp/.gitignore @@ -0,0 +1,20 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.egss +.mypy_cache/ +*.swp +.venv/ +.eggs/ +*.modeldir +*.db +htmlcov/ diff --git a/tuner/bayes_opt_gp/LICENSE b/tuner/bayes_opt_gp/LICENSE new file mode 100644 index 0000000000..456e449824 --- /dev/null +++ b/tuner/bayes_opt_gp/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2020 Intel, Oliver O'Brien + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/tuner/bayes_opt_gp/MANIFEST.in b/tuner/bayes_opt_gp/MANIFEST.in new file mode 100644 index 0000000000..19f3196490 --- /dev/null +++ b/tuner/bayes_opt_gp/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include LICENSE +include setup_common.py diff --git a/tuner/bayes_opt_gp/README.md b/tuner/bayes_opt_gp/README.md new file mode 100644 index 0000000000..fbb5511412 --- /dev/null +++ b/tuner/bayes_opt_gp/README.md @@ -0,0 +1,15 @@ +# DFFML XGBoost Models + +## About + +dffml_tuner_bayes_opt_gp is a Bayesian Optimization tuner. +![Bayesian Optimization](https://github.com/fmfn/BayesianOptimization) + +## Documentation + +Documentation is hosted at https://intel.github.io/dffml/plugins/dffml_model.html#dffml-tuner-bayes-opt-gp + +## License + +dffml_tuner_bayes_opt_gp Tuners are distributed under the terms of the +[MIT License](LICENSE). 
\ No newline at end of file
diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py
new file mode 100644
index 0000000000..3a6f48b9a0
--- /dev/null
+++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py
@@ -0,0 +1,161 @@
+from typing import Union, Dict, Any, List
+import itertools
+import logging
+
+from dffml.base import (
+    config,
+    field,
+)
+from dffml.noasync import train, score
+from dffml.tuner import Tuner, TunerContext
+from dffml.util.entrypoint import entrypoint
+from dffml.record import Record
+from dffml.source.source import BaseSource
+from dffml.accuracy import AccuracyScorer, AccuracyContext
+from dffml.model import ModelContext
+from dffml.feature import Feature
+import nest_asyncio
+from bayes_opt import BayesianOptimization
+
+
+class InvalidParametersException(Exception):
+    pass
+
+
+@config
+class BayesOptGPConfig:
+    parameters: dict = field(
+        "Parameters to be optimized", default_factory=lambda: dict()
+    )
+    objective: str = field(
+        "How to optimize the given scorer. Values are min/max", default="max"
+    )
+    init_points: int = field(
+        "How many steps of random exploration you want to perform.", default=5
+    )
+    n_iter: int = field(
+        "How many steps of bayesian optimization you want to perform.",
+        default=10,
+    )
+
+
+class BayesOptGPContext(TunerContext):
+    """
+    Bayesian Optimization GP Tuner
+    """
+
+    async def optimize(
+        self,
+        model: ModelContext,
+        feature: Feature,
+        accuracy_scorer: Union[AccuracyScorer, AccuracyContext],
+        train_data: Union[BaseSource, Record, Dict[str, Any]],
+        test_data: Union[BaseSource, Record, Dict[str, Any]],
+    ):
+        """
+        Method to optimize hyperparameters by Bayesian optimization using Gaussian Processes
+        as the surrogate model.
+        Uses a grid of hyperparameters in the form of a dictionary present in config,
+        Trains each permutation of the grid of parameters and compares accuracy.
+        Sets model to the best parameters and returns highest accuracy.
+
+        Parameters
+        ----------
+        model : ModelContext
+            The Model which needs to be used.
+
+        feature : Feature
+            The Target feature in the data.
+
+        accuracy_scorer: AccuracyContext
+            The accuracy scorer that needs to be used.
+
+        train_data: SourcesContext
+            The train_data to train models on with the hyperparameters provided.
+
+        sources : SourcesContext
+            The test_data to score against and optimize hyperparameters.
+
+        Returns
+        -------
+        float
+            The highest score value
+        """
+
+        nest_asyncio.apply()
+
+        def check_parameters(pars):
+            for (pax, vals) in pars.items():
+                if len(vals) != 2:
+                    raise InvalidParametersException(
+                        f"2 values are not provided for parameter {pax}"
+                    )
+                for val in vals:
+                    if not type(val) is float and not type(val) is int:
+                        raise InvalidParametersException(
+                            f"Parameter {pax} is not of type int or float."
+ ) + return True + + check_parameters(self.parent.config.parameters) + + logging.info( + f"Optimizing model with Bayesian optimization with gaussian processes: {self.parent.config.parameters}" + ) + + def func(**vals): + with model.parent.config.no_enforce_immutable(): + for param in vals.keys(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(vals[param])) + else: + setattr(model.parent.config, param, vals[param]) + + train(model.parent, *train_data) + acc = score(model.parent, accuracy_scorer, feature, *test_data) + + if self.parent.config.objective == "min": + return -acc + elif self.parent.config.objective == "max": + return acc + + optimizer = BayesianOptimization( + f=func, + pbounds=self.parent.config.parameters, + random_state=1, + ) + + optimizer.maximize( + init_points=self.parent.config.init_points, + n_iter=self.parent.config.n_iter, + ) + with model.parent.config.no_enforce_immutable(): + for (param, val) in optimizer.max["params"].items(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(val)) + else: + setattr(model.parent.config, param, val) + + train(model.parent, *train_data) + + if self.parent.config.objective == "min": + return -optimizer.max["target"] + elif self.parent.config.objective == "max": + return optimizer.max["target"] + + +@entrypoint("bayes_opt_gp") +class BayesOptGP(Tuner): + + CONFIG = BayesOptGPConfig + CONTEXT = BayesOptGPContext diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py new file mode 100644 index 0000000000..8c1177b5a4 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py @@ -0,0 +1,105 @@ +from doctest import testsource +import os +from pyexpat import features +import sys +import random +import tempfile +import subprocess + +import numpy as np +from sklearn.metrics import f1_score + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, predict, tune, run_consoletest +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy + +from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, +) + +from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP + + + +class TestXGBClassifier(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.features = Features( + Feature("Feature1", float, 1), Feature("Feature2") + ) + cls.model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + cls.tuner = BayesOptGP( + parameters= + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + }, + 
objective="max", + init_points=5, + n_iter=10 + ) + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]) + // 2, + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + cls.scorer = ClassificationAccuracy() + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_train(self): + # Train the model on the training data + await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.trainingsource], + [self.testsource], + ) + + + + +class TestXGBClassifierDocstring(AsyncTestCase): + async def test_docstring(self): + await run_consoletest(XGBClassifierModel) diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py new file mode 100644 index 0000000000..6cd4920a75 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py @@ -0,0 +1,101 @@ +import random +import pathlib +import tempfile + +import numpy as np + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, predict, run_consoletest +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.accuracy import MeanSquaredErrorAccuracy +from dffml.source.memory import MemorySource, MemorySourceConfig + + +from dffml_model_xgboost.xgbregressor import ( + XGBRegressorModel, + XGBRegressorModelConfig, +) + + +class TestXGBRegressor(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.model = XGBRegressorModel( + XGBRegressorModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + # Generating data f(x1,x2) = 2*x1 + 3*x2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": 2 * _temp_data[0][i] + 3 * _temp_data[1][i], + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_train(self): + # Train the model on the training data + await train(self.model, self.trainingsource) + + async def test_01_accuracy(self): + scorer = MeanSquaredErrorAccuracy() + # Use the test data to assess the model's accuracy + res = await score( + self.model, scorer, Feature("Target", float, 1), self.testsource + ) + # Ensure the accuracy is above 80% + self.assertTrue(res) + + 
async def test_02_predict(self): + # Get the prediction for each piece of test data + async for i, features, prediction in predict( + self.model, self.testsource + ): + # Grab the correct value + correct = features["Target"] + # Grab the predicted value + prediction = prediction["Target"]["value"] + # Check that the prediction is within 30% error of the actual value + error = abs((prediction - correct) / correct) + + acceptable = 0.5 + # Sometimes causes an issue when only one data point anomalously has high error + self.assertLess(error, acceptable) + + +class TestXGBClassifierDocstring(AsyncTestCase): + async def test_docstring(self): + await run_consoletest( + XGBRegressorModel, + docs_root_dir=pathlib.Path(__file__).parents[3] / "docs", + ) diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py new file mode 100644 index 0000000000..1cf6267ae5 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py @@ -0,0 +1 @@ +VERSION = "0.1.0" diff --git a/tuner/bayes_opt_gp/pyproject.toml b/tuner/bayes_opt_gp/pyproject.toml new file mode 100644 index 0000000000..8b9d32fa10 --- /dev/null +++ b/tuner/bayes_opt_gp/pyproject.toml @@ -0,0 +1,20 @@ +[tool.black] +line-length = 79 +target-version = ['py37'] + +exclude = ''' +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + ) +) +''' diff --git a/tuner/bayes_opt_gp/setup.cfg b/tuner/bayes_opt_gp/setup.cfg new file mode 100644 index 0000000000..00a065a39a --- /dev/null +++ b/tuner/bayes_opt_gp/setup.cfg @@ -0,0 +1,10 @@ +[options] +zip_safe = False +include_package_data = True +packages = find: +install_requires = + dffml>=0.4.0 + bayesian-optimization>=1.2.0 + pandas>=0.25.0 + scikit-learn>=0.22.0 + joblib>=0.16.0 \ No newline at end of file diff --git a/tuner/bayes_opt_gp/setup.py b/tuner/bayes_opt_gp/setup.py new file mode 100644 index 0000000000..d38d37ea92 --- /dev/null +++ b/tuner/bayes_opt_gp/setup.py @@ -0,0 +1,19 @@ +import os +import sys +import site +import importlib.util +from setuptools import setup + +# See https://github.com/pypa/pip/issues/7953 +site.ENABLE_USER_SITE = "--user" in sys.argv[1:] + +# Boilerplate to load commonalities +spec = importlib.util.spec_from_file_location( + "setup_common", os.path.join(os.path.dirname(__file__), "setup_common.py") +) +common = importlib.util.module_from_spec(spec) +spec.loader.exec_module(common) + +common.KWARGS["entry_points"] = {"dffml.tuner": [f"bayes_opt_gp = {common.IMPORT_NAME}.bayes_opt_gp:BayesOptGP"]} + +setup(**common.KWARGS) diff --git a/tuner/bayes_opt_gp/setup_common.py b/tuner/bayes_opt_gp/setup_common.py new file mode 100644 index 0000000000..7dfb09b35c --- /dev/null +++ b/tuner/bayes_opt_gp/setup_common.py @@ -0,0 +1,55 @@ +import os +import sys +import ast +from pathlib import Path + +ORG = "dffml" +NAME = "dffml-tuner-bayes-opt-gp" +DESCRIPTION = "DFFML model dffml-tuner-bayes-opt-gp" +AUTHOR_NAME = "Edison Siow" +AUTHOR_EMAIL = "edisonsiowxiong@gmail.com" + +IMPORT_NAME = ( + NAME + if "replace_package_name".upper() != NAME + else "replace_import_package_name".upper() +).replace("-", "_") + +SELF_PATH = Path(sys.argv[0]).parent.resolve() +if not (SELF_PATH / Path(IMPORT_NAME, "version.py")).is_file(): + SELF_PATH = os.path.dirname(os.path.realpath(__file__)) + +VERSION = ast.literal_eval( + Path(SELF_PATH, IMPORT_NAME, "version.py") + .read_text() + 
.split("=")[-1] + .strip() +) + +README = Path(SELF_PATH, "README.md").read_text() + +KWARGS = dict( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=README, + long_description_content_type="text/markdown", + author=AUTHOR_NAME, + author_email=AUTHOR_EMAIL, + maintainer=AUTHOR_NAME, + maintainer_email=AUTHOR_EMAIL, + url=f"https://github.com/{ORG}/{NAME}", + license="MIT", + keywords=["dffml"], + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + ], +) From d4ca3b206a1dc374526842d7b1a863be83c770b1 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 15 Jul 2022 14:05:17 +0800 Subject: [PATCH 06/14] Minor fixes and documentation --- dffml/high_level/ml.py | 37 ++-- dffml/tuner/parameter_grid.py | 1 - dffml/tuner/random_search.py | 2 +- docs/tutorials/tuners/bayes_opt_gp.rst | 162 ++++++++++++++++++ docs/tutorials/tuners/parameter_grid.rst | 162 ++++++++++++++++++ examples/rockpaperscissors/tune.sh | 6 +- .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 7 +- 7 files changed, 349 insertions(+), 28 deletions(-) create mode 100644 docs/tutorials/tuners/bayes_opt_gp.rst create mode 100644 docs/tutorials/tuners/parameter_grid.rst diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 43eb74569d..9ff57af182 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -387,25 +387,22 @@ async def tune( ] else: predict_feature = [model.config.predict.name] - - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in train_ds - ): - train_ds = list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *train_ds, - model=model, - ) - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in valid_ds - ): - valid_ds = list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *valid_ds, - model=model, - ) + + def records_to_dict_check(ds): + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in ds + ): + return list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *ds, + model=model, + ) + return ds + + train_ds = records_to_dict_check(train_ds) + valid_ds = records_to_dict_check(valid_ds) + async with contextlib.AsyncExitStack() as astack: # Open sources @@ -418,7 +415,7 @@ async def tune( elif isinstance(model, ModelContext): mctx = model - # Allow for keep models open + # Allow for scorers to be kept open if isinstance(accuracy_scorer, AccuracyScorer): accuracy_scorer = await astack.enter_async_context(accuracy_scorer) actx = await astack.enter_async_context(accuracy_scorer()) diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index 65cfdd3d1c..6bf1352b83 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -100,7 +100,6 @@ async def optimize( if self.parent.config.objective == "min": if acc < highest_acc: highest_acc = acc - elif self.parent.config.objective == "max": if acc > highest_acc: highest_acc = acc diff --git a/dffml/tuner/random_search.py b/dffml/tuner/random_search.py index e1df0f47bd..ca4ccef46c 100644 --- 
a/dffml/tuner/random_search.py +++ b/dffml/tuner/random_search.py @@ -76,7 +76,7 @@ async def optimize( best_config = dict() logging.info( - f"Optimizing model with parameter grid: {self.parent.config.parameters}" + f"Optimizing model with random search: {self.parent.config.parameters}" ) names = list(self.parent.config.parameters.keys()) diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst new file mode 100644 index 0000000000..fd88670da6 --- /dev/null +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -0,0 +1,162 @@ +Tuning a DFFML model with Bayesian Optimization +=============== + +For an introduction to hyperparameter tuning with the DFFML API, view the :ref:`parameter_grid` tutorial. + +For this tutorial, we'll be performing hyperparameter tuning using a BayesOptGP tuner, which is somewhat different +from the typical grid search/random search variants. As per normal, we will be using XGBClassifier as our model to +tune. + +Unlike grid search/random search, bayesian optimization is an intelligent hyperparameter selection process, +where the hyperparameters selected in the next iteration are dependent on the results of the previous iteration. +In the current iteration, the bayesian optimization process updates a surrogate model (which is a probability +distribution of scores | hypeparameters), selects a set of hyperparameters to maximize expected improvement of the +score based on this surrogate model, and repeats the process all over again. This allows one to efficiently search +the hyperparameter space, which is especially apt when the model to be tuned is expensive to evaluate. (For instance, +medium/large neural networks) + +The BayesOptGP tuner uses the BayesianOptimization library, which utilizes gaussian processes as the surrogate model, +hence the name of our tuner. + + +First, download the xgboost plugin for the DFFML library, which can be done via pip: + +.. code-block:: console + :test: + $ pip install -U dffml-model-xgboost + +We can utilize DFFML's tune method either via the Python API. In the following code, we demonstrate its usage in a Python +file: + +.. code-block:: console + :test: + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + + from dffml import Feature, Features + from dffml.noasync import tune + from dffml.accuracy import ClassificationAccuracy + from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP + from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, + ) + + iris = load_iris() + y = iris["target"] + X = iris["data"] + trainX, testX, trainy, testy = train_test_split( + X, y, test_size=0.1, random_state=123 + ) + + # Configure the model + model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features(Feature("data", float,)), + predict=Feature("target", float, 1), + location="model", + max_depth=3, + learning_rate=0.01, + n_estimators=200, + reg_lambda=1, + reg_alpha=0, + gamma=0, + colsample_bytree=0, + subsample=1, + ) + ) + + # Configure the tuner search space in a dictionary + # All combinations will be tried, even if the parameter's + # value has been set in the model. 
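+    # The tune() call below also needs an accuracy scorer; we assume the
+    # ClassificationAccuracy scorer imported above (any DFFML accuracy
+    # scorer could be substituted here).
+    scorer = ClassificationAccuracy()
+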
+ tuner = BayesOptGP( + parameters = { + "learning_rate": [0.01, 0.1], + "n_estimators": [20, 200], + "max_depth": [3,8] + + }, + objective = "max", + + ) + + # Tune function saves the best model and returns its score + print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) + ) + + +Note that because of its different nature, our BayesOptGP tuner only accepts a specific structure for its hyperparameter search +space configuration. For each hyperparameter, we accept two values representing the minimum and maximum bounds of that +hypeparameter which the tuner searches over. Also, Bayesian optimization only works on numerical hyperparameters ( +technically it should only work on floats, but we made some modfiications so it works on discrete values). This is because +the selection of the next set of hypeparameters derives from a closed-form integral which exepcts a continuous search space. + +Examples of non-legitimate hyperparameter configurations: + +.. code-block:: console + { + "learning_rate": [0.01, 0.1, 0.2], // too many values + "n_estimators": [20, 200], + "max_depth": [3] // too few values + + } + + +.. code-block:: console + { + "learning_rate": [0.01, 0.1], + "sampling_method": ["uniform", "gradient_based"], //no strings + "validate_parameters": [True, False] //no booleans + + } + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + { + "learning_rate": [0.01, 0.1], + "n_estimators": [20, 200], + "max_depth": [3,8] + } + +In the same folder, we perform the CLI tune command. + +.. code-block:: console + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner bayes_opt_gp \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/docs/tutorials/tuners/parameter_grid.rst b/docs/tutorials/tuners/parameter_grid.rst new file mode 100644 index 0000000000..8bd275f047 --- /dev/null +++ b/docs/tutorials/tuners/parameter_grid.rst @@ -0,0 +1,162 @@ +Tuning a DFFML model with ParameterGrid +=============== + +For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method. +We will be using the XGBClassifier model and ParameterGrid tuner for this example, but note that these are +interchangeale for any DFFML Model and Tuner respectively. + +As we know, a machine learning model yields accurate predictions to unseen data by fitting itself to the +training dataset. However, different initial configurations to certain model parameters will affect the performance +of the trained model. 
For instance, a neural network that is allowed to train for several epochs on a dataset +typically outperforms another that has only trained a single epoch. We call these parameters to be modified in +pre-training "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter +configuratons to find the best-performing model. + +This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your +behalf, and includes random search, grid search, bayesian optimization and more. Here, we will be using +ParameterGrid, otherwise known as grid search, where the tuner tries all possible combinations of hyperparameters +provided by the user, a selects the best model based on a given metric. We will be tuning for the XGBClassifier +model based on a dictionary of values provied in a JSON file, and return the one with the highest accuracy on a +holdout validation set. + +First, download the xgboost plugin for the DFFML library, which can be done via pip: + +.. code-block:: console + :test: + $ pip install -U dffml-model-xgboost + +We can utilize DFFML's tune method either via the Python API. In the following code, we demonstrate its usage in a Python +file: + +.. code-block:: console + :test: + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + + from dffml import Feature, Features + from dffml.noasync import tune + from dffml.accuracy import ClassificationAccuracy + from dffml.tuner.parameter_grid import ParameterGrid + from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, + ) + + iris = load_iris() + y = iris["target"] + X = iris["data"] + trainX, testX, trainy, testy = train_test_split( + X, y, test_size=0.1, random_state=123 + ) + + # Configure the model + model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features(Feature("data", float,)), + predict=Feature("target", float, 1), + location="model", + max_depth=3, + learning_rate=0.01, + n_estimators=200, + reg_lambda=1, + reg_alpha=0, + gamma=0, + colsample_bytree=0, + subsample=1, + ) + ) + + # Configure the tuner search space in a dictionary + # All combinations will be tried, even if the parameter's + # value has been set in the model. + tuner = ParameterGrid( + parameters = { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + + }, + objective = "max" + ) + + # Tune function saves the best model and returns its score + print("Tuning accuracy:", + tune( + model, + tuner, + scorer, + Feature("target", float, 1), + [{"data": x, "target": y} for x, y in zip(trainX, trainy)], + [{"data": x, "target": y} for x, y in zip(testX, testy)], + + ) + ) + +The tune function takes in 6 arguments: + + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + + scorer: Scorer + Method to evaluate the performance of the model, inheriting from AccuracyScorer + class. + + predict_feature: Union[Features, Feature] + A feature indicating the feature you wish to predict. + + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + valid_ds : list + Validation data for testing. 
Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. code-block:: console + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/examples/rockpaperscissors/tune.sh b/examples/rockpaperscissors/tune.sh index 39c78c2a79..e4613b7980 100644 --- a/examples/rockpaperscissors/tune.sh +++ b/examples/rockpaperscissors/tune.sh @@ -24,8 +24,4 @@ dffml tune \ -source-train-labels rock paper scissors \ -source-test-foldername rps-test-set/rps-test-set \ -source-test-feature image \ - -source-test-labels rock paper scissors \ - - - - \ No newline at end of file + -source-test-labels rock paper scissors \ \ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py index 3a6f48b9a0..1e3e3e2ef1 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -59,6 +59,11 @@ async def optimize( Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + Note that for this tuner, each hyperparameter field to be tuned must have exactly 2 values + specified, representing the minimum and maximum values in the search space for that + hyperparameter. Additionally, they must be either float/integer values. Otherwise, + an error is raised. 
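+
+        For example, a search space of the following shape would be accepted
+        (the parameter names and bounds here are purely illustrative)::
+
+            {"learning_rate": [0.01, 0.1], "n_estimators": [20, 200]}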
+ Parameters ---------- model : ModelContext @@ -83,7 +88,7 @@ async def optimize( """ nest_asyncio.apply() - + def check_parameters(pars): for (pax, vals) in pars.items(): if len(vals) != 2: From 54d54d54ca759c473dd07fe6f0d56e859c078c98 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 29 Jul 2022 11:02:35 +0800 Subject: [PATCH 07/14] Added requested changes --- .github/workflows/testing.yml | 2 + dffml/high_level/ml.py | 21 +---- dffml/util/internal.py | 12 +++ docs/tutorials/tuners/bayes_opt_gp.rst | 37 ++++----- docs/tutorials/tuners/parameter_grid.rst | 34 ++++---- .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 79 ++++++++++--------- .../tests/test_regressor_model.py | 1 - 7 files changed, 95 insertions(+), 91 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index fcbaeea27f..83153e606d 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -217,6 +217,8 @@ jobs: - docs/tutorials/models/slr.rst - docs/tutorials/sources/complex.rst - docs/tutorials/sources/file.rst + - docs/tutorials/tuner/parameter_grid.rst + - docs/tutorials/tuner/bayes_opt_gp.rst steps: - uses: actions/checkout@v2 diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 9ff57af182..9317f1a4e6 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -6,7 +6,7 @@ from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext -from ..util.internal import records_to_sources, list_records_to_dict +from ..util.internal import records_to_sources, list_records_to_dict, records_to_dict_check from ..accuracy.accuracy import AccuracyScorer, AccuracyContext from ..tuner import Tuner, TunerContext @@ -387,23 +387,10 @@ async def tune( ] else: predict_feature = [model.config.predict.name] - - def records_to_dict_check(ds): - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in ds - ): - return list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *ds, - model=model, - ) - return ds - - train_ds = records_to_dict_check(train_ds) - valid_ds = records_to_dict_check(valid_ds) - + train_ds = records_to_dict_check(train_ds, model, predict_feature) + valid_ds = records_to_dict_check(valid_ds, model, predict_feature) + async with contextlib.AsyncExitStack() as astack: # Open sources train = await astack.enter_async_context(records_to_sources(*train_ds)) diff --git a/dffml/util/internal.py b/dffml/util/internal.py index fcb4dd5255..e26a8698ab 100644 --- a/dffml/util/internal.py +++ b/dffml/util/internal.py @@ -72,3 +72,15 @@ def list_records_to_dict(features, *args, model=None): args[i] = dict(zip(features, args[i])) return args raise CannotConvertToRecord("Model does not exist!") + +def records_to_dict_check(ds, model, predict_feature): + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in ds + ): + return list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *ds, + model=model, + ) + return ds diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst index fd88670da6..005d7e0cb5 100644 --- a/docs/tutorials/tuners/bayes_opt_gp.rst +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -30,6 +30,7 @@ file: .. 
code-block:: console :test: + :filepath: bayes_opt_gp_xgboost.py from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split @@ -98,7 +99,7 @@ Note that because of its different nature, our BayesOptGP tuner only accepts a s space configuration. For each hyperparameter, we accept two values representing the minimum and maximum bounds of that hypeparameter which the tuner searches over. Also, Bayesian optimization only works on numerical hyperparameters ( technically it should only work on floats, but we made some modfiications so it works on discrete values). This is because -the selection of the next set of hypeparameters derives from a closed-form integral which exepcts a continuous search space. +the selection of the next set of hypeparameters derives from a closed-fm integral which exepcts a continuous search space. Examples of non-legitimate hyperparameter configurations: @@ -143,20 +144,20 @@ In the same folder, we perform the CLI tune command. .. code-block:: console $ dffml tune \ - -model xgbclassifier \ - -model-features \ - SepalLength:float:1 \ - SepalWidth:float:1 \ - PetalLength:float:1 \ - -model-predict classification \ - -model-location tempDir \ - -tuner bayes_opt_gp \ - -tuner-parameters @parameters.json \ - -tuner-objective max \ - -scorer clf \ - -sources train=csv test=csv \ - -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ - -source-train-tag train \ - -source-test-tag test \ - -features classification:int:1 \ No newline at end of file + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner bayes_opt_gp \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/docs/tutorials/tuners/parameter_grid.rst b/docs/tutorials/tuners/parameter_grid.rst index 8bd275f047..2b37a8daff 100644 --- a/docs/tutorials/tuners/parameter_grid.rst +++ b/docs/tutorials/tuners/parameter_grid.rst @@ -143,20 +143,20 @@ In the same folder, we perform the CLI tune command. .. 
code-block:: console $ dffml tune \ - -model xgbclassifier \ - -model-features \ - SepalLength:float:1 \ - SepalWidth:float:1 \ - PetalLength:float:1 \ - -model-predict classification \ - -model-location tempDir \ - -tuner parameter_grid \ - -tuner-parameters @parameters.json \ - -tuner-objective max \ - -scorer clf \ - -sources train=csv test=csv \ - -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ - -source-train-tag train \ - -source-test-tag test \ - -features classification:int:1 \ No newline at end of file + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py index 1e3e3e2ef1..d906574e69 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -1,12 +1,14 @@ from typing import Union, Dict, Any, List import itertools import logging +import functools from dffml.base import ( config, field, ) from dffml.noasync import train, score +from dffml.high_level.ml import train as async_train from dffml.tuner import Tuner, TunerContext from dffml.util.entrypoint import entrypoint from dffml.record import Record @@ -44,6 +46,41 @@ class BayesOptGPContext(TunerContext): Bayesian Optimization GP Tuner """ + def check_parameters(self, pars): + for (pax, vals) in pars.items(): + if len(vals) != 2: + raise InvalidParametersException( + f"2 values are not provided for parameter {pax}" + ) + for val in vals: + if not type(val) is float and not type(val) is int: + raise InvalidParametersException( + f"Parameter {pax} is not of type int or float." + ) + return True + + def obj_func(self, model, train_data, accuracy_scorer, feature, test_data, **vals): + + with model.parent.config.no_enforce_immutable(): + for param in vals.keys(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(vals[param])) + else: + setattr(model.parent.config, param, vals[param]) + + train(model.parent, *train_data) + acc = score(model.parent, accuracy_scorer, feature, *test_data) + + if self.parent.config.objective == "min": + return -acc + elif self.parent.config.objective == "max": + return acc + async def optimize( self, model: ModelContext, @@ -78,7 +115,7 @@ async def optimize( train_data: SourcesContext The train_data to train models on with the hyperparameters provided. - sources : SourcesContext + test_data : SourcesContext The test_data to score against and optimize hyperparameters. Returns @@ -89,48 +126,14 @@ async def optimize( nest_asyncio.apply() - def check_parameters(pars): - for (pax, vals) in pars.items(): - if len(vals) != 2: - raise InvalidParametersException( - f"2 values are not provided for parameter {pax}" - ) - for val in vals: - if not type(val) is float and not type(val) is int: - raise InvalidParametersException( - f"Parameter {pax} is not of type int or float." 
- ) - return True - - check_parameters(self.parent.config.parameters) + self.check_parameters(self.parent.config.parameters) logging.info( f"Optimizing model with Bayesian optimization with gaussian processes: {self.parent.config.parameters}" ) - def func(**vals): - with model.parent.config.no_enforce_immutable(): - for param in vals.keys(): - - if ( - hasattr(model.parent.config, param) - and model.parent.config.__annotations__[param].__name__ - == "int" - ): - setattr(model.parent.config, param, int(vals[param])) - else: - setattr(model.parent.config, param, vals[param]) - - train(model.parent, *train_data) - acc = score(model.parent, accuracy_scorer, feature, *test_data) - - if self.parent.config.objective == "min": - return -acc - elif self.parent.config.objective == "max": - return acc - optimizer = BayesianOptimization( - f=func, + f=functools.partial(self.obj_func, model, train_data, accuracy_scorer, feature, test_data), pbounds=self.parent.config.parameters, random_state=1, ) @@ -151,7 +154,7 @@ def func(**vals): else: setattr(model.parent.config, param, val) - train(model.parent, *train_data) + await async_train(model.parent, *train_data) if self.parent.config.objective == "min": return -optimizer.max["target"] diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py index 6cd4920a75..5c24190fda 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py @@ -1,4 +1,3 @@ -import random import pathlib import tempfile From 5a05c86aa1637a5a90455376aea34a61f1ccb447 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Sun, 31 Jul 2022 05:43:15 +0800 Subject: [PATCH 08/14] "minor doctest edits" --- docs/tutorials/tuners/bayes_opt_gp.rst | 4 + docs/tutorials/tuners/random_search.rst | 167 ++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 docs/tutorials/tuners/random_search.rst diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst index 005d7e0cb5..7ed9d94825 100644 --- a/docs/tutorials/tuners/bayes_opt_gp.rst +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -126,6 +126,7 @@ Command Line Usage First, we download the Iris dataset to the desired folder. .. code-block:: console + :test: $ wget http://download.tensorflow.org/data/iris_training.csv $ wget http://download.tensorflow.org/data/iris_test.csv $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv @@ -134,6 +135,8 @@ We create a JSON file with the hyperparameter search space: parameters.json .. code-block:: console + :test: + :filepath: parameters.json { "learning_rate": [0.01, 0.1], "n_estimators": [20, 200], @@ -143,6 +146,7 @@ parameters.json In the same folder, we perform the CLI tune command. .. code-block:: console + :test: $ dffml tune \ -model xgbclassifier \ -model-features \ diff --git a/docs/tutorials/tuners/random_search.rst b/docs/tutorials/tuners/random_search.rst new file mode 100644 index 0000000000..8a88562d7c --- /dev/null +++ b/docs/tutorials/tuners/random_search.rst @@ -0,0 +1,167 @@ +Tuning a DFFML model with Random Search +=============== + +For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method. 
+We will be using the XGBClassifier model and RandomSearch tuner for this example, but note that these are
+interchangeable with any DFFML Model and Tuner respectively.
+
+As we know, a machine learning model yields accurate predictions on unseen data by fitting itself to the
+training dataset. However, different initial configurations of certain model parameters will affect the performance
+of the trained model. For instance, a neural network that is allowed to train for several epochs on a dataset
+typically outperforms another that has only trained for a single epoch. These parameters, which are set before
+training, are called "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter
+configurations to find the best-performing model.
+
+This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your
+behalf; such methods include random search, grid search, Bayesian optimization and more. Here, we will be using
+RandomSearch, where the tuner tries random combinations of hyperparameters provided by the user for a fixed number of
+iterations, and selects the best model based on a given metric. We will be tuning the XGBClassifier
+model based on a dictionary of values provided in a JSON file, and selecting the configuration with the highest accuracy on a
+holdout validation set.
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API or the CLI. In the following code, we demonstrate its usage in a Python
+file:
+
+.. code-block:: console
+    :test:
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature, Features
+    from dffml.noasync import tune
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml.tuner.random_search import RandomSearch
+    from dffml_model_xgboost.xgbclassifier import (
+        XGBClassifierModel,
+        XGBClassifierModelConfig,
+    )
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+
+    # Configure the model
+    model = XGBClassifierModel(
+        XGBClassifierModelConfig(
+            features=Features(Feature("data", float,)),
+            predict=Feature("target", float, 1),
+            location="model",
+            max_depth=3,
+            learning_rate=0.01,
+            n_estimators=200,
+            reg_lambda=1,
+            reg_alpha=0,
+            gamma=0,
+            colsample_bytree=0,
+            subsample=1,
+        )
+    )
+
+    # Scorer used to evaluate each sampled configuration
+    scorer = ClassificationAccuracy()
+
+    # Configure the tuner search space in a dictionary.
+    # Random combinations of these values will be sampled for the given
+    # number of trials, even if a parameter's value has been set in the model.
+    tuner = RandomSearch(
+        parameters = {
+            "learning_rate": [0.01, 0.05, 0.1],
+            "n_estimators": [20, 100, 200],
+            "max_depth": [3,5,8]
+
+        },
+        objective = "max",
+        trials=15
+    )
+
+    # Tune function saves the best model and returns its score
+    print("Tuning accuracy:",
+        tune(
+            model,
+            tuner,
+            scorer,
+            Feature("target", float, 1),
+            [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+            [{"data": x, "target": y} for x, y in zip(testX, testy)],
+
+        )
+    )
+
+The tune function takes in 6 arguments:
+
+    model : Model
+        Machine Learning model to use. See :doc:`/plugins/dffml_model` for
+        models options.
+
+    tuner: Tuner
+        Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for
+        tuner options.
+ + scorer: Scorer + Method to evaluate the performance of the model, inheriting from AccuracyScorer + class. + + predict_feature: Union[Features, Feature] + A feature indicating the feature you wish to predict. + + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + :test: + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + :test: + :filepath: parameters.json + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. code-block:: console + :test: + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner random_search \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file From a31441188ab2bdd87ac2a7f27d79b2e31045b3fc Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 5 Aug 2022 10:15:48 +0800 Subject: [PATCH 09/14] "First iteration of AutoML model" --- dffml/model/automl.py | 162 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 dffml/model/automl.py diff --git a/dffml/model/automl.py b/dffml/model/automl.py new file mode 100644 index 0000000000..e13ecd1f44 --- /dev/null +++ b/dffml/model/automl.py @@ -0,0 +1,162 @@ +import pathlib +import os +import shutil +import tempfile +import contextlib +import pkg_resources +from typing import AsyncIterator, Tuple, Any, Type, List +from ..high_level.ml import tune +from ..base import config, field +from ..util.entrypoint import entrypoint +from .model import ModelNotTrained, ModelContext, SimpleModel, Model +from ..feature.feature import Feature, Features +from ..source.source import Sources, SourcesContext +from ..record import Record +from ..model.model import Model +from ..tuner.tuner import Tuner +from ..accuracy import AccuracyScorer + + +@config +class AutoMLModelConfig: + predict: Feature = field("Label or the value to be predicted") + features: Features = field("Features to train on.") + location: pathlib.Path = field("Location where state should be saved") + tuner: Tuner = field("Tuner to optimize hyperparameters with.") + scorer: AccuracyScorer = field("Scorer to evaluate and select best model.") + models: List[str] = field("List of models to tune and compare against", default_factory= lambda:list()) + objective: str = field( + "How to optimize 
the given scorer. Values are min/max", default="max" + ), + premodel: str = field("Type of model to predict with after training", default="None") + +@entrypoint("automl") +class AutoMLModel(SimpleModel): + r""" + AutoML model for automatic training and tuning based on target datasets and given + models and tuner. + + """ + # The configuration class needs to be set as the CONFIG property + + CONFIG: Type[AutoMLModelConfig] = AutoMLModelConfig + + def __init__(self, config) -> None: + super().__init__(config) + # The saved model + self.saved = None + + async def __aenter__(self): + await super().__aenter__() + + + self.model_classes = {} + # We want to allow users to not need to deal with individual model configuration. + # So we accept a list of strings and initialize our models based on that. + for ep in pkg_resources.iter_entry_points(group='dffml.model'): + if ep.name in self.parent.config.models or ep.name == self.parent.config.premodel: + self.model_classes.update({ep.name: ep.load()}) + + dest = pathlib.Path(self.parent.config.location) + + # This is for prediction. If a trained model exists in the target directory, we initialize + # a model of type premodel and use that for prediction. + if self.parent.config.premodel != "None" and dest.exists(): + model = self.model_classes[self.parent.config.premodel]( + location = dest, + features = self.parent.config.features, + predict = self.parent.config.predict + ) + async with contextlib.AsyncExitStack() as astack: + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + self.saved = mctx + self.is_trained = True + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + await super().__aexit__(exc_type, exc_value, traceback) + + async def train(self, sources: Sources) -> None: + # X and Y data + + tuner = self.parent.config.tuner + scorer = self.parent.config.scorer + features = self.parent.config.features + location = self.parent.config.location + source_files = sources[0] + + tuner.config.objective = self.parent.config.objective + + train_source = test_source = None + + + # Check for tags to determine train/test sets + for source in source_files: + + if hasattr(source, "tag") and source.tag == "train": + train_source = source + if hasattr(source, "tag") and source.tag == "test": + test_source = source + + if not train_source or not test_source: + # If tags not found, default to positional + if len(source_files) >= 2: + train_source = source_files[0] + test_source = source_files[1] + elif not train_source: + raise NotImplementedError("Train set not found.") + else: + raise NotImplementedError("Test set not found.") + + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + + dest = pathlib.Path(location) + # We clear the destination directory first, to avoid conflicts. 
+ if dest.exists() and dest.is_dir(): + shutil.rmtree(dest) + + + best_path = "" + temp_dirs = [] + + for model_name in self.parent.config.models: + dirpath = tempfile.mkdtemp() + temp_dirs.append(dirpath) + model = self.model_classes[model_name]( + location = dirpath, + features = features, + predict = self.parent.config.predict + ) + + + val = await tune(model, tuner, scorer, self.parent.config.predict, [train_source], [test_source]) + if self.parent.config.objective == "min" and val < highest_acc: + best_path = dirpath + elif self.parent.config.objective == "max" and val > highest_acc: + best_path = dirpath + + + shutil.copytree(best_path, dest) + + for td in temp_dirs: + shutil.rmtree(td) + + async def predict( + self, sources: SourcesContext + ) -> AsyncIterator[Tuple[Record, Any, float]]: + if not self.is_trained: + raise ModelNotTrained( + "Train the model first before getting predictions" + ) + # Use the child model API to make predictions + async for record in self.saved.predict(sources): + yield record diff --git a/setup.py b/setup.py index 6970a86447..fb02cb0dce 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ class InstallException(Exception): # Databases "dffml.db": ["sqlite = dffml.db.sqlite:SqliteDatabase"], # Models - "dffml.model": ["slr = dffml.model.slr:SLRModel"], + "dffml.model": ["slr = dffml.model.slr:SLRModel", "automl = dffml.model.automl:AutoMLModel"], # Secrets "dffml.secret": ["ini = dffml.secret.ini:INISecret"], # Accuracy From e397155556c824a466e046304c64b0e3b1e57fc3 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 5 Aug 2022 11:06:36 +0800 Subject: [PATCH 10/14] AutoML model iteration 1.5 --- dffml/model/automl.py | 48 ++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/dffml/model/automl.py b/dffml/model/automl.py index e13ecd1f44..02df896f25 100644 --- a/dffml/model/automl.py +++ b/dffml/model/automl.py @@ -27,8 +27,8 @@ class AutoMLModelConfig: models: List[str] = field("List of models to tune and compare against", default_factory= lambda:list()) objective: str = field( "How to optimize the given scorer. Values are min/max", default="max" - ), - premodel: str = field("Type of model to predict with after training", default="None") + ) + @entrypoint("automl") class AutoMLModel(SimpleModel): @@ -50,20 +50,24 @@ async def __aenter__(self): await super().__aenter__() - self.model_classes = {} + dest = pathlib.Path(self.parent.config.location) + best_path = dest / "best_model" + # Check if model has been trained, and if so, get the type of the model + best_model = best_model_path = None + if dest.exists() and best_path.exists() and len(os.listdir(best_path)): + best_model = os.listdir(best_path)[0] + # We want to allow users to not need to deal with individual model configuration. # So we accept a list of strings and initialize our models based on that. + self.model_classes = {} for ep in pkg_resources.iter_entry_points(group='dffml.model'): - if ep.name in self.parent.config.models or ep.name == self.parent.config.premodel: + if ep.name in self.parent.config.models or ep.name == best_model: self.model_classes.update({ep.name: ep.load()}) - dest = pathlib.Path(self.parent.config.location) - # This is for prediction. If a trained model exists in the target directory, we initialize - # a model of type premodel and use that for prediction. 
- if self.parent.config.premodel != "None" and dest.exists(): - model = self.model_classes[self.parent.config.premodel]( - location = dest, + if best_model: + model = self.model_classes[best_model]( + location = best_path / best_model, features = self.parent.config.features, predict = self.parent.config.predict ) @@ -81,7 +85,6 @@ async def __aexit__(self, exc_type, exc_value, traceback): await super().__aexit__(exc_type, exc_value, traceback) async def train(self, sources: Sources) -> None: - # X and Y data tuner = self.parent.config.tuner scorer = self.parent.config.scorer @@ -125,30 +128,29 @@ async def train(self, sources: Sources) -> None: shutil.rmtree(dest) - best_path = "" - temp_dirs = [] + best_path = best_name = "" for model_name in self.parent.config.models: - dirpath = tempfile.mkdtemp() - temp_dirs.append(dirpath) + model_dir = dest / model_name + model = self.model_classes[model_name]( - location = dirpath, + location = model_dir, features = features, predict = self.parent.config.predict ) - val = await tune(model, tuner, scorer, self.parent.config.predict, [train_source], [test_source]) if self.parent.config.objective == "min" and val < highest_acc: - best_path = dirpath + best_path = model_dir + best_name = model_name elif self.parent.config.objective == "max" and val > highest_acc: - best_path = dirpath + best_path = model_dir + best_name = model_name - - shutil.copytree(best_path, dest) + best_model_dir = dest / "best_model" / best_name + shutil.copytree(best_path, best_model_dir) - for td in temp_dirs: - shutil.rmtree(td) + async def predict( self, sources: SourcesContext From 8803e1e500e3193ea87e4aeb84b4f85fffac2a21 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Sun, 21 Aug 2022 05:03:22 +0800 Subject: [PATCH 11/14] "default and user-defined hyperparameters" --- dffml/model/automl.py | 47 +++++----- dffml/util/autodefault.json | 137 +++++++++++++++++++++++++++++ docs/tutorials/models/automl.rst | 142 +++++++++++++++++++++++++++++++ tests/model/test_automl.py | 126 +++++++++++++++++++++++++++ 4 files changed, 426 insertions(+), 26 deletions(-) create mode 100644 dffml/util/autodefault.json create mode 100644 docs/tutorials/models/automl.rst create mode 100644 tests/model/test_automl.py diff --git a/dffml/model/automl.py b/dffml/model/automl.py index 02df896f25..3e90943359 100644 --- a/dffml/model/automl.py +++ b/dffml/model/automl.py @@ -1,6 +1,7 @@ import pathlib import os import shutil +import json import tempfile import contextlib import pkg_resources @@ -28,6 +29,8 @@ class AutoMLModelConfig: objective: str = field( "How to optimize the given scorer. 
Values are min/max", default="max" ) + parameters: dict = field("Hyperparameter configuration of different models to optimize", default_factory= lambda:dict()), + use_default: bool = field("Whether or not to utilize DFFML's default hyperparameter settings for tuning", default=False) @entrypoint("automl") @@ -45,15 +48,14 @@ def __init__(self, config) -> None: super().__init__(config) # The saved model self.saved = None + self.forbidden = ["automl", "autosklearn"] async def __aenter__(self): - await super().__aenter__() - dest = pathlib.Path(self.parent.config.location) best_path = dest / "best_model" # Check if model has been trained, and if so, get the type of the model - best_model = best_model_path = None + best_model = None if dest.exists() and best_path.exists() and len(os.listdir(best_path)): best_model = os.listdir(best_path)[0] @@ -63,8 +65,8 @@ async def __aenter__(self): for ep in pkg_resources.iter_entry_points(group='dffml.model'): if ep.name in self.parent.config.models or ep.name == best_model: self.model_classes.update({ep.name: ep.load()}) - + # loading a trained model for prediction if best_model: model = self.model_classes[best_model]( location = best_path / best_model, @@ -90,30 +92,13 @@ async def train(self, sources: Sources) -> None: scorer = self.parent.config.scorer features = self.parent.config.features location = self.parent.config.location - source_files = sources[0] tuner.config.objective = self.parent.config.objective - - train_source = test_source = None - - # Check for tags to determine train/test sets - for source in source_files: - - if hasattr(source, "tag") and source.tag == "train": - train_source = source - if hasattr(source, "tag") and source.tag == "test": - test_source = source - - if not train_source or not test_source: - # If tags not found, default to positional - if len(source_files) >= 2: - train_source = source_files[0] - test_source = source_files[1] - elif not train_source: - raise NotImplementedError("Train set not found.") - else: - raise NotImplementedError("Test set not found.") + if self.parent.config.use_default: + pth = pathlib.Path(__file__).parents[1] / "util" / "autodefault.json" + with open(str(pth), "r") as tar: + self.parent.config.parameters = json.load(tar) if self.parent.config.objective == "min": highest_acc = float("inf") @@ -131,6 +116,9 @@ async def train(self, sources: Sources) -> None: best_path = best_name = "" for model_name in self.parent.config.models: + if model_name in self.forbidden: + print(f"{model_name} is a forbidden model. 
Skipping...") + continue model_dir = dest / model_name model = self.model_classes[model_name]( @@ -138,14 +126,21 @@ async def train(self, sources: Sources) -> None: features = features, predict = self.parent.config.predict ) + if model_name in self.parent.config.parameters: + tuner.config.parameters = self.parent.config.parameters[model_name] + else: + tuner.config.parameters = {} - val = await tune(model, tuner, scorer, self.parent.config.predict, [train_source], [test_source]) + val = await tune(model, tuner, scorer, self.parent.config.predict, sources, sources) + if self.parent.config.objective == "min" and val < highest_acc: best_path = model_dir best_name = model_name + highest_acc = val elif self.parent.config.objective == "max" and val > highest_acc: best_path = model_dir best_name = model_name + highest_acc = val best_model_dir = dest / "best_model" / best_name shutil.copytree(best_path, best_model_dir) diff --git a/dffml/util/autodefault.json b/dffml/util/autodefault.json new file mode 100644 index 0000000000..b4c1cfe816 --- /dev/null +++ b/dffml/util/autodefault.json @@ -0,0 +1,137 @@ +{ + + "xgbclassifier": {"learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8]}, + "scikitsvc": {"gamma": [0.001, 0.1], "C": [1, 10]}, + "daal4py":{}, + "pytorch":{}, + "automl":{}, + "slr":{}, + "anomalydetection":{}, + "scratchlgrsag":{}, + "xgbregressor": {"learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8]}, + "vwmodel":{}, + "scikitac":{}, + "scikitadaboost":{ + "base_estimator": ["DecisionTreeClassifier", "LogisticRegressor", "SVC"], + "n_estimators": [10, 50, 100, 500, 1000, 5000], + "learning_rate": [0.1, 0.5, 0.9, 1.5], + "tree_depth": [3,5,7,9] + + }, + "scikitap":{}, + "scikitard":{ + "alpha_1": [1e-5, 1e-6, 1e-7], + "alpha_2": [1e-5, 1e-6, 1e-7] + }, + "scikitbgc":{}, + "scikitbirch":{"threshold":[0.3,0.5,0.7]}, + "scikitbnb":{}, + "scikitbyr":{}, + "scikitdtc":{ + "criterion": ["gini", "entropy"], + "max_depth": [3,5,7,9] + }, + "scikitdtr":{ + "criterion": ["gini", "entropy"], + "max_depth": [3,5,7,9] + }, + "scikiteln":{ + "max_iter": [1, 5, 10], + "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] + + }, + "scikitetc":{ + "n_estimators": [10,50,100], + "criterion": ["mse", "mae"], + "max_depth": [2,8,16,32,50], + "min_sample_split": [2,4,6], + "min_sample_leaf": [1,2] + }, + "scikitgbc":{ + "learning_rates": [1, 0.5, 0.25, 0.1, 0.05, 0.01], + "n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200] + }, + "scikitgnb":{"var_smoothing":[1e-7,1e-8,1e-9]}, + "scikitgpc":{}, + "scikitgpr":{}, + "scikitkmeans":{ + "n_clusters":[5,10,15], + "tol":[1e-3, 1e-4, 1e-5] + }, + "scikitknn":{"n_neighbours": [2, 4, 8, 16, 32, 64, 128]}, + "scikitlars":{}, + "scikitlas":{}, + "scikitlda":{}, + "scikitlor":{}, + "scikitlr":{}, + "scikitmbkmeans":{}, + "scikitmlp":{ + "hidden_layer_sizes": [[50,50,50], [50,100,50], [100]], + "activation": ["tanh", "relu"], + "solver": ["sgd", "adam"], + "alpha": [0.0001, 0.05], + "learning_rate": ["constant","adaptive"] + + }, + "scikitmnb":{ + "alpha":[0,0.5,1.0] + }, + "scikitms":{}, + "scikitomp":{ + "n_nonzero_coefs":[0.1,0.2,0.3] + }, + "scikitoptics":{"min_samples":[5,10,15]}, + "scikitqda":{"reg_param": [0.1, 0.2, 0.3, 0.4, 0.5]}, + "scikitrfc":{ + "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], + "max_features": ["auto", "sqrt"], + "min_samples_leaf": [1, 2, 4], + "min_samples_split": [2, 5, 10], + "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000] + }, + 
"scikitridge":{ + "alpha_init":[1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.9], + "lambda_init": [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-9] + }, + "scikitrsc":{}, + "scikitsc":{}, + "alexnet":{"optimizer":["Adam", "AdamW"]}, + "densenet121":{"optimizer":["Adam", "AdamW"]}, + "densenet161":{"optimizer":["Adam", "AdamW"]}, + "densenet169":{"optimizer":["Adam", "AdamW"]}, + "densenet201":{"optimizer":["Adam", "AdamW"]}, + "googlenet":{"optimizer":["Adam", "AdamW"]}, + "inception_v3":{"optimizer":["Adam", "AdamW"]}, + "mnasnet0_5":{"optimizer":["Adam", "AdamW"]}, + "mnasnet1_0":{"optimizer":["Adam", "AdamW"]}, + "mobilenet_v2":{"optimizer":["Adam", "AdamW"]}, + "pytorchnet":{"optimizer":["Adam", "AdamW"]}, + "resnet101":{"optimizer":["Adam", "AdamW"]}, + "resnet152":{"optimizer":["Adam", "AdamW"]}, + "resnet18":{"optimizer":["Adam", "AdamW"]}, + "resnet34":{"optimizer":["Adam", "AdamW"]}, + "resnet50":{"optimizer":["Adam", "AdamW"]}, + "resnext101_32x8d":{"optimizer":["Adam", "AdamW"]}, + "resnext50_32x4d":{"optimizer":["Adam", "AdamW"]}, + "shufflenet_v2_x0_5":{"optimizer":["Adam", "AdamW"]}, + "shufflenet_v2_x1_0":{"optimizer":["Adam", "AdamW"]}, + "vgg11":{"optimizer":["Adam", "AdamW"]}, + "vgg11_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg13":{"optimizer":["Adam", "AdamW"]}, + "vgg13_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg16":{"optimizer":["Adam", "AdamW"]}, + "vgg16_bn":{"optimizer":["Adam", "AdamW"]}, + "vgg19":{"optimizer":["Adam", "AdamW"]}, + "vgg19_bn":{"optimizer":["Adam", "AdamW"]}, + "wide_resnet101_2":{"optimizer":["Adam", "AdamW"]}, + "wide_resnet50_2":{"optimizer":["Adam", "AdamW"]}, + "daal4pylr":{}, + "spacyner":{}, + "tfdnnc":{}, + "tfdnnr":{}, + "text_classifier":{} +} \ No newline at end of file diff --git a/docs/tutorials/models/automl.rst b/docs/tutorials/models/automl.rst new file mode 100644 index 0000000000..daebce9845 --- /dev/null +++ b/docs/tutorials/models/automl.rst @@ -0,0 +1,142 @@ +Using DFFML's AutoML model +============================== + +Automated Machine Learning, abbreiviated as AutoML, is a process that automates away the time-consuming and tedious +aspects of ML, by encapsulating common ML models and techniques within a single API. It allows users to approach ML +from a high-level persepctive, abstracting away the minutae of statistical modelling, democratizing ML for +both data scientists and citizenry alike. On the other hand, AutoML also provides users with a degree of flexibility +in the form of being able to select their preferred models and tuners, which maximizes the likelihood of discovering an +effective model within the search space. In this tutorial, we will see how DFFML's AutoML model can be utilized to yield +the most out of a dataset. + +AutoML is extremely simple to use. Simply provide your dataset, a list of models to iterate over, and a hyperparmater +tuning technique to optimize your models with. The AutoML model will iterate over all the models provided, saving the +model with the best results in the user-specified directory. + + +.. 
+.. code-block:: python
+    :test:
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature
+    from dffml.noasync import train, score
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml.tuner.parameter_grid import ParameterGrid
+    from dffml.model.automl import AutoMLModel
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+    scorer = ClassificationAccuracy()
+
+    # Configure the model
+    model = AutoMLModel(
+        predict="target",
+        features=["data"],
+        location="tempDir",
+        tuner = ParameterGrid(),
+        scorer = scorer,
+        models = ["xgbclassifier", "scikitsvc"],
+        objective="max",
+        parameters = {
+            "xgbclassifier": {
+                "learning_rate": [0.01, 0.05, 0.1],
+                "n_estimators": [20, 100, 200],
+                "max_depth": [3, 5, 8]
+            },
+            "scikitsvc": {
+                "gamma": [0.001, 0.1],
+                "C": [1, 10]
+            }
+        }
+    )
+
+    # Train the model. Note that unlike most other DFFML models, AutoML
+    # takes both a training set and a test set here, since the tuner needs
+    # a held-out set to compare the candidate models against.
+    train(model, [
+        [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+        [{"data": x, "target": y} for x, y in zip(testX, testy)]
+    ])
+
+    # Assess accuracy
+
+    print(
+        "Test accuracy:",
+        score(
+            model,
+            scorer,
+            Feature("target", float, 1),
+            *[{"data": x, "target": y} for x, y in zip(testX, testy)],
+        ),
+    )
+
+    print(
+        "Training accuracy:",
+        score(
+            model,
+            scorer,
+            Feature("target", float, 1),
+            *[{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+        ),
+    )
+
+
+Command Line Usage
+------------------
+
+First, download the Iris dataset into the desired folder and fix up its headers.
+
+.. code-block:: console
+
+    $ wget http://download.tensorflow.org/data/iris_training.csv
+    $ wget http://download.tensorflow.org/data/iris_test.csv
+    $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv
+
+Next, create a file named ``parameters.json`` containing the hyperparameter search space:
+
+.. code-block:: json
+
+    {
+        "xgbclassifier": {"learning_rate": [0.01, 0.05, 0.1],
+                          "n_estimators": [20, 100, 200],
+                          "max_depth": [3, 5, 8]},
+        "scikitsvc": {"gamma": [0.001, 0.1], "C": [1, 10]}
+    }
+
+Now, train the model:
+
+.. code-block:: console
+
+    $ dffml train \
+        -model automl \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -model-tuner parameter_grid \
+        -model-scorer clf \
+        -model-models xgbclassifier scikitsvc \
+        -model-parameters @parameters.json \
+        -model-objective max \
+        -sources train=csv \
+        -source-train-filename iris_training.csv
+
+Make predictions with the model:
+
+.. code-block:: console
+
+    $ dffml predict all \
+        -model automl \
+        -model-features \
+          SepalLength:float:1 \
+          SepalWidth:float:1 \
+          PetalLength:float:1 \
+        -model-predict classification \
+        -model-location tempDir \
+        -model-tuner parameter_grid \
+        -model-scorer clf \
+        -model-objective max \
+        -sources test=csv \
+        -source-test-filename iris_test.csv
diff --git a/tests/model/test_automl.py b/tests/model/test_automl.py
new file mode 100644
index 0000000000..130809c4d2
--- /dev/null
+++ b/tests/model/test_automl.py
@@ -0,0 +1,126 @@
+import os
+import random
+import tempfile
+import contextlib
+import subprocess
+import shutil
+
+
+import numpy as np
+
+from dffml.record import Record
+from dffml.source.source import Sources
+from dffml import train, score, chdir
+from dffml.util.asynctestcase import AsyncTestCase
+from dffml.feature.feature import Feature, Features
+from dffml.source.memory import MemorySource, MemorySourceConfig
+from dffml.accuracy import ClassificationAccuracy
+from dffml.tuner.parameter_grid import ParameterGrid
+from dffml.model.automl import AutoMLModel
+
+def sh_filepath(filename):
+    return os.path.join(os.path.dirname(__file__), filename)
+
+@contextlib.contextmanager
+def directory_with_csv_files():
+    with tempfile.TemporaryDirectory() as tempdir:
+        with chdir(tempdir):
+            subprocess.check_output(["bash", sh_filepath("../dataset_cls.sh")])
+            shutil.copy(
+                sh_filepath("xgbtest.json"), os.path.join(tempdir, "xgbtest.json"),
+            )
+            yield tempdir
+
+class TestAutoMLModel(AsyncTestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Create a temporary directory to store the trained model
+        cls.model_dir = tempfile.TemporaryDirectory()
+        # Create an instance of the model
+
+        # Generating data f(x1,x2) = (2*x1 + 3*x2)//2
+        _n_data = 2000
+        _temp_data = np.random.rand(2, _n_data)
+        cls.records = [
+            Record(
+                "x" + str(random.random()),
+                data={
+                    "features": {
+                        "Feature1": float(_temp_data[0][i]),
+                        "Feature2": float(_temp_data[1][i]),
+                        "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i])
+                        // 2,
+                    }
+                },
+            )
+            for i in range(0, _n_data)
+        ]
+
+        cls.trainingsource = Sources(
+            MemorySource(MemorySourceConfig(records=cls.records[:1800]))
+        )
+        cls.testsource = Sources(
+            MemorySource(MemorySourceConfig(records=cls.records[1800:]))
+        )
+
+        cls.scorer = ClassificationAccuracy()
+        cls.tuner = ParameterGrid()
+        cls.model = AutoMLModel(
+            predict="Target",
+            features=["Feature1", "Feature2"],
+            location=cls.model_dir.name,
+            tuner = cls.tuner,
+            scorer = cls.scorer,
+            models = ["xgbclassifier", "scikitsvc"],
+            objective="max",
+            parameters = {
+                "xgbclassifier": {
+                    "learning_rate": [0.01, 0.05, 0.1],
+                    "n_estimators": [20, 100, 200],
+                    "max_depth": [3, 5, 8]
+                },
+                "scikitsvc": {
+                    "gamma": [0.001, 0.1],
+                    "C": [1, 10]
+                }
+            }
+
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        # Remove the temporary directory where the model was stored to cleanup
+        cls.model_dir.cleanup()
+
+
+    async def test_00_train(self):
+        await train(self.model, self.trainingsource)
+
+
+    async def test_01_score(self):
+        # Use the test data to assess the model's 
accuracy + res = await score( + self.model, self.scorer, Feature("Target", float, 1), self.testsource + ) + # Ensure the accuracy is above 80% + print(res) + self.assertTrue(res > 0.8) + + async def test_02_predict(self): + # reduce overfitting + res_train = await score( + self.model, + self.scorer, + Feature("Target", float, 1), + self.trainingsource, + ) + + res_test = await score( + self.model, + self.scorer, + Feature("Target", float, 1), + self.testsource, + ) + # Test fails if the difference between training and testing is more that 5% + self.assertLess(res_train - res_test, 0.05) + From abd894ed269bb19d11c7f7e4ff1a63ff9b19a3ed Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 2 Sep 2022 06:40:30 +0800 Subject: [PATCH 12/14] "validation set splitting for automl tuning" --- dffml/model/automl.py | 34 +++++++++++++++++++++++++--------- tests/model/test_automl.py | 3 ++- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/dffml/model/automl.py b/dffml/model/automl.py index 3e90943359..1070ca0c36 100644 --- a/dffml/model/automl.py +++ b/dffml/model/automl.py @@ -5,6 +5,8 @@ import tempfile import contextlib import pkg_resources +import numpy as np +from sklearn.model_selection import train_test_split from typing import AsyncIterator, Tuple, Any, Type, List from ..high_level.ml import tune from ..base import config, field @@ -29,8 +31,10 @@ class AutoMLModelConfig: objective: str = field( "How to optimize the given scorer. Values are min/max", default="max" ) - parameters: dict = field("Hyperparameter configuration of different models to optimize", default_factory= lambda:dict()), + parameters: dict = field("Hyperparameter configuration of different models to optimize", default_factory= lambda:dict()) use_default: bool = field("Whether or not to utilize DFFML's default hyperparameter settings for tuning", default=False) + split_data: bool = field("Whether or not to split data when performing tuning. Assumes dataset is in a tabular format.", default=False) + split_ratio: float = field("The percentage of records in the train set, if splitting is performed.", default=0.8) @entrypoint("automl") @@ -46,7 +50,6 @@ class AutoMLModel(SimpleModel): def __init__(self, config) -> None: super().__init__(config) - # The saved model self.saved = None self.forbidden = ["automl", "autosklearn"] @@ -88,6 +91,7 @@ async def __aexit__(self, exc_type, exc_value, traceback): async def train(self, sources: Sources) -> None: + tuner = self.parent.config.tuner scorer = self.parent.config.scorer features = self.parent.config.features @@ -103,7 +107,7 @@ async def train(self, sources: Sources) -> None: if self.parent.config.objective == "min": highest_acc = float("inf") elif self.parent.config.objective == "max": - highest_acc = -1 + highest_acc = -float("inf") else: raise NotImplementedError('Objective must be either "min" or "max".') @@ -111,7 +115,18 @@ async def train(self, sources: Sources) -> None: # We clear the destination directory first, to avoid conflicts. 
if dest.exists() and dest.is_dir(): shutil.rmtree(dest) - + + train_source = test_source = None + + if self.parent.config.split_data: + data = [] + async for record in sources.with_features( + self.features + [self.parent.config.predict.name] + ): + data.append(record) + train_source, test_source = train_test_split(data, train_size=self.parent.config.split_ratio) + else: + train_source = test_source = sources best_path = best_name = "" @@ -131,20 +146,21 @@ async def train(self, sources: Sources) -> None: else: tuner.config.parameters = {} - val = await tune(model, tuner, scorer, self.parent.config.predict, sources, sources) + value = await tune(model, tuner, scorer, self.parent.config.predict, train_source, test_source) - if self.parent.config.objective == "min" and val < highest_acc: + if self.parent.config.objective == "min" and value < highest_acc: best_path = model_dir best_name = model_name - highest_acc = val - elif self.parent.config.objective == "max" and val > highest_acc: + highest_acc = value + elif self.parent.config.objective == "max" and value > highest_acc: best_path = model_dir best_name = model_name - highest_acc = val + highest_acc = value best_model_dir = dest / "best_model" / best_name shutil.copytree(best_path, best_model_dir) + async def predict( diff --git a/tests/model/test_automl.py b/tests/model/test_automl.py index 130809c4d2..ee0448746f 100644 --- a/tests/model/test_automl.py +++ b/tests/model/test_automl.py @@ -83,7 +83,8 @@ def setUpClass(cls): "gamma": [0.001, 0.1], "C": [1, 10] } - } + }, + split_data = True ) From a26424b766f30afcaad6a81f4b9505ab9ce1eb99 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Sat, 17 Sep 2022 04:34:07 +0800 Subject: [PATCH 13/14] "removed scikit dependency" --- dffml/model/automl.py | 8 ++++++-- tests/model/test_automl.py | 7 +++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dffml/model/automl.py b/dffml/model/automl.py index 1070ca0c36..0612b2a086 100644 --- a/dffml/model/automl.py +++ b/dffml/model/automl.py @@ -6,7 +6,7 @@ import contextlib import pkg_resources import numpy as np -from sklearn.model_selection import train_test_split + from typing import AsyncIterator, Tuple, Any, Type, List from ..high_level.ml import tune from ..base import config, field @@ -124,7 +124,11 @@ async def train(self, sources: Sources) -> None: self.features + [self.parent.config.predict.name] ): data.append(record) - train_source, test_source = train_test_split(data, train_size=self.parent.config.split_ratio) + train_len = int(self.parent.config.split_ratio * len(data)) + train_source = data[:train_len] + test_source = data[train_len:] + + else: train_source = test_source = sources diff --git a/tests/model/test_automl.py b/tests/model/test_automl.py index ee0448746f..3299940b52 100644 --- a/tests/model/test_automl.py +++ b/tests/model/test_automl.py @@ -103,12 +103,11 @@ async def test_01_score(self): res = await score( self.model, self.scorer, Feature("Target", float, 1), self.testsource ) - # Ensure the accuracy is above 80% - print(res) + self.assertTrue(res > 0.8) async def test_02_predict(self): - # reduce overfitting + res_train = await score( self.model, self.scorer, @@ -122,6 +121,6 @@ async def test_02_predict(self): Feature("Target", float, 1), self.testsource, ) - # Test fails if the difference between training and testing is more that 5% + self.assertLess(res_train - res_test, 0.05) From eaa14bfeaefa681930913c998ec29db13c654129 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Mon, 19 Sep 2022 16:06:02 +0800 
Subject: [PATCH 14/14] "removed extraneous test file" --- .../tests/test_regressor_model.py | 100 ------------------ 1 file changed, 100 deletions(-) delete mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py deleted file mode 100644 index 5c24190fda..0000000000 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py +++ /dev/null @@ -1,100 +0,0 @@ -import pathlib -import tempfile - -import numpy as np - -from dffml.record import Record -from dffml.source.source import Sources -from dffml import train, score, predict, run_consoletest -from dffml.util.asynctestcase import AsyncTestCase -from dffml.feature.feature import Feature, Features -from dffml.accuracy import MeanSquaredErrorAccuracy -from dffml.source.memory import MemorySource, MemorySourceConfig - - -from dffml_model_xgboost.xgbregressor import ( - XGBRegressorModel, - XGBRegressorModelConfig, -) - - -class TestXGBRegressor(AsyncTestCase): - @classmethod - def setUpClass(cls): - # Create a temporary directory to store the trained model - cls.model_dir = tempfile.TemporaryDirectory() - # Create an instance of the model - cls.model = XGBRegressorModel( - XGBRegressorModelConfig( - features=Features( - Feature("Feature1", float, 1), Feature("Feature2") - ), - predict=Feature("Target", float, 1), - location=cls.model_dir.name, - ) - ) - # Generating data f(x1,x2) = 2*x1 + 3*x2 - _n_data = 2000 - _temp_data = np.random.rand(2, _n_data) - cls.records = [ - Record( - "x" + str(random.random()), - data={ - "features": { - "Feature1": float(_temp_data[0][i]), - "Feature2": float(_temp_data[1][i]), - "Target": 2 * _temp_data[0][i] + 3 * _temp_data[1][i], - } - }, - ) - for i in range(0, _n_data) - ] - - cls.trainingsource = Sources( - MemorySource(MemorySourceConfig(records=cls.records[:1800])) - ) - cls.testsource = Sources( - MemorySource(MemorySourceConfig(records=cls.records[1800:])) - ) - - @classmethod - def tearDownClass(cls): - # Remove the temporary directory where the model was stored to cleanup - cls.model_dir.cleanup() - - async def test_00_train(self): - # Train the model on the training data - await train(self.model, self.trainingsource) - - async def test_01_accuracy(self): - scorer = MeanSquaredErrorAccuracy() - # Use the test data to assess the model's accuracy - res = await score( - self.model, scorer, Feature("Target", float, 1), self.testsource - ) - # Ensure the accuracy is above 80% - self.assertTrue(res) - - async def test_02_predict(self): - # Get the prediction for each piece of test data - async for i, features, prediction in predict( - self.model, self.testsource - ): - # Grab the correct value - correct = features["Target"] - # Grab the predicted value - prediction = prediction["Target"]["value"] - # Check that the prediction is within 30% error of the actual value - error = abs((prediction - correct) / correct) - - acceptable = 0.5 - # Sometimes causes an issue when only one data point anomalously has high error - self.assertLess(error, acceptable) - - -class TestXGBClassifierDocstring(AsyncTestCase): - async def test_docstring(self): - await run_consoletest( - XGBRegressorModel, - docs_root_dir=pathlib.Path(__file__).parents[3] / "docs", - )