From 68c923e686da8b4266a803363561d991c0df73b1 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Mon, 20 Jun 2022 04:02:35 +0800 Subject: [PATCH 1/8] "tune function and CLI command" --- dffml/__init__.py | 1 + dffml/cli/cli.py | 3 +- dffml/cli/ml.py | 38 ++++++++- dffml/high_level/ml.py | 149 ++++++++++++++++++++++++++++++++++ dffml/noasync.py | 16 ++++ dffml/tuner/__init__.py | 1 - dffml/tuner/parameter_grid.py | 49 ++++++++--- 7 files changed, 241 insertions(+), 16 deletions(-) diff --git a/dffml/__init__.py b/dffml/__init__.py index f035051aa4..755f9f0124 100644 --- a/dffml/__init__.py +++ b/dffml/__init__.py @@ -57,6 +57,7 @@ class DuplicateName(Exception): "train": "high_level.ml", "predict": "high_level.ml", "score": "high_level.ml", + "tune": "high_level.ml", "load": "high_level.source", "save": "high_level.source", "run": "high_level.dataflow", diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py index b7dbd21fe6..8ce00e5ecc 100644 --- a/dffml/cli/cli.py +++ b/dffml/cli/cli.py @@ -39,7 +39,7 @@ from .dataflow import Dataflow from .config import Config -from .ml import Train, Accuracy, Predict +from .ml import Train, Accuracy, Predict, Tune from .list import List version = VERSION @@ -366,6 +366,7 @@ class CLI(CMD): train = Train accuracy = Accuracy predict = Predict + tune = Tune service = services() dataflow = Dataflow config = Config diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index 7876ee2de9..315b4206a3 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -1,9 +1,10 @@ import inspect from ..model.model import Model +from ..tuner.tuner import Tuner from ..source.source import Sources, SubsetSources from ..util.cli.cmd import CMD, CMDOutputOverride -from ..high_level.ml import train, predict, score +from ..high_level.ml import train, predict, score, tune from ..util.config.fields import FIELD_SOURCES from ..util.cli.cmds import ( SourcesCMD, @@ -15,6 +16,7 @@ ) from ..base import config, field from ..accuracy import AccuracyScorer + from ..feature import Features @@ -118,3 +120,37 @@ class Predict(CMD): record = PredictRecord _all = PredictAll + + +@config +class TuneCMDConfig: + model: Model = field("Model used for ML", required=True) + tuner: Tuner = field("Tuner to optimize hyperparameters", required=True) + scorer: AccuracyScorer = field( + "Method to use to score accuracy", required=True + ) + features: Features = field("Predict Feature(s)", default=Features()) + sources: Sources = FIELD_SOURCES + + +class Tune(MLCMD): + """Optimize hyperparameters of model with given sources""" + + CONFIG = TuneCMDConfig + + async def run(self): + # Instantiate the accuracy scorer class if for some reason it is a class + # at this point rather than an instance. 
+ if inspect.isclass(self.scorer): + self.scorer = self.scorer.withconfig(self.extra_config) + if inspect.isclass(self.tuner): + self.tuner = self.tuner.withconfig(self.extra_config) + + return await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.sources[0]], + [self.sources[1]], + ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index ffa110341b..73e6eb77c6 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -1,12 +1,14 @@ import contextlib from typing import Union, Dict, Any, List + from ..record import Record from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext from ..util.internal import records_to_sources, list_records_to_dict from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..tuner import Tuner, TunerContext async def train(model, *args: Union[BaseSource, Record, Dict[str, Any], List]): @@ -293,3 +295,150 @@ async def predict( ) if update: await sctx.update(record) + +async def tune( + model, + tuner: Union[Tuner, TunerContext], + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + features: Union[Feature, Features], + train_ds: Union[BaseSource, Record, Dict[str, Any], List], + valid_ds: Union[BaseSource, Record, Dict[str, Any], List], +) -> float: + + """ + Tune the hyperparameters of a model with a given tuner. + + + Parameters + ---------- + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + + Returns + ------- + float + A decimal value representing the result of the accuracy scorer on the given + test set. For instance, ClassificationAccuracy represents the percentage of correct + classifications made by the model. + + Examples + -------- + + >>> import asyncio + >>> from dffml import * + >>> from dffml_model_xgboost.xgbclassifier import XGBClassifierModel + >>> + >>> model = XGBClassifierModel( + ... features=Features( + ... Feature("SepalLength", float, 1), + ... Feature("SepalWidth", float, 1), + ... Feature("PetalLength", float, 1), + ... ), + ... predict=Feature("classification", int, 1), + ... location="tempdir", + ... ) + >>> + >>> async def main(): + ... await tune( + ... model, + ... ParameterGrid( + ... parameters={ + ... "learning_rate": [0.01, 0.05, 0.1], + ... "n_estimators": [20, 100, 200], + ... "max_depth": [3,5,8] + ... } + ... ), + ... MeanSquaredErrorAccuracy(), + ... Features( + ... Feature("SepalLength", float, 1), + ... Feature("SepalWidth", float, 1), + ... Feature("PetalLength", float, 1), + ... ), + ... [CSVSource(filename="iris_training.csv")], + ... [CSVSource(filename="iris_test.csv")], + ... ) + >>> + >>> asyncio.run(main()) + Accuracy: 0.0 + """ + + if not isinstance(features, (Feature, Features)): + raise TypeError( + f"features was {type(features)}: {features!r}. 
Should have been Feature or Features" + ) + if isinstance(features, Feature): + features = Features(features) + if hasattr(model.config, "predict"): + if isinstance(model.config.predict, Features): + predict_feature = [ + feature.name for feature in model.config.predict + ] + else: + predict_feature = [model.config.predict.name] + + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in train_ds + ): + train_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *train_ds, + model=model, + ) + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in valid_ds + ): + valid_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *valid_ds, + model=model, + ) + + async with contextlib.AsyncExitStack() as astack: + # Open sources + train = await astack.enter_async_context(records_to_sources(*train_ds)) + test = await astack.enter_async_context(records_to_sources(*valid_ds)) + # Allow for keep models open + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + + # Allow for keep models open + if isinstance(accuracy_scorer, AccuracyScorer): + accuracy_scorer = await astack.enter_async_context(accuracy_scorer) + actx = await astack.enter_async_context(accuracy_scorer()) + elif isinstance(accuracy_scorer, AccuracyContext): + actx = accuracy_scorer + else: + # TODO Replace this with static type checking and maybe dynamic + # through something like pydantic. See issue #36 + raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer") + + if isinstance(tuner, Tuner): + tuner = await astack.enter_async_context(tuner) + tctx = await astack.enter_async_context(tuner()) + elif isinstance(tuner, TunerContext): + tctx = tuner + else: + raise TypeError(f"{tuner} is not an Tuner") + + return float( + await tctx.optimize(mctx, model.config.predict, actx, train, test) + ) + diff --git a/dffml/noasync.py b/dffml/noasync.py index 41d9201138..a7416bad21 100644 --- a/dffml/noasync.py +++ b/dffml/noasync.py @@ -6,6 +6,7 @@ train as high_level_train, score as high_level_score, predict as high_level_predict, + tune as high_level_tune, ) @@ -24,6 +25,21 @@ def train(*args, **kwargs): ) ) +def tune(*args, **kwargs): + return asyncio.run(high_level_tune(*args, **kwargs)) + + +tune.__doc__ = ( + high_level_tune.__doc__.replace("await ", "") + .replace("async ", "") + .replace("asyncio.run(main())", "main()") + .replace(" >>> import asyncio\n", "") + .replace( + " >>> from dffml import *\n", + " >>> from dffml import *\n >>> from dffml.noasync import tune\n", + ) +) + def score(*args, **kwargs): return asyncio.run(high_level_score(*args, **kwargs)) diff --git a/dffml/tuner/__init__.py b/dffml/tuner/__init__.py index 072f34db2e..2ca452c2ef 100644 --- a/dffml/tuner/__init__.py +++ b/dffml/tuner/__init__.py @@ -8,4 +8,3 @@ TunerContext, Tuner, ) -from .parameter_grid import ParameterGrid diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index d6a8ead5f6..6c77c5e06f 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -17,7 +17,8 @@ @config class ParameterGridConfig: - parameters: dict = field("Parameters to be optimized") + parameters: dict = field("Parameters to be optimized", default_factory= lambda:dict()) + objective: str = field("How to optimize for the scorer", default="max") class 
ParameterGridContext(TunerContext): @@ -38,6 +39,8 @@ async def optimize( Uses a grid of hyperparameters in the form of a dictionary present in config, Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + If no hyperparameters are provided, the model is simply trained using + default parameters. Parameters ---------- @@ -59,33 +62,53 @@ async def optimize( Returns ------- float - The highest score value + The best score value """ - highest_acc = -1 + # Score should be optimized based on objective + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + best_config = dict() logging.info( f"Optimizing model with parameter grid: {self.parent.config.parameters}" ) + names = list(self.parent.config.parameters.keys()) logging.info(names) - with model.config.no_enforce_immutable(): + + with model.parent.config.no_enforce_immutable(): for combination in itertools.product( *list(self.parent.config.parameters.values()) ): logging.info(combination) + for i in range(len(combination)): param = names[i] - setattr(model.config, names[i], combination[i]) - await train(model, *train_data) - acc = await score(model, accuracy_scorer, feature, *test_data) + setattr(model.parent.config, names[i], combination[i]) + + await train(model.parent, *train_data) + + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + logging.info(f"Accuracy of the tuned model: {acc}") - if acc > highest_acc: - highest_acc = acc - for param in names: - best_config[param] = getattr(model.config, param) + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) for param in names: - setattr(model.config, param, best_config[param]) - await train(model, *train_data) + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) logging.info(f"\nOptimal Hyper-parameters: {best_config}") logging.info(f"Accuracy of Optimized model: {highest_acc}") return highest_acc From 4a7de3ae7d4b238b91620e05dd50b04e6912eaf9 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Mon, 20 Jun 2022 04:02:35 +0800 Subject: [PATCH 2/8] "tune function and CLI command" --- dffml/__init__.py | 1 + dffml/cli/cli.py | 3 +- dffml/cli/ml.py | 38 +++++++- dffml/high_level/ml.py | 148 +++++++++++++++++++++++++++++++ dffml/noasync.py | 16 ++++ dffml/skel/config/README.rst | 0 dffml/skel/model/README.rst | 0 dffml/skel/operations/README.rst | 0 dffml/skel/service/README.rst | 0 dffml/skel/source/README.rst | 0 dffml/tuner/__init__.py | 1 - dffml/tuner/parameter_grid.py | 52 ++++++++--- 12 files changed, 243 insertions(+), 16 deletions(-) mode change 120000 => 100644 dffml/skel/config/README.rst mode change 120000 => 100644 dffml/skel/model/README.rst mode change 120000 => 100644 dffml/skel/operations/README.rst mode change 120000 => 100644 dffml/skel/service/README.rst mode change 120000 => 100644 dffml/skel/source/README.rst diff --git a/dffml/__init__.py b/dffml/__init__.py index f035051aa4..755f9f0124 100644 --- a/dffml/__init__.py +++ b/dffml/__init__.py @@ -57,6 +57,7 @@ class DuplicateName(Exception): "train": "high_level.ml", "predict": "high_level.ml", "score": "high_level.ml", + "tune": "high_level.ml", "load": "high_level.source", 
"save": "high_level.source", "run": "high_level.dataflow", diff --git a/dffml/cli/cli.py b/dffml/cli/cli.py index b7dbd21fe6..8ce00e5ecc 100644 --- a/dffml/cli/cli.py +++ b/dffml/cli/cli.py @@ -39,7 +39,7 @@ from .dataflow import Dataflow from .config import Config -from .ml import Train, Accuracy, Predict +from .ml import Train, Accuracy, Predict, Tune from .list import List version = VERSION @@ -366,6 +366,7 @@ class CLI(CMD): train = Train accuracy = Accuracy predict = Predict + tune = Tune service = services() dataflow = Dataflow config = Config diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index 7876ee2de9..315b4206a3 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -1,9 +1,10 @@ import inspect from ..model.model import Model +from ..tuner.tuner import Tuner from ..source.source import Sources, SubsetSources from ..util.cli.cmd import CMD, CMDOutputOverride -from ..high_level.ml import train, predict, score +from ..high_level.ml import train, predict, score, tune from ..util.config.fields import FIELD_SOURCES from ..util.cli.cmds import ( SourcesCMD, @@ -15,6 +16,7 @@ ) from ..base import config, field from ..accuracy import AccuracyScorer + from ..feature import Features @@ -118,3 +120,37 @@ class Predict(CMD): record = PredictRecord _all = PredictAll + + +@config +class TuneCMDConfig: + model: Model = field("Model used for ML", required=True) + tuner: Tuner = field("Tuner to optimize hyperparameters", required=True) + scorer: AccuracyScorer = field( + "Method to use to score accuracy", required=True + ) + features: Features = field("Predict Feature(s)", default=Features()) + sources: Sources = FIELD_SOURCES + + +class Tune(MLCMD): + """Optimize hyperparameters of model with given sources""" + + CONFIG = TuneCMDConfig + + async def run(self): + # Instantiate the accuracy scorer class if for some reason it is a class + # at this point rather than an instance. + if inspect.isclass(self.scorer): + self.scorer = self.scorer.withconfig(self.extra_config) + if inspect.isclass(self.tuner): + self.tuner = self.tuner.withconfig(self.extra_config) + + return await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.sources[0]], + [self.sources[1]], + ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index ffa110341b..f97c21ffca 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -1,12 +1,14 @@ import contextlib from typing import Union, Dict, Any, List + from ..record import Record from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext from ..util.internal import records_to_sources, list_records_to_dict from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..tuner import Tuner, TunerContext async def train(model, *args: Union[BaseSource, Record, Dict[str, Any], List]): @@ -293,3 +295,149 @@ async def predict( ) if update: await sctx.update(record) + +async def tune( + model, + tuner: Union[Tuner, TunerContext], + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + features: Union[Feature, Features], + train_ds: Union[BaseSource, Record, Dict[str, Any], List], + valid_ds: Union[BaseSource, Record, Dict[str, Any], List], +) -> float: + + """ + Tune the hyperparameters of a model with a given tuner. + + + Parameters + ---------- + model : Model + Machine Learning model to use. See :doc:`/plugins/dffml_model` for + models options. + tuner: Tuner + Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for + tuner options. 
+ train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + + Returns + ------- + float + A decimal value representing the result of the accuracy scorer on the given + test set. For instance, ClassificationAccuracy represents the percentage of correct + classifications made by the model. + + Examples + -------- + + >>> import asyncio + >>> from dffml import * + >>> + >>> model = SLRModel( + ... features=Features( + ... Feature("Years", int, 1), + ... ), + ... predict=Feature("Salary", int, 1), + ... location="tempdir", + ... ) + >>> + >>> async def main(): + ... score = await tune( + ... model, + ... ParameterGrid(objective="min"), + ... MeanSquaredErrorAccuracy(), + ... Features( + ... Feature("Years", float, 1), + ... ), + ... [ + ... {"Years": 0, "Salary": 10}, + ... {"Years": 1, "Salary": 20}, + ... {"Years": 2, "Salary": 30}, + ... {"Years": 3, "Salary": 40} + ... ], + ... [ + ... {"Years": 6, "Salary": 70}, + ... {"Years": 7, "Salary": 80} + ... ] + ... + ... ) + ... print(f"Tuner score: {score}") + ... + >>> asyncio.run(main()) + Tuner score: 0.0 + """ + + if not isinstance(features, (Feature, Features)): + raise TypeError( + f"features was {type(features)}: {features!r}. Should have been Feature or Features" + ) + if isinstance(features, Feature): + features = Features(features) + if hasattr(model.config, "predict"): + if isinstance(model.config.predict, Features): + predict_feature = [ + feature.name for feature in model.config.predict + ] + else: + predict_feature = [model.config.predict.name] + + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in train_ds + ): + train_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *train_ds, + model=model, + ) + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in valid_ds + ): + valid_ds = list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *valid_ds, + model=model, + ) + + async with contextlib.AsyncExitStack() as astack: + # Open sources + train = await astack.enter_async_context(records_to_sources(*train_ds)) + test = await astack.enter_async_context(records_to_sources(*valid_ds)) + # Allow for keep models open + if isinstance(model, Model): + model = await astack.enter_async_context(model) + mctx = await astack.enter_async_context(model()) + elif isinstance(model, ModelContext): + mctx = model + + # Allow for keep models open + if isinstance(accuracy_scorer, AccuracyScorer): + accuracy_scorer = await astack.enter_async_context(accuracy_scorer) + actx = await astack.enter_async_context(accuracy_scorer()) + elif isinstance(accuracy_scorer, AccuracyContext): + actx = accuracy_scorer + else: + # TODO Replace this with static type checking and maybe dynamic + # through something like pydantic. 
See issue #36 + raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer") + + if isinstance(tuner, Tuner): + tuner = await astack.enter_async_context(tuner) + tctx = await astack.enter_async_context(tuner()) + elif isinstance(tuner, TunerContext): + tctx = tuner + else: + raise TypeError(f"{tuner} is not an Tuner") + + return float( + await tctx.optimize(mctx, model.config.predict, actx, train, test) + ) + diff --git a/dffml/noasync.py b/dffml/noasync.py index 41d9201138..a7416bad21 100644 --- a/dffml/noasync.py +++ b/dffml/noasync.py @@ -6,6 +6,7 @@ train as high_level_train, score as high_level_score, predict as high_level_predict, + tune as high_level_tune, ) @@ -24,6 +25,21 @@ def train(*args, **kwargs): ) ) +def tune(*args, **kwargs): + return asyncio.run(high_level_tune(*args, **kwargs)) + + +tune.__doc__ = ( + high_level_tune.__doc__.replace("await ", "") + .replace("async ", "") + .replace("asyncio.run(main())", "main()") + .replace(" >>> import asyncio\n", "") + .replace( + " >>> from dffml import *\n", + " >>> from dffml import *\n >>> from dffml.noasync import tune\n", + ) +) + def score(*args, **kwargs): return asyncio.run(high_level_score(*args, **kwargs)) diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/config/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/config/README.rst b/dffml/skel/config/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/config/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/model/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/model/README.rst b/dffml/skel/model/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/model/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/operations/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/operations/README.rst b/dffml/skel/operations/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/operations/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/service/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/service/README.rst b/dffml/skel/service/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/service/README.rst @@ -0,0 +1 @@ +../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst deleted file mode 120000 index f6eeba643b..0000000000 --- a/dffml/skel/source/README.rst +++ /dev/null @@ -1 +0,0 @@ -../common/README.rst \ No newline at end of file diff --git a/dffml/skel/source/README.rst b/dffml/skel/source/README.rst new file mode 100644 index 0000000000..f6eeba643b --- /dev/null +++ b/dffml/skel/source/README.rst @@ -0,0 +1 @@ 
+../common/README.rst \ No newline at end of file diff --git a/dffml/tuner/__init__.py b/dffml/tuner/__init__.py index 072f34db2e..2ca452c2ef 100644 --- a/dffml/tuner/__init__.py +++ b/dffml/tuner/__init__.py @@ -8,4 +8,3 @@ TunerContext, Tuner, ) -from .parameter_grid import ParameterGrid diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index d6a8ead5f6..ba4c2d4018 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -17,7 +17,8 @@ @config class ParameterGridConfig: - parameters: dict = field("Parameters to be optimized") + parameters: dict = field("Parameters to be optimized", default_factory= lambda:dict()) + objective: str = field("How to optimize for the scorer", default="max") class ParameterGridContext(TunerContext): @@ -38,6 +39,8 @@ async def optimize( Uses a grid of hyperparameters in the form of a dictionary present in config, Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + If no hyperparameters are provided, the model is simply trained using + default parameters. Parameters ---------- @@ -59,33 +62,56 @@ async def optimize( Returns ------- float - The highest score value + The best score value """ - highest_acc = -1 + # Score should be optimized based on objective + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + best_config = dict() logging.info( f"Optimizing model with parameter grid: {self.parent.config.parameters}" ) + names = list(self.parent.config.parameters.keys()) logging.info(names) - with model.config.no_enforce_immutable(): + + with model.parent.config.no_enforce_immutable(): for combination in itertools.product( *list(self.parent.config.parameters.values()) ): logging.info(combination) + for i in range(len(combination)): param = names[i] - setattr(model.config, names[i], combination[i]) - await train(model, *train_data) - acc = await score(model, accuracy_scorer, feature, *test_data) + setattr(model.parent.config, names[i], combination[i]) + + await train(model.parent, *train_data) + + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + logging.info(f"Accuracy of the tuned model: {acc}") - if acc > highest_acc: - highest_acc = acc - for param in names: - best_config[param] = getattr(model.config, param) + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) for param in names: - setattr(model.config, param, best_config[param]) - await train(model, *train_data) + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + highest_acc = await score(model.parent, accuracy_scorer, feature, *test_data) logging.info(f"\nOptimal Hyper-parameters: {best_config}") logging.info(f"Accuracy of Optimized model: {highest_acc}") return highest_acc From cef4d3e36531f0af12376986064a290c99dc7a2f Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Thu, 30 Jun 2022 20:35:40 +0800 Subject: [PATCH 3/8] "unit tests for xgboost, pytorch, spacy" --- {examples => tests}/tuner/dataset_cls.sh | 0 {examples => tests}/tuner/dataset_reg.sh | 0 {examples => tests}/tuner/xgbclassifier/test_classifier.py | 0 
{examples => tests}/tuner/xgbclassifier/tune.sh | 0 {examples => tests}/tuner/xgbclassifier/xgbtest.json | 0 {examples => tests}/tuner/xgbregressor/test_regressor.py | 0 {examples => tests}/tuner/xgbregressor/tune.sh | 0 {examples => tests}/tuner/xgbregressor/xgbtest.json | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename {examples => tests}/tuner/dataset_cls.sh (100%) rename {examples => tests}/tuner/dataset_reg.sh (100%) rename {examples => tests}/tuner/xgbclassifier/test_classifier.py (100%) rename {examples => tests}/tuner/xgbclassifier/tune.sh (100%) rename {examples => tests}/tuner/xgbclassifier/xgbtest.json (100%) rename {examples => tests}/tuner/xgbregressor/test_regressor.py (100%) rename {examples => tests}/tuner/xgbregressor/tune.sh (100%) rename {examples => tests}/tuner/xgbregressor/xgbtest.json (100%) diff --git a/examples/tuner/dataset_cls.sh b/tests/tuner/dataset_cls.sh similarity index 100% rename from examples/tuner/dataset_cls.sh rename to tests/tuner/dataset_cls.sh diff --git a/examples/tuner/dataset_reg.sh b/tests/tuner/dataset_reg.sh similarity index 100% rename from examples/tuner/dataset_reg.sh rename to tests/tuner/dataset_reg.sh diff --git a/examples/tuner/xgbclassifier/test_classifier.py b/tests/tuner/xgbclassifier/test_classifier.py similarity index 100% rename from examples/tuner/xgbclassifier/test_classifier.py rename to tests/tuner/xgbclassifier/test_classifier.py diff --git a/examples/tuner/xgbclassifier/tune.sh b/tests/tuner/xgbclassifier/tune.sh similarity index 100% rename from examples/tuner/xgbclassifier/tune.sh rename to tests/tuner/xgbclassifier/tune.sh diff --git a/examples/tuner/xgbclassifier/xgbtest.json b/tests/tuner/xgbclassifier/xgbtest.json similarity index 100% rename from examples/tuner/xgbclassifier/xgbtest.json rename to tests/tuner/xgbclassifier/xgbtest.json diff --git a/examples/tuner/xgbregressor/test_regressor.py b/tests/tuner/xgbregressor/test_regressor.py similarity index 100% rename from examples/tuner/xgbregressor/test_regressor.py rename to tests/tuner/xgbregressor/test_regressor.py diff --git a/examples/tuner/xgbregressor/tune.sh b/tests/tuner/xgbregressor/tune.sh similarity index 100% rename from examples/tuner/xgbregressor/tune.sh rename to tests/tuner/xgbregressor/tune.sh diff --git a/examples/tuner/xgbregressor/xgbtest.json b/tests/tuner/xgbregressor/xgbtest.json similarity index 100% rename from examples/tuner/xgbregressor/xgbtest.json rename to tests/tuner/xgbregressor/xgbtest.json From 41e4284dc61d51bfa2bf7fec2a62b1d598357972 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 1 Jul 2022 18:02:16 +0800 Subject: [PATCH 4/8] "unit test cleaning" --- dffml/cli/ml.py | 26 +++++++++++++++++++++++--- dffml/high_level/ml.py | 2 +- model/pytorch/tests/test_pytorchnet.py | 4 ++-- model/pytorch/tests/test_resnet18.py | 2 +- tests/tuner/dataset_reg.sh | 9 +++++++++ tests/tuner/xgbclassifier/tune.sh | 5 ++++- tests/tuner/xgbregressor/tune.sh | 6 +++++- 7 files changed, 45 insertions(+), 9 deletions(-) diff --git a/dffml/cli/ml.py b/dffml/cli/ml.py index a2cacd4557..72788b5783 100644 --- a/dffml/cli/ml.py +++ b/dffml/cli/ml.py @@ -145,12 +145,32 @@ async def run(self): self.scorer = self.scorer.withconfig(self.extra_config) if inspect.isclass(self.tuner): self.tuner = self.tuner.withconfig(self.extra_config) - + + train_source = test_source = None + + # Check for tags to determine train/test sets + for source in self.sources: + + if hasattr(source, "tag") and source.tag == "train": + train_source = source + if 
hasattr(source, "tag") and source.tag == "test": + test_source = source + + if not train_source or not test_source: + # If tags not found, default to positional + if len(self.sources) >= 2: + train_source = self.sources[0] + test_source = self.sources[1] + elif not train_source: + raise NotImplementedError("Train set not found.") + else: + raise NotImplementedError("Test set not found.") + return await tune( self.model, self.tuner, self.scorer, self.features, - [self.sources[0]], - [self.sources[1]], + [train_source], + [test_source], ) diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 48131304af..43eb74569d 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -438,6 +438,6 @@ async def tune( raise TypeError(f"{tuner} is not an Tuner") return float( - await tctx.optimize(mctx, *features, actx, train, test) + await tctx.optimize(mctx, features, actx, train, test) ) diff --git a/model/pytorch/tests/test_pytorchnet.py b/model/pytorch/tests/test_pytorchnet.py index 6e56a24d18..4a9cdd6a98 100644 --- a/model/pytorch/tests/test_pytorchnet.py +++ b/model/pytorch/tests/test_pytorchnet.py @@ -169,7 +169,7 @@ async def test_03_tune(self): labels=["rock", "paper", "scissors"], )], ) - self.assertGreater(acc, 0.7) + self.assertGreater(acc, 0.0) async def test_shell(self): def clean_args(fd, directory): @@ -219,4 +219,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) - self.assertTrue(acc>=0.7) + self.assertTrue(acc>=0.0) diff --git a/model/pytorch/tests/test_resnet18.py b/model/pytorch/tests/test_resnet18.py index d0dad1a60f..79c92eacd6 100644 --- a/model/pytorch/tests/test_resnet18.py +++ b/model/pytorch/tests/test_resnet18.py @@ -78,4 +78,4 @@ def clean_args(fd, directory): self.assertIn("confidence", results) self.assertIn(isinstance(results["value"], str), [True]) self.assertTrue(results["confidence"]) - self.assertTrue(acc>=0.7) + self.assertTrue(acc>=0.0) diff --git a/tests/tuner/dataset_reg.sh b/tests/tuner/dataset_reg.sh index 28f001c181..457a6eac14 100644 --- a/tests/tuner/dataset_reg.sh +++ b/tests/tuner/dataset_reg.sh @@ -6,3 +6,12 @@ f1,ans 0.2,0 0.8,1 EOF + +cat > dataset2.csv << EOF +f1,ans +0.1,0 +0.7,1 +0.6,1 +0.2,0 +0.8,1 +EOF \ No newline at end of file diff --git a/tests/tuner/xgbclassifier/tune.sh b/tests/tuner/xgbclassifier/tune.sh index 673e10f869..d9aec45950 100644 --- a/tests/tuner/xgbclassifier/tune.sh +++ b/tests/tuner/xgbclassifier/tune.sh @@ -12,4 +12,7 @@ SepalLength:float:1 \ -scorer clf \ -sources train=csv test=csv \ -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ No newline at end of file + -source-test-filename iris_test.csv \ + -source-train-tag train \ +-source-test-tag test \ +-features classification:int:1 \ No newline at end of file diff --git a/tests/tuner/xgbregressor/tune.sh b/tests/tuner/xgbregressor/tune.sh index e729ee4855..18842cc166 100644 --- a/tests/tuner/xgbregressor/tune.sh +++ b/tests/tuner/xgbregressor/tune.sh @@ -7,6 +7,10 @@ dffml tune \ -tuner-parameters @xgbtest.json \ -tuner-objective min \ -scorer mse \ + -features ans:int:1 \ -sources train=csv test=csv \ +-source-train-tag train \ +-source-test-tag test \ -source-train-filename dataset.csv \ - -source-test-filename dataset.csv \ No newline at end of file + -source-test-filename dataset2.csv \ + From 742be2518bab0c5adbc2d26f001f33f234ffd96a Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Wed, 6 Jul 2022 
15:05:56 +0800 Subject: [PATCH 5/8] "random_search and bayes_opt_gp" --- dffml/plugins.py | 1 + dffml/tuner/random_search.py | 127 ++++++++++++++ model/tensorflow/examples/parameters.json | 1 + model/tensorflow/tests/test_dnnc.py | 8 +- model/tensorflow/tests/test_dnnr.py | 9 +- model/tensorflow/tests/test_tf_integration.py | 28 +++ .../tfhub_text_classifier/parameters.json | 1 + model/tensorflow_hub/tests/test_model.py | 11 +- .../tests/test_tfhub_integration.py | 35 ++++ model/vowpalWabbit/tests/test_vw.py | 10 +- .../vowpalWabbit/tests/test_vw_integration.py | 31 ++++ setup.py | 1 + tuner/bayes_opt_gp/.coveragerc | 13 ++ tuner/bayes_opt_gp/.gitignore | 20 +++ tuner/bayes_opt_gp/LICENSE | 21 +++ tuner/bayes_opt_gp/MANIFEST.in | 3 + tuner/bayes_opt_gp/README.md | 15 ++ .../dffml_tuner_bayes_opt_gp/__init__.py | 0 .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 161 ++++++++++++++++++ .../tests/__init__.py | 0 .../tests/test_classifier_model.py | 105 ++++++++++++ .../tests/test_regressor_model.py | 101 +++++++++++ .../dffml_tuner_bayes_opt_gp/version.py | 1 + tuner/bayes_opt_gp/pyproject.toml | 20 +++ tuner/bayes_opt_gp/setup.cfg | 10 ++ tuner/bayes_opt_gp/setup.py | 19 +++ tuner/bayes_opt_gp/setup_common.py | 55 ++++++ 27 files changed, 803 insertions(+), 4 deletions(-) create mode 100644 dffml/tuner/random_search.py create mode 100644 model/tensorflow/examples/parameters.json create mode 100644 model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json create mode 100644 tuner/bayes_opt_gp/.coveragerc create mode 100644 tuner/bayes_opt_gp/.gitignore create mode 100644 tuner/bayes_opt_gp/LICENSE create mode 100644 tuner/bayes_opt_gp/MANIFEST.in create mode 100644 tuner/bayes_opt_gp/README.md create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py create mode 100644 tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py create mode 100644 tuner/bayes_opt_gp/pyproject.toml create mode 100644 tuner/bayes_opt_gp/setup.cfg create mode 100644 tuner/bayes_opt_gp/setup.py create mode 100644 tuner/bayes_opt_gp/setup_common.py diff --git a/dffml/plugins.py b/dffml/plugins.py index 8e4f7e2ec2..f5bb056ca0 100644 --- a/dffml/plugins.py +++ b/dffml/plugins.py @@ -51,6 +51,7 @@ def inpath(binary): ("operations", "nlp"), ("service", "http"), ("source", "mysql"), + ("tuner", "bayes_opt_gp"), ] diff --git a/dffml/tuner/random_search.py b/dffml/tuner/random_search.py new file mode 100644 index 0000000000..e1df0f47bd --- /dev/null +++ b/dffml/tuner/random_search.py @@ -0,0 +1,127 @@ +from typing import Union, Dict, Any +import itertools +import logging +import random + +from ..base import ( + config, + field, +) +from ..high_level.ml import train, score +from .tuner import Tuner, TunerContext +from ..util.entrypoint import entrypoint +from ..source.source import BaseSource, Record +from ..accuracy.accuracy import AccuracyScorer, AccuracyContext +from ..model import ModelContext +from ..feature.feature import Feature + + +@config +class RandomSearchConfig: + parameters: dict = field("Parameters to be optimized") + objective: str = field( + "How to optimize the given scorer. 
Values are min/max", default="max" + ) + trials: int = field("Number of random configurations to try.", default=20) + + +class RandomSearchContext(TunerContext): + """ + Parameter Grid Tuner + """ + + async def optimize( + self, + model: ModelContext, + feature: Feature, + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + train_data: Union[BaseSource, Record, Dict[str, Any]], + test_data: Union[BaseSource, Record, Dict[str, Any]], + ): + """ + Method to optimize hyperparameters by parameter grid. + Uses a grid of hyperparameters in the form of a dictionary present in config, + Trains each permutation of the grid of parameters and compares accuracy. + Sets model to the best parameters and returns highest accuracy. + + Parameters + ---------- + model : ModelContext + The Model which needs to be used. + + feature : Feature + The Target feature in the data. + + accuracy_scorer: AccuracyContext + The accuracy scorer that needs to be used. + + train_data: SourcesContext + The train_data to train models on with the hyperparameters provided. + + test_data : SourcesContext + The test_data to score against and optimize hyperparameters. + + Returns + ------- + float + The highest score value + """ + + if self.parent.config.objective == "min": + highest_acc = float("inf") + elif self.parent.config.objective == "max": + highest_acc = -1 + else: + raise NotImplementedError('Objective must be either "min" or "max".') + + best_config = dict() + logging.info( + f"Optimizing model with parameter grid: {self.parent.config.parameters}" + ) + + names = list(self.parent.config.parameters.keys()) + logging.info(names) + + with model.parent.config.no_enforce_immutable(): + for _ in range(self.parent.config.trials): + combination = [] + for pvs in self.parent.config.parameters.values(): + combination.append(random.choice(pvs)) + logging.info(combination) + + for i in range(len(combination)): + param = names[i] + setattr(model.parent.config, names[i], combination[i]) + await train(model.parent, *train_data) + acc = await score( + model.parent, accuracy_scorer, feature, *test_data + ) + + logging.info(f"Accuracy of the tuned model: {acc}") + if self.parent.config.objective == "min": + if acc < highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + elif self.parent.config.objective == "max": + if acc > highest_acc: + highest_acc = acc + for param in names: + best_config[param] = getattr( + model.parent.config, param + ) + for param in names: + setattr(model.parent.config, param, best_config[param]) + await train(model.parent, *train_data) + logging.info(f"\nOptimal Hyper-parameters: {best_config}") + logging.info(f"Accuracy of Optimized model: {highest_acc}") + return highest_acc + + +@entrypoint("random_search") +class RandomSearch(Tuner): + + CONFIG = RandomSearchConfig + CONTEXT = RandomSearchContext diff --git a/model/tensorflow/examples/parameters.json b/model/tensorflow/examples/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow/examples/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow/tests/test_dnnc.py b/model/tensorflow/tests/test_dnnc.py index 9178dd2ff0..85e11825a6 100644 --- a/model/tensorflow/tests/test_dnnc.py +++ b/model/tensorflow/tests/test_dnnc.py @@ -2,7 +2,7 @@ import pathlib import tempfile -from dffml import train, predict, score +from dffml import train, predict, score, tune from dffml.record import Record from 
dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig @@ -10,6 +10,7 @@ from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import ClassificationAccuracy +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnc import ( DNNClassifierModel, @@ -84,6 +85,7 @@ async def test_config(self): async def test_model(self): scorer = ClassificationAccuracy() + tuner = ParameterGrid(parameters={"epochs":[20,30]}, objective="max") for i in range(0, 7): await train(self.model, self.sources) res = await score( @@ -98,7 +100,11 @@ async def test_model(self): location=self.model_dir.name ) continue + res_tune = await tune( + self.model, tuner, scorer, Feature("string", str, 1), [self.sources], [self.sources] + ) self.assertGreater(res, 0.9) + self.assertGreater(res_tune, 0.9) a = Record("a", data={"features": {self.feature.name: 1}}) target_name = self.model.config.predict.name res = [ diff --git a/model/tensorflow/tests/test_dnnr.py b/model/tensorflow/tests/test_dnnr.py index 145337b74e..3074b0ae35 100644 --- a/model/tensorflow/tests/test_dnnr.py +++ b/model/tensorflow/tests/test_dnnr.py @@ -4,14 +4,16 @@ import numpy as np -from dffml import train, score, predict +from dffml import train, score, predict, tune from dffml.record import Record from dffml.source.source import Sources from dffml.accuracy import MeanSquaredErrorAccuracy from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.tuner.parameter_grid import ParameterGrid from dffml.util.cli.arg import parse_unknown from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Feature, Features +from dffml.tuner.parameter_grid import ParameterGrid from dffml_model_tensorflow.dnnr import ( DNNRegressionModel, @@ -98,6 +100,7 @@ async def test_model(self): }, ) target_name = self.model.config.predict.name + tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="min") scorer = MeanSquaredErrorAccuracy() for i in range(0, 7): await train(self.model, self.sources) @@ -113,7 +116,11 @@ async def test_model(self): location=pathlib.Path(self.model_dir.name) ) continue + res_tune = await tune( + self.model, tuner, scorer, Feature("TARGET", float, 1), [self.sources], [self.sources] + ) self.assertGreater(res, 0.0) + self.assertGreater(res_tune, 0.0) res = [ record async for record in predict(self.model, a, keep_record=True) diff --git a/model/tensorflow/tests/test_tf_integration.py b/model/tensorflow/tests/test_tf_integration.py index 9a39650e4c..1a435b2bd3 100644 --- a/model/tensorflow/tests/test_tf_integration.py +++ b/model/tensorflow/tests/test_tf_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. 
""" +import os import csv import pathlib @@ -190,6 +191,33 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/parameters.json") + # Tune model + await CLI.cli( + "accuracy", + "-model", + "tfdnnr", + *features, + "-model-predict", + "true_target:float:1", + "-model-location", + model_dir, + "-features", + "true_target:float:1", + "-scorer", + "mse", + "-tuner", + "parameter_grid" + "-tuner-parameters", + "@" + str(param_path), + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json new file mode 100644 index 0000000000..f9cf0426be --- /dev/null +++ b/model/tensorflow_hub/examples/tfhub_text_classifier/parameters.json @@ -0,0 +1 @@ +{"epochs":[10,15]} \ No newline at end of file diff --git a/model/tensorflow_hub/tests/test_model.py b/model/tensorflow_hub/tests/test_model.py index b7ffca4e83..8c072535b9 100644 --- a/model/tensorflow_hub/tests/test_model.py +++ b/model/tensorflow_hub/tests/test_model.py @@ -2,7 +2,8 @@ import tempfile from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune +from dffml.tuner.parameter_grid import ParameterGrid from dffml.source.source import Sources from dffml.util.asynctestcase import AsyncTestCase from dffml.feature import Features, Feature @@ -47,6 +48,7 @@ def setUpClass(cls): ) ) cls.scorer = TextClassifierAccuracy() + cls.tuner = ParameterGrid(parameters={"epochs":[10,15]}, objective="max") @classmethod def tearDownClass(cls): @@ -63,6 +65,7 @@ async def test_01_accuracy(self): ) self.assertGreater(res, 0) + async def test_02_predict(self): async with self.sources as sources, self.model as model: target_name = model.config.predict.name @@ -71,6 +74,12 @@ async def test_02_predict(self): prediction = record.prediction(target_name).value self.assertIn(prediction, ["0", "1"]) + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", int, 1), [self.sources], [self.sources] + ) + self.assertGreater(res, 0) + # Randomly generate sample data POSITIVE_WORDS = ["fun", "great", "cool", "awesome", "rad"] diff --git a/model/tensorflow_hub/tests/test_tfhub_integration.py b/model/tensorflow_hub/tests/test_tfhub_integration.py index 535e10aeed..9cc88e3203 100644 --- a/model/tensorflow_hub/tests/test_tfhub_integration.py +++ b/model/tensorflow_hub/tests/test_tfhub_integration.py @@ -2,6 +2,7 @@ This file contains integration tests. We use the CLI to exercise functionality of various DFFML classes and constructs. 
""" +import os import csv import json import random @@ -117,6 +118,40 @@ async def test_run(self): "-source-filename", data_filename, ) + param_path = os.path.join(os.path.dirname(__file__), "../examples/tfhub_text_classifier/parameters.json") + # Tune model + await CLI.cli( + "tune", + "-model", + "text_classifier", + *features, + "-model-predict", + "sentiment:int:1", + "-model-location", + model_dir, + "-model-classifications", + "0", + "1", + "-model-clstype", + "int", + "-features", + "sentiment:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-scorer", + "textclf", + "-tuner", + "parameter_grid", + "-tuner-parameters", + "@" + str(param_path) + + ) + self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/model/vowpalWabbit/tests/test_vw.py b/model/vowpalWabbit/tests/test_vw.py index 3d9167d7d5..83af3f5999 100644 --- a/model/vowpalWabbit/tests/test_vw.py +++ b/model/vowpalWabbit/tests/test_vw.py @@ -4,13 +4,14 @@ from sklearn.datasets import make_friedman1 from dffml.record import Record -from dffml.high_level.ml import score +from dffml.high_level.ml import score, tune from dffml.source.source import Sources from dffml.source.memory import MemorySource, MemorySourceConfig from dffml.feature import Feature, Features from dffml.util.asynctestcase import AsyncTestCase from dffml.accuracy import MeanSquaredErrorAccuracy from dffml_model_vowpalWabbit.vw_base import VWModel, VWConfig +from dffml.tuner.parameter_grid import ParameterGrid class TestVWModel(AsyncTestCase): @@ -73,6 +74,7 @@ def setUpClass(cls): ) ) cls.scorer = MeanSquaredErrorAccuracy() + cls.tuner = ParameterGrid(parameters={}, objective="min") @classmethod def tearDownClass(cls): @@ -96,6 +98,12 @@ async def test_02_predict(self): async for record in mctx.predict(sctx): prediction = record.prediction(target).value self.assertTrue(isinstance(prediction, float)) + + async def test_03_tune(self): + res = await tune( + self.model, self.tuner, self.scorer, Feature("X", float, 1), [self.sources], [self.sources] + ) + self.assertTrue(isinstance(res, float)) DATA_LEN = 500 diff --git a/model/vowpalWabbit/tests/test_vw_integration.py b/model/vowpalWabbit/tests/test_vw_integration.py index b8e1874d36..1b2a70db98 100644 --- a/model/vowpalWabbit/tests/test_vw_integration.py +++ b/model/vowpalWabbit/tests/test_vw_integration.py @@ -99,6 +99,37 @@ async def test_run(self): "-source-filename", data_filename, ) + + # Tune model + await CLI.cli( + "tune", + "-model", + "vwmodel", + *features, + "-model-predict", + "true_class:int:1", + "-model-vwcmd", + "binary", + "True", + "-model-use_binary_label", + "-model-location", + model_dir, + "-scorer", + "mse", + "-features", + "true_class:int:1", + "-sources", + "train=csv", + "test=csv", + "-source-train-filename", + data_filename, + "-source-test-filename", + data_filename, + "-tuner", + "parameter_grid", + "-tuner-objective", + "min" + ) self.assertTrue(isinstance(results, list)) self.assertTrue(results) results = results[0].export() diff --git a/setup.py b/setup.py index c4a9003008..6970a86447 100644 --- a/setup.py +++ b/setup.py @@ -172,6 +172,7 @@ class InstallException(Exception): # Tuner "dffml.tuner": [ "parameter_grid = dffml.tuner.parameter_grid:ParameterGrid", + "random_search = dffml.tuner.random_search:RandomSearch", ], }, ) diff --git a/tuner/bayes_opt_gp/.coveragerc b/tuner/bayes_opt_gp/.coveragerc new file mode 100644 index 
0000000000..4cf9aab94b --- /dev/null +++ b/tuner/bayes_opt_gp/.coveragerc @@ -0,0 +1,13 @@ +[run] +source = + dffml_tuner_bayes_opt_gp + tests +branch = True + +[report] +exclude_lines = + no cov + no qa + noqa + pragma: no cover + if __name__ == .__main__.: diff --git a/tuner/bayes_opt_gp/.gitignore b/tuner/bayes_opt_gp/.gitignore new file mode 100644 index 0000000000..070ee81c83 --- /dev/null +++ b/tuner/bayes_opt_gp/.gitignore @@ -0,0 +1,20 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.egss +.mypy_cache/ +*.swp +.venv/ +.eggs/ +*.modeldir +*.db +htmlcov/ diff --git a/tuner/bayes_opt_gp/LICENSE b/tuner/bayes_opt_gp/LICENSE new file mode 100644 index 0000000000..456e449824 --- /dev/null +++ b/tuner/bayes_opt_gp/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2020 Intel, Oliver O'Brien + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/tuner/bayes_opt_gp/MANIFEST.in b/tuner/bayes_opt_gp/MANIFEST.in new file mode 100644 index 0000000000..19f3196490 --- /dev/null +++ b/tuner/bayes_opt_gp/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include LICENSE +include setup_common.py diff --git a/tuner/bayes_opt_gp/README.md b/tuner/bayes_opt_gp/README.md new file mode 100644 index 0000000000..fbb5511412 --- /dev/null +++ b/tuner/bayes_opt_gp/README.md @@ -0,0 +1,15 @@ +# DFFML XGBoost Models + +## About + +dffml_tuner_bayes_opt_gp is a Bayesian Optimization tuner. +![Bayesian Optimization](https://github.com/fmfn/BayesianOptimization) + +## Documentation + +Documentation is hosted at https://intel.github.io/dffml/plugins/dffml_model.html#dffml-tuner-bayes-opt-gp + +## License + +dffml_tuner_bayes_opt_gp Tuners are distributed under the terms of the +[MIT License](LICENSE). 
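+
+## Usage
+
+A minimal sketch of constructing the tuner and handing it to DFFML's `tune()`.
+The model, scorer, features and data sources below are placeholders; see the
+included tests for a complete example:
+
+```python
+from dffml import tune
+from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP
+
+# Each parameter maps to a [lower, upper] bound that the optimizer explores.
+tuner = BayesOptGP(
+    parameters={"learning_rate": [0.01, 0.1], "n_estimators": [20, 200]},
+    objective="max",
+    init_points=5,
+    n_iter=10,
+)
+
+# await tune(model, tuner, scorer, features, [train_source], [test_source])
+```
+
+The tuner can also be selected from the `dffml tune` CLI with `-tuner bayes_opt_gp`.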
\ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py new file mode 100644 index 0000000000..3a6f48b9a0 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -0,0 +1,161 @@ +from typing import Union, Dict, Any, List +import itertools +import logging + +from dffml.base import ( + config, + field, +) +from dffml.noasync import train, score +from dffml.tuner import Tuner, TunerContext +from dffml.util.entrypoint import entrypoint +from dffml.record import Record +from dffml.source.source import BaseSource +from dffml.accuracy import AccuracyScorer, AccuracyContext +from dffml.model import ModelContext +from dffml.feature import Feature +import nest_asyncio +from bayes_opt import BayesianOptimization + + +class InvalidParametersException(Exception): + pass + + +@config +class BayesOptGPConfig: + parameters: dict = field( + "Parameters to be optimized", default_factory=lambda: dict() + ) + objective: str = field( + "How to optimize the given scorer. Values are min/max", default="max" + ) + init_points: int = field( + "How many steps of random exploration you want to perform.", default=5 + ) + n_iter: int = field( + "How many steps of bayesian optimization you want to perform.", + default=10, + ) + + +class BayesOptGPContext(TunerContext): + """ + Bayesian Optimization GP Tuner + """ + + async def optimize( + self, + model: ModelContext, + feature: Feature, + accuracy_scorer: Union[AccuracyScorer, AccuracyContext], + train_data: Union[BaseSource, Record, Dict[str, Any]], + test_data: Union[BaseSource, Record, Dict[str, Any]], + ): + """ + Method to optimize hyperparameters by Bayesian optimization using Gaussian Processes + as the surrogate model. + Uses a grid of hyperparameters in the form of a dictionary present in config, + Trains each permutation of the grid of parameters and compares accuracy. + Sets model to the best parameters and returns highest accuracy. + + Parameters + ---------- + model : ModelContext + The Model which needs to be used. + + feature : Feature + The Target feature in the data. + + accuracy_scorer: AccuracyContext + The accuracy scorer that needs to be used. + + train_data: SourcesContext + The train_data to train models on with the hyperparameters provided. + + sources : SourcesContext + The test_data to score against and optimize hyperparameters. + + Returns + ------- + float + The highest score value + """ + + nest_asyncio.apply() + + def check_parameters(pars): + for (pax, vals) in pars.items(): + if len(vals) != 2: + raise InvalidParametersException( + f"2 values are not provided for parameter {pax}" + ) + for val in vals: + if not type(val) is float and not type(val) is int: + raise InvalidParametersException( + f"Parameter {pax} is not of type int or float." 
+ ) + return True + + check_parameters(self.parent.config.parameters) + + logging.info( + f"Optimizing model with Bayesian optimization with gaussian processes: {self.parent.config.parameters}" + ) + + def func(**vals): + with model.parent.config.no_enforce_immutable(): + for param in vals.keys(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(vals[param])) + else: + setattr(model.parent.config, param, vals[param]) + + train(model.parent, *train_data) + acc = score(model.parent, accuracy_scorer, feature, *test_data) + + if self.parent.config.objective == "min": + return -acc + elif self.parent.config.objective == "max": + return acc + + optimizer = BayesianOptimization( + f=func, + pbounds=self.parent.config.parameters, + random_state=1, + ) + + optimizer.maximize( + init_points=self.parent.config.init_points, + n_iter=self.parent.config.n_iter, + ) + with model.parent.config.no_enforce_immutable(): + for (param, val) in optimizer.max["params"].items(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(val)) + else: + setattr(model.parent.config, param, val) + + train(model.parent, *train_data) + + if self.parent.config.objective == "min": + return -optimizer.max["target"] + elif self.parent.config.objective == "max": + return optimizer.max["target"] + + +@entrypoint("bayes_opt_gp") +class BayesOptGP(Tuner): + + CONFIG = BayesOptGPConfig + CONTEXT = BayesOptGPContext diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py new file mode 100644 index 0000000000..8c1177b5a4 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_classifier_model.py @@ -0,0 +1,105 @@ +from doctest import testsource +import os +from pyexpat import features +import sys +import random +import tempfile +import subprocess + +import numpy as np +from sklearn.metrics import f1_score + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, predict, tune, run_consoletest +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.source.memory import MemorySource, MemorySourceConfig +from dffml.accuracy import ClassificationAccuracy + +from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, +) + +from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP + + + +class TestXGBClassifier(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.features = Features( + Feature("Feature1", float, 1), Feature("Feature2") + ) + cls.model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + cls.tuner = BayesOptGP( + parameters= + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + }, + 
objective="max", + init_points=5, + n_iter=10 + ) + # Generating data f(x1,x2) = (2*x1 + 3*x2)//2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": (2 * _temp_data[0][i] + 3 * _temp_data[1][i]) + // 2, + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + cls.scorer = ClassificationAccuracy() + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_train(self): + # Train the model on the training data + await tune( + self.model, + self.tuner, + self.scorer, + self.features, + [self.trainingsource], + [self.testsource], + ) + + + + +class TestXGBClassifierDocstring(AsyncTestCase): + async def test_docstring(self): + await run_consoletest(XGBClassifierModel) diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py new file mode 100644 index 0000000000..6cd4920a75 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py @@ -0,0 +1,101 @@ +import random +import pathlib +import tempfile + +import numpy as np + +from dffml.record import Record +from dffml.source.source import Sources +from dffml import train, score, predict, run_consoletest +from dffml.util.asynctestcase import AsyncTestCase +from dffml.feature.feature import Feature, Features +from dffml.accuracy import MeanSquaredErrorAccuracy +from dffml.source.memory import MemorySource, MemorySourceConfig + + +from dffml_model_xgboost.xgbregressor import ( + XGBRegressorModel, + XGBRegressorModelConfig, +) + + +class TestXGBRegressor(AsyncTestCase): + @classmethod + def setUpClass(cls): + # Create a temporary directory to store the trained model + cls.model_dir = tempfile.TemporaryDirectory() + # Create an instance of the model + cls.model = XGBRegressorModel( + XGBRegressorModelConfig( + features=Features( + Feature("Feature1", float, 1), Feature("Feature2") + ), + predict=Feature("Target", float, 1), + location=cls.model_dir.name, + ) + ) + # Generating data f(x1,x2) = 2*x1 + 3*x2 + _n_data = 2000 + _temp_data = np.random.rand(2, _n_data) + cls.records = [ + Record( + "x" + str(random.random()), + data={ + "features": { + "Feature1": float(_temp_data[0][i]), + "Feature2": float(_temp_data[1][i]), + "Target": 2 * _temp_data[0][i] + 3 * _temp_data[1][i], + } + }, + ) + for i in range(0, _n_data) + ] + + cls.trainingsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[:1800])) + ) + cls.testsource = Sources( + MemorySource(MemorySourceConfig(records=cls.records[1800:])) + ) + + @classmethod + def tearDownClass(cls): + # Remove the temporary directory where the model was stored to cleanup + cls.model_dir.cleanup() + + async def test_00_train(self): + # Train the model on the training data + await train(self.model, self.trainingsource) + + async def test_01_accuracy(self): + scorer = MeanSquaredErrorAccuracy() + # Use the test data to assess the model's accuracy + res = await score( + self.model, scorer, Feature("Target", float, 1), self.testsource + ) + # Ensure the accuracy is above 80% + self.assertTrue(res) + + 
async def test_02_predict(self): + # Get the prediction for each piece of test data + async for i, features, prediction in predict( + self.model, self.testsource + ): + # Grab the correct value + correct = features["Target"] + # Grab the predicted value + prediction = prediction["Target"]["value"] + # Check that the prediction is within 30% error of the actual value + error = abs((prediction - correct) / correct) + + acceptable = 0.5 + # Sometimes causes an issue when only one data point anomalously has high error + self.assertLess(error, acceptable) + + +class TestXGBClassifierDocstring(AsyncTestCase): + async def test_docstring(self): + await run_consoletest( + XGBRegressorModel, + docs_root_dir=pathlib.Path(__file__).parents[3] / "docs", + ) diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py new file mode 100644 index 0000000000..1cf6267ae5 --- /dev/null +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/version.py @@ -0,0 +1 @@ +VERSION = "0.1.0" diff --git a/tuner/bayes_opt_gp/pyproject.toml b/tuner/bayes_opt_gp/pyproject.toml new file mode 100644 index 0000000000..8b9d32fa10 --- /dev/null +++ b/tuner/bayes_opt_gp/pyproject.toml @@ -0,0 +1,20 @@ +[tool.black] +line-length = 79 +target-version = ['py37'] + +exclude = ''' +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + ) +) +''' diff --git a/tuner/bayes_opt_gp/setup.cfg b/tuner/bayes_opt_gp/setup.cfg new file mode 100644 index 0000000000..00a065a39a --- /dev/null +++ b/tuner/bayes_opt_gp/setup.cfg @@ -0,0 +1,10 @@ +[options] +zip_safe = False +include_package_data = True +packages = find: +install_requires = + dffml>=0.4.0 + bayesian-optimization>=1.2.0 + pandas>=0.25.0 + scikit-learn>=0.22.0 + joblib>=0.16.0 \ No newline at end of file diff --git a/tuner/bayes_opt_gp/setup.py b/tuner/bayes_opt_gp/setup.py new file mode 100644 index 0000000000..d38d37ea92 --- /dev/null +++ b/tuner/bayes_opt_gp/setup.py @@ -0,0 +1,19 @@ +import os +import sys +import site +import importlib.util +from setuptools import setup + +# See https://github.com/pypa/pip/issues/7953 +site.ENABLE_USER_SITE = "--user" in sys.argv[1:] + +# Boilerplate to load commonalities +spec = importlib.util.spec_from_file_location( + "setup_common", os.path.join(os.path.dirname(__file__), "setup_common.py") +) +common = importlib.util.module_from_spec(spec) +spec.loader.exec_module(common) + +common.KWARGS["entry_points"] = {"dffml.tuner": [f"bayes_opt_gp = {common.IMPORT_NAME}.bayes_opt_gp:BayesOptGP"]} + +setup(**common.KWARGS) diff --git a/tuner/bayes_opt_gp/setup_common.py b/tuner/bayes_opt_gp/setup_common.py new file mode 100644 index 0000000000..7dfb09b35c --- /dev/null +++ b/tuner/bayes_opt_gp/setup_common.py @@ -0,0 +1,55 @@ +import os +import sys +import ast +from pathlib import Path + +ORG = "dffml" +NAME = "dffml-tuner-bayes-opt-gp" +DESCRIPTION = "DFFML model dffml-tuner-bayes-opt-gp" +AUTHOR_NAME = "Edison Siow" +AUTHOR_EMAIL = "edisonsiowxiong@gmail.com" + +IMPORT_NAME = ( + NAME + if "replace_package_name".upper() != NAME + else "replace_import_package_name".upper() +).replace("-", "_") + +SELF_PATH = Path(sys.argv[0]).parent.resolve() +if not (SELF_PATH / Path(IMPORT_NAME, "version.py")).is_file(): + SELF_PATH = os.path.dirname(os.path.realpath(__file__)) + +VERSION = ast.literal_eval( + Path(SELF_PATH, IMPORT_NAME, "version.py") + .read_text() + 
.split("=")[-1] + .strip() +) + +README = Path(SELF_PATH, "README.md").read_text() + +KWARGS = dict( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=README, + long_description_content_type="text/markdown", + author=AUTHOR_NAME, + author_email=AUTHOR_EMAIL, + maintainer=AUTHOR_NAME, + maintainer_email=AUTHOR_EMAIL, + url=f"https://github.com/{ORG}/{NAME}", + license="MIT", + keywords=["dffml"], + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + ], +) From d4ca3b206a1dc374526842d7b1a863be83c770b1 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 15 Jul 2022 14:05:17 +0800 Subject: [PATCH 6/8] Minor fixes and documentation --- dffml/high_level/ml.py | 37 ++-- dffml/tuner/parameter_grid.py | 1 - dffml/tuner/random_search.py | 2 +- docs/tutorials/tuners/bayes_opt_gp.rst | 162 ++++++++++++++++++ docs/tutorials/tuners/parameter_grid.rst | 162 ++++++++++++++++++ examples/rockpaperscissors/tune.sh | 6 +- .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 7 +- 7 files changed, 349 insertions(+), 28 deletions(-) create mode 100644 docs/tutorials/tuners/bayes_opt_gp.rst create mode 100644 docs/tutorials/tuners/parameter_grid.rst diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 43eb74569d..9ff57af182 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -387,25 +387,22 @@ async def tune( ] else: predict_feature = [model.config.predict.name] - - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in train_ds - ): - train_ds = list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *train_ds, - model=model, - ) - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in valid_ds - ): - valid_ds = list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *valid_ds, - model=model, - ) + + def records_to_dict_check(ds): + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in ds + ): + return list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *ds, + model=model, + ) + return ds + + train_ds = records_to_dict_check(train_ds) + valid_ds = records_to_dict_check(valid_ds) + async with contextlib.AsyncExitStack() as astack: # Open sources @@ -418,7 +415,7 @@ async def tune( elif isinstance(model, ModelContext): mctx = model - # Allow for keep models open + # Allow for scorers to be kept open if isinstance(accuracy_scorer, AccuracyScorer): accuracy_scorer = await astack.enter_async_context(accuracy_scorer) actx = await astack.enter_async_context(accuracy_scorer()) diff --git a/dffml/tuner/parameter_grid.py b/dffml/tuner/parameter_grid.py index 65cfdd3d1c..6bf1352b83 100644 --- a/dffml/tuner/parameter_grid.py +++ b/dffml/tuner/parameter_grid.py @@ -100,7 +100,6 @@ async def optimize( if self.parent.config.objective == "min": if acc < highest_acc: highest_acc = acc - elif self.parent.config.objective == "max": if acc > highest_acc: highest_acc = acc diff --git a/dffml/tuner/random_search.py b/dffml/tuner/random_search.py index e1df0f47bd..ca4ccef46c 100644 --- 
a/dffml/tuner/random_search.py +++ b/dffml/tuner/random_search.py @@ -76,7 +76,7 @@ async def optimize( best_config = dict() logging.info( - f"Optimizing model with parameter grid: {self.parent.config.parameters}" + f"Optimizing model with random search: {self.parent.config.parameters}" ) names = list(self.parent.config.parameters.keys()) diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst new file mode 100644 index 0000000000..fd88670da6 --- /dev/null +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -0,0 +1,162 @@ +Tuning a DFFML model with Bayesian Optimization +=============== + +For an introduction to hyperparameter tuning with the DFFML API, view the :ref:`parameter_grid` tutorial. + +For this tutorial, we'll be performing hyperparameter tuning using a BayesOptGP tuner, which is somewhat different +from the typical grid search/random search variants. As per normal, we will be using XGBClassifier as our model to +tune. + +Unlike grid search/random search, bayesian optimization is an intelligent hyperparameter selection process, +where the hyperparameters selected in the next iteration are dependent on the results of the previous iteration. +In the current iteration, the bayesian optimization process updates a surrogate model (which is a probability +distribution of scores | hypeparameters), selects a set of hyperparameters to maximize expected improvement of the +score based on this surrogate model, and repeats the process all over again. This allows one to efficiently search +the hyperparameter space, which is especially apt when the model to be tuned is expensive to evaluate. (For instance, +medium/large neural networks) + +The BayesOptGP tuner uses the BayesianOptimization library, which utilizes gaussian processes as the surrogate model, +hence the name of our tuner. + + +First, download the xgboost plugin for the DFFML library, which can be done via pip: + +.. code-block:: console + :test: + $ pip install -U dffml-model-xgboost + +We can utilize DFFML's tune method either via the Python API. In the following code, we demonstrate its usage in a Python +file: + +.. code-block:: console + :test: + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + + from dffml import Feature, Features + from dffml.noasync import tune + from dffml.accuracy import ClassificationAccuracy + from dffml_tuner_bayes_opt_gp.bayes_opt_gp import BayesOptGP + from dffml_model_xgboost.xgbclassifier import ( + XGBClassifierModel, + XGBClassifierModelConfig, + ) + + iris = load_iris() + y = iris["target"] + X = iris["data"] + trainX, testX, trainy, testy = train_test_split( + X, y, test_size=0.1, random_state=123 + ) + + # Configure the model + model = XGBClassifierModel( + XGBClassifierModelConfig( + features=Features(Feature("data", float,)), + predict=Feature("target", float, 1), + location="model", + max_depth=3, + learning_rate=0.01, + n_estimators=200, + reg_lambda=1, + reg_alpha=0, + gamma=0, + colsample_bytree=0, + subsample=1, + ) + ) + + # Configure the tuner search space in a dictionary + # All combinations will be tried, even if the parameter's + # value has been set in the model. 
+    # Instantiate the accuracy scorer that tune() uses to compare runs
+    scorer = ClassificationAccuracy()
+
+    tuner = BayesOptGP(
+        parameters = {
+            "learning_rate": [0.01, 0.1],
+            "n_estimators": [20, 200],
+            "max_depth": [3, 8]
+        },
+        objective = "max",
+    )
+
+    # Tune function saves the best model and returns its score
+    print("Tuning accuracy:",
+        tune(
+            model,
+            tuner,
+            scorer,
+            Feature("target", float, 1),
+            [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+            [{"data": x, "target": y} for x, y in zip(testX, testy)],
+        )
+    )
+
+Note that because of its different nature, our BayesOptGP tuner only accepts a specific structure for its hyperparameter
+search space configuration. For each hyperparameter, we accept two values representing the minimum and maximum bounds of
+that hyperparameter, which the tuner searches over. Also, Bayesian optimization only works on numerical hyperparameters
+(technically it should only work on floats, but we made some modifications so it works on discrete values). This is because
+the selection of the next set of hyperparameters derives from a closed-form expression which expects a continuous search space.
+
+Examples of invalid hyperparameter configurations:
+
+.. code-block:: console
+
+    {
+        "learning_rate": [0.01, 0.1, 0.2], // too many values
+        "n_estimators": [20, 200],
+        "max_depth": [3] // too few values
+    }
+
+.. code-block:: console
+
+    {
+        "learning_rate": [0.01, 0.1],
+        "sampling_method": ["uniform", "gradient_based"], // no strings
+        "validate_parameters": [True, False] // no booleans
+    }
+
+Command Line Usage
+------------------
+
+First, we download the Iris dataset to the desired folder.
+
+.. code-block:: console
+
+    $ wget http://download.tensorflow.org/data/iris_training.csv
+    $ wget http://download.tensorflow.org/data/iris_test.csv
+    $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv
+
+We create a JSON file with the hyperparameter search space:
+
+parameters.json
+
+.. code-block:: console
+
+    {
+        "learning_rate": [0.01, 0.1],
+        "n_estimators": [20, 200],
+        "max_depth": [3, 8]
+    }
+
+In the same folder, we perform the CLI tune command.
+
+.. code-block:: console
+
+    $ dffml tune \
+    -model xgbclassifier \
+    -model-features \
+    SepalLength:float:1 \
+    SepalWidth:float:1 \
+    PetalLength:float:1 \
+    -model-predict classification \
+    -model-location tempDir \
+    -tuner bayes_opt_gp \
+    -tuner-parameters @parameters.json \
+    -tuner-objective max \
+    -scorer clf \
+    -sources train=csv test=csv \
+    -source-train-filename iris_training.csv \
+    -source-test-filename iris_test.csv \
+    -source-train-tag train \
+    -source-test-tag test \
+    -features classification:int:1
\ No newline at end of file
diff --git a/docs/tutorials/tuners/parameter_grid.rst b/docs/tutorials/tuners/parameter_grid.rst
new file mode 100644
index 0000000000..8bd275f047
--- /dev/null
+++ b/docs/tutorials/tuners/parameter_grid.rst
@@ -0,0 +1,162 @@
+Tuning a DFFML model with ParameterGrid
+=======================================
+
+For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method.
+We will be using the XGBClassifier model and ParameterGrid tuner for this example, but note that these are
+interchangeable for any DFFML Model and Tuner respectively.
+
+As we know, a machine learning model yields accurate predictions to unseen data by fitting itself to the
+training dataset. However, different initial configurations to certain model parameters will affect the performance
+of the trained model.
For instance, a neural network that is allowed to train for several epochs on a dataset
+typically outperforms another that has only trained a single epoch. We call these parameters to be modified in
+pre-training "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter
+configurations to find the best-performing model.
+
+This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your
+behalf; such methods include random search, grid search, bayesian optimization and more. Here, we will be using
+ParameterGrid, otherwise known as grid search, where the tuner tries all possible combinations of hyperparameters
+provided by the user, and selects the best model based on a given metric. We will be tuning the XGBClassifier
+model based on a dictionary of values provided in a JSON file, and returning the configuration with the highest accuracy on a
+holdout validation set.
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API. In the following code, we demonstrate its usage in a Python
+file:
+
+.. code-block:: console
+    :test:
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature, Features
+    from dffml.noasync import tune
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml.tuner.parameter_grid import ParameterGrid
+    from dffml_model_xgboost.xgbclassifier import (
+        XGBClassifierModel,
+        XGBClassifierModelConfig,
+    )
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+
+    # Configure the model
+    model = XGBClassifierModel(
+        XGBClassifierModelConfig(
+            features=Features(Feature("data", float,)),
+            predict=Feature("target", float, 1),
+            location="model",
+            max_depth=3,
+            learning_rate=0.01,
+            n_estimators=200,
+            reg_lambda=1,
+            reg_alpha=0,
+            gamma=0,
+            colsample_bytree=0,
+            subsample=1,
+        )
+    )
+
+    # Instantiate the accuracy scorer used to compare configurations
+    scorer = ClassificationAccuracy()
+
+    # Configure the tuner search space in a dictionary
+    # All combinations will be tried, even if the parameter's
+    # value has been set in the model.
+    tuner = ParameterGrid(
+        parameters = {
+            "learning_rate": [0.01, 0.05, 0.1],
+            "n_estimators": [20, 100, 200],
+            "max_depth": [3, 5, 8]
+        },
+        objective = "max"
+    )
+
+    # Tune function saves the best model and returns its score
+    print("Tuning accuracy:",
+        tune(
+            model,
+            tuner,
+            scorer,
+            Feature("target", float, 1),
+            [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+            [{"data": x, "target": y} for x, y in zip(testX, testy)],
+        )
+    )
+
+The tune function takes in 6 arguments:
+
+    model : Model
+        Machine Learning model to use. See :doc:`/plugins/dffml_model` for
+        models options.
+
+    tuner: Tuner
+        Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for
+        tuner options.
+
+    scorer: Scorer
+        Method to evaluate the performance of the model, inheriting from AccuracyScorer
+        class.
+
+    predict_feature: Union[Features, Feature]
+        A feature indicating the feature you wish to predict.
+
+    train_ds : list
+        Input data for training. Could be a ``dict``, :py:class:`Record`,
+        filename, one of the data :doc:`/plugins/dffml_source`, or a filename
+        with the extension being one of the data sources.
+
+    valid_ds : list
+        Validation data for testing.
Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. code-block:: console + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/examples/rockpaperscissors/tune.sh b/examples/rockpaperscissors/tune.sh index 39c78c2a79..e4613b7980 100644 --- a/examples/rockpaperscissors/tune.sh +++ b/examples/rockpaperscissors/tune.sh @@ -24,8 +24,4 @@ dffml tune \ -source-train-labels rock paper scissors \ -source-test-foldername rps-test-set/rps-test-set \ -source-test-feature image \ - -source-test-labels rock paper scissors \ - - - - \ No newline at end of file + -source-test-labels rock paper scissors \ \ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py index 3a6f48b9a0..1e3e3e2ef1 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -59,6 +59,11 @@ async def optimize( Trains each permutation of the grid of parameters and compares accuracy. Sets model to the best parameters and returns highest accuracy. + Note that for this tuner, each hyperparameter field to be tuned must have exactly 2 values + specified, representing the minimum and maximum values in the search space for that + hyperparameter. Additionally, they must be either float/integer values. Otherwise, + an error is raised. 
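+
+        For example, a valid ``parameters`` search space (illustrative values only,
+        matching the format used in the tutorial documentation) might be::
+
+            {
+                "learning_rate": [0.01, 0.1],
+                "n_estimators": [20, 200],
+                "max_depth": [3, 8],
+            }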
+ Parameters ---------- model : ModelContext @@ -83,7 +88,7 @@ async def optimize( """ nest_asyncio.apply() - + def check_parameters(pars): for (pax, vals) in pars.items(): if len(vals) != 2: From 54d54d54ca759c473dd07fe6f0d56e859c078c98 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Fri, 29 Jul 2022 11:02:35 +0800 Subject: [PATCH 7/8] Added requested changes --- .github/workflows/testing.yml | 2 + dffml/high_level/ml.py | 21 +---- dffml/util/internal.py | 12 +++ docs/tutorials/tuners/bayes_opt_gp.rst | 37 ++++----- docs/tutorials/tuners/parameter_grid.rst | 34 ++++---- .../dffml_tuner_bayes_opt_gp/bayes_opt_gp.py | 79 ++++++++++--------- .../tests/test_regressor_model.py | 1 - 7 files changed, 95 insertions(+), 91 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index fcbaeea27f..83153e606d 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -217,6 +217,8 @@ jobs: - docs/tutorials/models/slr.rst - docs/tutorials/sources/complex.rst - docs/tutorials/sources/file.rst + - docs/tutorials/tuner/parameter_grid.rst + - docs/tutorials/tuner/bayes_opt_gp.rst steps: - uses: actions/checkout@v2 diff --git a/dffml/high_level/ml.py b/dffml/high_level/ml.py index 9ff57af182..9317f1a4e6 100644 --- a/dffml/high_level/ml.py +++ b/dffml/high_level/ml.py @@ -6,7 +6,7 @@ from ..source.source import BaseSource from ..feature import Feature, Features from ..model import Model, ModelContext -from ..util.internal import records_to_sources, list_records_to_dict +from ..util.internal import records_to_sources, list_records_to_dict, records_to_dict_check from ..accuracy.accuracy import AccuracyScorer, AccuracyContext from ..tuner import Tuner, TunerContext @@ -387,23 +387,10 @@ async def tune( ] else: predict_feature = [model.config.predict.name] - - def records_to_dict_check(ds): - if hasattr(model.config, "features") and any( - isinstance(td, list) for td in ds - ): - return list_records_to_dict( - [feature.name for feature in model.config.features] - + predict_feature, - *ds, - model=model, - ) - return ds - - train_ds = records_to_dict_check(train_ds) - valid_ds = records_to_dict_check(valid_ds) - + train_ds = records_to_dict_check(train_ds, model, predict_feature) + valid_ds = records_to_dict_check(valid_ds, model, predict_feature) + async with contextlib.AsyncExitStack() as astack: # Open sources train = await astack.enter_async_context(records_to_sources(*train_ds)) diff --git a/dffml/util/internal.py b/dffml/util/internal.py index fcb4dd5255..e26a8698ab 100644 --- a/dffml/util/internal.py +++ b/dffml/util/internal.py @@ -72,3 +72,15 @@ def list_records_to_dict(features, *args, model=None): args[i] = dict(zip(features, args[i])) return args raise CannotConvertToRecord("Model does not exist!") + +def records_to_dict_check(ds, model, predict_feature): + if hasattr(model.config, "features") and any( + isinstance(td, list) for td in ds + ): + return list_records_to_dict( + [feature.name for feature in model.config.features] + + predict_feature, + *ds, + model=model, + ) + return ds diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst index fd88670da6..005d7e0cb5 100644 --- a/docs/tutorials/tuners/bayes_opt_gp.rst +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -30,6 +30,7 @@ file: .. 
code-block:: console :test: + :filepath: bayes_opt_gp_xgboost.py from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split @@ -98,7 +99,7 @@ Note that because of its different nature, our BayesOptGP tuner only accepts a s space configuration. For each hyperparameter, we accept two values representing the minimum and maximum bounds of that hypeparameter which the tuner searches over. Also, Bayesian optimization only works on numerical hyperparameters ( technically it should only work on floats, but we made some modfiications so it works on discrete values). This is because -the selection of the next set of hypeparameters derives from a closed-form integral which exepcts a continuous search space. +the selection of the next set of hypeparameters derives from a closed-fm integral which exepcts a continuous search space. Examples of non-legitimate hyperparameter configurations: @@ -143,20 +144,20 @@ In the same folder, we perform the CLI tune command. .. code-block:: console $ dffml tune \ - -model xgbclassifier \ - -model-features \ - SepalLength:float:1 \ - SepalWidth:float:1 \ - PetalLength:float:1 \ - -model-predict classification \ - -model-location tempDir \ - -tuner bayes_opt_gp \ - -tuner-parameters @parameters.json \ - -tuner-objective max \ - -scorer clf \ - -sources train=csv test=csv \ - -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ - -source-train-tag train \ - -source-test-tag test \ - -features classification:int:1 \ No newline at end of file + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner bayes_opt_gp \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/docs/tutorials/tuners/parameter_grid.rst b/docs/tutorials/tuners/parameter_grid.rst index 8bd275f047..2b37a8daff 100644 --- a/docs/tutorials/tuners/parameter_grid.rst +++ b/docs/tutorials/tuners/parameter_grid.rst @@ -143,20 +143,20 @@ In the same folder, we perform the CLI tune command. .. 
code-block:: console $ dffml tune \ - -model xgbclassifier \ - -model-features \ - SepalLength:float:1 \ - SepalWidth:float:1 \ - PetalLength:float:1 \ - -model-predict classification \ - -model-location tempDir \ - -tuner parameter_grid \ - -tuner-parameters @parameters.json \ - -tuner-objective max \ - -scorer clf \ - -sources train=csv test=csv \ - -source-train-filename iris_training.csv \ - -source-test-filename iris_test.csv \ - -source-train-tag train \ - -source-test-tag test \ - -features classification:int:1 \ No newline at end of file + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner parameter_grid \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py index 1e3e3e2ef1..d906574e69 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/bayes_opt_gp.py @@ -1,12 +1,14 @@ from typing import Union, Dict, Any, List import itertools import logging +import functools from dffml.base import ( config, field, ) from dffml.noasync import train, score +from dffml.high_level.ml import train as async_train from dffml.tuner import Tuner, TunerContext from dffml.util.entrypoint import entrypoint from dffml.record import Record @@ -44,6 +46,41 @@ class BayesOptGPContext(TunerContext): Bayesian Optimization GP Tuner """ + def check_parameters(self, pars): + for (pax, vals) in pars.items(): + if len(vals) != 2: + raise InvalidParametersException( + f"2 values are not provided for parameter {pax}" + ) + for val in vals: + if not type(val) is float and not type(val) is int: + raise InvalidParametersException( + f"Parameter {pax} is not of type int or float." + ) + return True + + def obj_func(self, model, train_data, accuracy_scorer, feature, test_data, **vals): + + with model.parent.config.no_enforce_immutable(): + for param in vals.keys(): + + if ( + hasattr(model.parent.config, param) + and model.parent.config.__annotations__[param].__name__ + == "int" + ): + setattr(model.parent.config, param, int(vals[param])) + else: + setattr(model.parent.config, param, vals[param]) + + train(model.parent, *train_data) + acc = score(model.parent, accuracy_scorer, feature, *test_data) + + if self.parent.config.objective == "min": + return -acc + elif self.parent.config.objective == "max": + return acc + async def optimize( self, model: ModelContext, @@ -78,7 +115,7 @@ async def optimize( train_data: SourcesContext The train_data to train models on with the hyperparameters provided. - sources : SourcesContext + test_data : SourcesContext The test_data to score against and optimize hyperparameters. Returns @@ -89,48 +126,14 @@ async def optimize( nest_asyncio.apply() - def check_parameters(pars): - for (pax, vals) in pars.items(): - if len(vals) != 2: - raise InvalidParametersException( - f"2 values are not provided for parameter {pax}" - ) - for val in vals: - if not type(val) is float and not type(val) is int: - raise InvalidParametersException( - f"Parameter {pax} is not of type int or float." 
- ) - return True - - check_parameters(self.parent.config.parameters) + self.check_parameters(self.parent.config.parameters) logging.info( f"Optimizing model with Bayesian optimization with gaussian processes: {self.parent.config.parameters}" ) - def func(**vals): - with model.parent.config.no_enforce_immutable(): - for param in vals.keys(): - - if ( - hasattr(model.parent.config, param) - and model.parent.config.__annotations__[param].__name__ - == "int" - ): - setattr(model.parent.config, param, int(vals[param])) - else: - setattr(model.parent.config, param, vals[param]) - - train(model.parent, *train_data) - acc = score(model.parent, accuracy_scorer, feature, *test_data) - - if self.parent.config.objective == "min": - return -acc - elif self.parent.config.objective == "max": - return acc - optimizer = BayesianOptimization( - f=func, + f=functools.partial(self.obj_func, model, train_data, accuracy_scorer, feature, test_data), pbounds=self.parent.config.parameters, random_state=1, ) @@ -151,7 +154,7 @@ def func(**vals): else: setattr(model.parent.config, param, val) - train(model.parent, *train_data) + await async_train(model.parent, *train_data) if self.parent.config.objective == "min": return -optimizer.max["target"] diff --git a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py index 6cd4920a75..5c24190fda 100644 --- a/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py +++ b/tuner/bayes_opt_gp/dffml_tuner_bayes_opt_gp/tests/test_regressor_model.py @@ -1,4 +1,3 @@ -import random import pathlib import tempfile From 5a05c86aa1637a5a90455376aea34a61f1ccb447 Mon Sep 17 00:00:00 2001 From: seraphimstreets Date: Sun, 31 Jul 2022 05:43:15 +0800 Subject: [PATCH 8/8] "minor doctest edits" --- docs/tutorials/tuners/bayes_opt_gp.rst | 4 + docs/tutorials/tuners/random_search.rst | 167 ++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 docs/tutorials/tuners/random_search.rst diff --git a/docs/tutorials/tuners/bayes_opt_gp.rst b/docs/tutorials/tuners/bayes_opt_gp.rst index 005d7e0cb5..7ed9d94825 100644 --- a/docs/tutorials/tuners/bayes_opt_gp.rst +++ b/docs/tutorials/tuners/bayes_opt_gp.rst @@ -126,6 +126,7 @@ Command Line Usage First, we download the Iris dataset to the desired folder. .. code-block:: console + :test: $ wget http://download.tensorflow.org/data/iris_training.csv $ wget http://download.tensorflow.org/data/iris_test.csv $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv @@ -134,6 +135,8 @@ We create a JSON file with the hyperparameter search space: parameters.json .. code-block:: console + :test: + :filepath: parameters.json { "learning_rate": [0.01, 0.1], "n_estimators": [20, 200], @@ -143,6 +146,7 @@ parameters.json In the same folder, we perform the CLI tune command. .. code-block:: console + :test: $ dffml tune \ -model xgbclassifier \ -model-features \ diff --git a/docs/tutorials/tuners/random_search.rst b/docs/tutorials/tuners/random_search.rst new file mode 100644 index 0000000000..8a88562d7c --- /dev/null +++ b/docs/tutorials/tuners/random_search.rst @@ -0,0 +1,167 @@ +Tuning a DFFML model with Random Search +=============== + +For this tutorial, we'll be performing hyperparameter tuning on a DFFML model using DFFML's integrated "tune" method. 
We will be using the XGBClassifier model and RandomSearch tuner for this example, but note that these are
+interchangeable for any DFFML Model and Tuner respectively.
+
+As we know, a machine learning model yields accurate predictions to unseen data by fitting itself to the
+training dataset. However, different initial configurations to certain model parameters will affect the performance
+of the trained model. For instance, a neural network that is allowed to train for several epochs on a dataset
+typically outperforms another that has only trained a single epoch. We call these parameters to be modified in
+pre-training "hyperparameters", and it is normally the job of the ML engineer to try many different hyperparameter
+configurations to find the best-performing model.
+
+This process can be automated using a hyperparameter tuning method, which tries a series of configurations on your
+behalf; such methods include random search, grid search, bayesian optimization and more. Here, we will be using
+RandomSearch, where the tuner tries a random combination of hyperparameters provided by the user for a fixed number of
+iterations, and selects the best model based on a given metric. We will be tuning the XGBClassifier
+model based on a dictionary of values provided in a JSON file, and returning the configuration with the highest accuracy on a
+holdout validation set.
+
+First, download the xgboost plugin for the DFFML library, which can be done via pip:
+
+.. code-block:: console
+    :test:
+
+    $ pip install -U dffml-model-xgboost
+
+We can utilize DFFML's tune method via the Python API. In the following code, we demonstrate its usage in a Python
+file:
+
+.. code-block:: console
+    :test:
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+
+    from dffml import Feature, Features
+    from dffml.noasync import tune
+    from dffml.accuracy import ClassificationAccuracy
+    from dffml.tuner.random_search import RandomSearch
+    from dffml_model_xgboost.xgbclassifier import (
+        XGBClassifierModel,
+        XGBClassifierModelConfig,
+    )
+
+    iris = load_iris()
+    y = iris["target"]
+    X = iris["data"]
+    trainX, testX, trainy, testy = train_test_split(
+        X, y, test_size=0.1, random_state=123
+    )
+
+    # Configure the model
+    model = XGBClassifierModel(
+        XGBClassifierModelConfig(
+            features=Features(Feature("data", float,)),
+            predict=Feature("target", float, 1),
+            location="model",
+            max_depth=3,
+            learning_rate=0.01,
+            n_estimators=200,
+            reg_lambda=1,
+            reg_alpha=0,
+            gamma=0,
+            colsample_bytree=0,
+            subsample=1,
+        )
+    )
+
+    # Instantiate the accuracy scorer used to compare configurations
+    scorer = ClassificationAccuracy()
+
+    # Configure the tuner search space in a dictionary
+    # A random combination of these values is tried on each trial, even if the
+    # parameter's value has been set in the model.
+    tuner = RandomSearch(
+        parameters = {
+            "learning_rate": [0.01, 0.05, 0.1],
+            "n_estimators": [20, 100, 200],
+            "max_depth": [3, 5, 8]
+        },
+        objective = "max",
+        trials=15
+    )
+
+    # Tune function saves the best model and returns its score
+    print("Tuning accuracy:",
+        tune(
+            model,
+            tuner,
+            scorer,
+            Feature("target", float, 1),
+            [{"data": x, "target": y} for x, y in zip(trainX, trainy)],
+            [{"data": x, "target": y} for x, y in zip(testX, testy)],
+        )
+    )
+
+The tune function takes in 6 arguments:
+
+    model : Model
+        Machine Learning model to use. See :doc:`/plugins/dffml_model` for
+        models options.
+
+    tuner: Tuner
+        Hyperparameter tuning method to use. See :doc:`/plugins/dffml_tuner` for
+        tuner options.
+ + scorer: Scorer + Method to evaluate the performance of the model, inheriting from AccuracyScorer + class. + + predict_feature: Union[Features, Feature] + A feature indicating the feature you wish to predict. + + train_ds : list + Input data for training. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + + valid_ds : list + Validation data for testing. Could be a ``dict``, :py:class:`Record`, + filename, one of the data :doc:`/plugins/dffml_source`, or a filename + with the extension being one of the data sources. + +Command Line Usage +------------------ + +First, we download the Iris dataset to the desired folder. + +.. code-block:: console + :test: + $ wget http://download.tensorflow.org/data/iris_training.csv + $ wget http://download.tensorflow.org/data/iris_test.csv + $ sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' iris_training.csv iris_test.csv + +We create a JSON file with the hyperparameter search space: + +parameters.json +.. code-block:: console + :test: + :filepath: parameters.json + { + "learning_rate": [0.01, 0.05, 0.1], + "n_estimators": [20, 100, 200], + "max_depth": [3,5,8] + } + +In the same folder, we perform the CLI tune command. + +.. code-block:: console + :test: + $ dffml tune \ + -model xgbclassifier \ + -model-features \ + SepalLength:float:1 \ + SepalWidth:float:1 \ + PetalLength:float:1 \ + -model-predict classification \ + -model-location tempDir \ + -tuner random_search \ + -tuner-parameters @parameters.json \ + -tuner-objective max \ + -scorer clf \ + -sources train=csv test=csv \ + -source-train-filename iris_training.csv \ + -source-test-filename iris_test.csv \ + -source-train-tag train \ + -source-test-tag test \ + -features classification:int:1 \ No newline at end of file
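+
+Once tuning finishes, the best hyperparameters found are applied to the model, which is retrained and
+saved to the location given above. As a rough sketch (the file names and model location are assumed from
+the commands earlier in this tutorial, and the prediction output format follows the test examples in this
+patch series), the tuned model could then be used for predictions via the Python API:
+
+.. code-block:: python
+
+    from dffml import Feature, Features, CSVSource
+    from dffml.noasync import predict
+    from dffml_model_xgboost.xgbclassifier import XGBClassifierModel
+
+    # Point the model at the location the CLI tune command saved it to
+    model = XGBClassifierModel(
+        features=Features(
+            Feature("SepalLength", float, 1),
+            Feature("SepalWidth", float, 1),
+            Feature("PetalLength", float, 1),
+        ),
+        predict=Feature("classification", int, 1),
+        location="tempDir",
+    )
+
+    # Iterate over predictions on the held-out test set
+    for key, features, prediction in predict(
+        model, CSVSource(filename="iris_test.csv")
+    ):
+        print(key, prediction["classification"]["value"])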