diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala index 1bdb65d56c..859cece2e8 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala @@ -22,7 +22,13 @@ import ai.h2o.sparkling.api.generation.common.{EntitySubstitutionContext, ModelM object MetricsInitTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => String) with PythonEntityTemplate { def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { - val metricClasses = metricSubstitutionContexts.map(_.entityName) + val metricClasses = metricSubstitutionContexts.map { metricSubstitutionContext => + if (metricSubstitutionContext.entityName.endsWith("Base")) { + metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) + } else { + metricSubstitutionContext.entityName + } + } val imports = metricClasses.map(metricClass => s"ai.h2o.sparkling.ml.metrics.$metricClass.$metricClass") val entitySubstitutionContext = EntitySubstitutionContext(null, null, null, imports) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/__init__.py b/py-scoring/src/ai/h2o/sparkling/ml/__init__.py index f0a1ffabe2..78169a4807 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/__init__.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/__init__.py @@ -20,3 +20,4 @@ from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2ODRFMOJOModel, H2OIsolationForestMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings from ai.h2o.sparkling.ml.models import H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OWord2VecMOJOModel +from ai.h2o.sparkling.ml.metrics import 
H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py index d7ade2d008..be6a87ef17 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -30,12 +30,26 @@ def calculate(dataFrame, labelCol = "label", weightCol = None, offsetCol = None): + ''' + The method calculates binomial metrics on a provided data frame with predictions and actual values. + :param dataFrame: A data frame with predictions and actual values + :param domain: A list of classes representing negative and positive response. Negative class must be at position 0 + and positive at 1 + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or an array type or + vector of doubles. First item must be 0.0 or 1.0 representing negative or positive response. The other items + must be probabilities to predict given probability classes. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param offsetCol: The name of an offset column. 
+ :return: Calculated binomial metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculateInternal(dataFrame._jdf, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol) return H2OBinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py index e7432ed5b3..7a7854b455 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -30,12 +30,31 @@ def calculate(dataFrame, labelCol = "label", weightCol = None, aucType = "AUTO"): + ''' + The method calculates multinomial metrics on a provided data frame with predictions and actual values. + :param dataFrame: A data frame with predictions and actual values. + :param domain: List of response classes. + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or an array type or + vector of doubles. First item must be 0.0, 1.0, 2.0 representing indexes of response classes. The other + items must be probabilities to predict given probability classes. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param aucType: Type of multinomial AUC/AUCPR calculation. 
Possible values: + - AUTO, + - NONE, + - MACRO_OVR, + - WEIGHTED_OVR, + - MACRO_OVO, + - WEIGHTED_OVO + :return: Calculated multinomial metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - aucType) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculateInternal(dataFrame._jdf, + domain, + predictionCol, + labelCol, + weightCol, + aucType) return H2OMultinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py index 2daa054f86..b82dd6ded4 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py @@ -25,17 +25,26 @@ class H2ORegressionMetrics(H2ORegressionMetricsBase): @staticmethod def calculate(dataFrame, - domain, predictionCol = "detailed_prediction", labelCol = "label", weightCol = None, offsetCol = None): + ''' + The method calculates regression metrics on a provided data frame with predictions and actual values. + :param dataFrame: A data frame with predictions and actual values + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + it must be of DoubleType or FloatType. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param offsetCol: The name of an offset column. 
+ :return: Calculated regression metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculateInternal(dataFrame._jdf, + predictionCol, + labelCol, + weightCol, + offsetCol) return H2ORegressionMetrics(javaMetrics) diff --git a/py-scoring/src/pysparkling/ml/__init__.py b/py-scoring/src/pysparkling/ml/__init__.py index 7aa23d5515..3f2cfc4c2d 100644 --- a/py-scoring/src/pysparkling/ml/__init__.py +++ b/py-scoring/src/pysparkling/ml/__init__.py @@ -16,12 +16,14 @@ # from pysparkling.ml.models import * +from pysparkling.ml.metrics import * __all__ = ["H2OMOJOModel", "H2OSupervisedMOJOModel", "H2OTreeBasedSupervisedMOJOModel", "H2OUnsupervisedMOJOModel", "H2OTreeBasedUnsupervisedMOJOModel", "H2OMOJOPipelineModel", "H2OMOJOSettings", "H2OBinaryModel", "H2OKMeansMOJOModel", "H2OGLMMOJOModel", "H2OGAMMOJOModel", "H2OGBMMOJOModel", "H2OXGBoostMOJOModel", "H2ODeepLearningMOJOModel", "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OPCAMOJOModel", - "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel"] + "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel", + "H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"] from pysparkling.initializer import Initializer diff --git a/py-scoring/src/pysparkling/ml/metrics/__init__.py b/py-scoring/src/pysparkling/ml/metrics/__init__.py new file mode 100644 index 0000000000..a24e87398c --- /dev/null +++ b/py-scoring/src/pysparkling/ml/metrics/__init__.py @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OMultinomialMetrics, H2OBinomialMetrics + +__all__ = ["H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"] diff --git a/py/src/ai/h2o/sparkling/ml/__init__.py b/py/src/ai/h2o/sparkling/ml/__init__.py index 89f8998524..d4748a8ba2 100644 --- a/py/src/ai/h2o/sparkling/ml/__init__.py +++ b/py/src/ai/h2o/sparkling/ml/__init__.py @@ -27,3 +27,4 @@ from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2OWord2VecMOJOModel, H2OAutoEncoderMOJOModel, H2ODRFMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel from ai.h2o.sparkling.ml.models import H2OIsolationForestMOJOModel, H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OStackedEnsembleMOJOModel from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics diff --git a/py/src/pysparkling/ml/__init__.py b/py/src/pysparkling/ml/__init__.py index 2a5c74fab7..3a1721c887 100644 --- a/py/src/pysparkling/ml/__init__.py +++ b/py/src/pysparkling/ml/__init__.py @@ -19,6 +19,7 @@ from pysparkling.ml.algos.regression import * from pysparkling.ml.features import * from pysparkling.ml.models import * +from 
pysparkling.ml.metrics import * __all__ = ["ColumnPruner", "H2OGBM", "H2ODeepLearning", "H2OAutoML", "H2OXGBoost", "H2OGLM", "H2OCoxPH", "H2OGAM", "H2OMOJOModel", "H2OAlgorithmMOJOModel", "H2OFeatureMOJOModel", "H2OSupervisedMOJOModel", @@ -32,7 +33,7 @@ "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OWord2Vec", "H2OWord2VecMOJOModel", "H2OAutoEncoder", "H2OAutoEncoderMOJOModel", "H2OPCA", "H2OPCAMOJOModel", "H2OGLRM", "H2OGLRMMOJOModel", "H2ORuleFit", "H2ORuleFitClassifier", "H2ORuleFitRegressor", "H2ORuleFitMOJOModel", "H2OStackedEnsemble", - "H2OStackedEnsembleMOJOModel"] + "H2OStackedEnsembleMOJOModel", "H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"] from pysparkling.initializer import Initializer diff --git a/py/src/pysparkling/ml/metrics/__init__.py b/py/src/pysparkling/ml/metrics/__init__.py new file mode 100644 index 0000000000..9bec18e1f3 --- /dev/null +++ b/py/src/pysparkling/ml/metrics/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics + + +__all__ = ["H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"] diff --git a/py/tests/unit/with_runtime_sparkling/conftest.py b/py/tests/unit/with_runtime_sparkling/conftest.py index 2b8c0799d2..e7350c7d13 100644 --- a/py/tests/unit/with_runtime_sparkling/conftest.py +++ b/py/tests/unit/with_runtime_sparkling/conftest.py @@ -60,6 +60,11 @@ def irisDatasetPath(): return "file://" + os.path.abspath("../examples/smalldata/iris/iris_wheader.csv") +@pytest.fixture(scope="module") +def irisDataset(spark, irisDatasetPath): + return spark.read.csv(irisDatasetPath, header=True, inferSchema=True) + + @pytest.fixture(scope="module") def airlinesDatasetPath(): return "file://" + os.path.abspath("../examples/smalldata/airlines/allyears2k_headers.csv") diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py index f0f71390ab..def81191ef 100644 --- a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py +++ b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py @@ -17,16 +17,11 @@ import os from pysparkling.ml import * -from ai.h2o.sparkling.ml.models.H2OBinomialMetrics import H2OBinomialMetrics -from ai.h2o.sparkling.ml.models.H2OMultinomialMetrics import H2OMultinomialMetrics -from ai.h2o.sparkling.ml.models.H2ORegressionMetrics import H2ORegressionMetrics -from ai.h2o.sparkling.ml.models.H2OMOJOModel import H2OMOJOModel - def testRegressionMetricsCalculation(prostateDataset): mojo = H2OMOJOModel.createFromMojo( "file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo")) - metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "capsule") + metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "CAPSULE") assert metrics is not None @@ -34,7 +29,7 @@ def 
testBinomialMetricsCalculation(prostateDataset): mojo = H2OMOJOModel.createFromMojo( "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")) domain = mojo.getDomainValues()["capsule"] - metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "capsule") + metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "CAPSULE") assert metrics is not None diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 235c825517..57ae6ddec7 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -68,14 +68,15 @@ object H2OBinomialMetrics extends MetricCalculation { result } - def calculate( + // This method is intended to be called from the Python/R API + def calculateInternal( dataFrame: DataFrame, - domain: Array[String], + domain: java.util.ArrayList[String], predictionCol: String, labelCol: String, weightCol: String, offsetCol: String): Unit = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) + calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index b9a31b50e7..a65fe556fb 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -22,7 +22,7 @@ import hex.MultinomialAucType import org.apache.spark.{ExposeUtils, ml, mllib} import 
org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType} @MetricsDescription( @@ -38,7 +38,7 @@ object H2OMultinomialMetrics extends MetricCalculation { /** * The method calculates multinomial metrics on a provided data frame with predictions and actual values. * - * @param dataFrame A data frame with predictions and actual values + * @param dataFrame A data frame with predictions and actual values. * @param domain Array of response classes. * @param predictionCol The name of prediction column. The prediction column must have the same type as * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or @@ -77,14 +77,15 @@ object H2OMultinomialMetrics extends MetricCalculation { result } - def calculate( + // This method is intended to be called from the Python/R API + def calculateInternal( dataFrame: DataFrame, - domain: Array[String], + domain: java.util.ArrayList[String], predictionCol: String, labelCol: String, weightCol: String, aucType: String): H2OMultinomialMetrics = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType) + calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 8b55e92ede..54e7ac3209 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -63,7 +63,8 @@ object H2ORegressionMetrics extends 
MetricCalculation { result } - def calculate( + // This method is intended to be called from the Python/R API + def calculateInternal( dataFrame: DataFrame, predictionCol: String, labelCol: String,