From 753562d3fd133d131b053456af7f273c6e8d3101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 15 Nov 2021 12:46:38 +0100 Subject: [PATCH 01/37] [SW-2646] Calculate Metrics on Arbitrary Dataset --- .../common/AlgorithmConfigurations.scala | 37 ++- .../common/AlgorithmSubstitutionContext.scala | 1 + .../common/AutoMLConfiguration.scala | 2 +- .../FeatureEstimatorConfigurations.scala | 1 + .../common/GridSearchConfiguration.scala | 2 +- .../generation/scala/MOJOModelTemplate.scala | 10 +- .../scala/ai/h2o/sparkling/TestUtils.scala | 46 ++- .../h2o/sparkling/doc/generation/Runner.scala | 1 + doc/src/site/sphinx/deployment/load_mojo.rst | 13 +- extensions/build.gradle | 1 + .../META-INF/services/water.TypeMapExtension | 1 + .../hex/MetricsCalculationTypeExtensions.java | 82 +++++ gradle.properties | 4 +- .../algos/BinomialPredictionTestSuite.scala | 53 ---- .../ml/algos/H2OKMeansTestSuite.scala | 7 + .../ml/algos/OrdinalPredictionTestSuite.scala | 22 -- .../algos/RegressionPredictionTestSuite.scala | 34 -- .../AnomalyDetectionMetricsTestSuite.scala | 55 ++++ .../metrics/AutoEncoderMetricsTestSuite.scala | 56 ++++ .../ml/metrics/BinomialMetricsTestSuite.scala | 278 +++++++++++++++++ .../metrics/ClusteringMetricsTestSuite.scala | 76 +++++ .../DimReductionMetricsTestSuite.scala | 84 +++++ .../ml/metrics/MetricsAssertions.scala | 62 +++- .../metrics/MultinomialMetricsTestSuite.scala | 294 ++++++++++++++++++ .../metrics/NoRuntimeMetricsTestSuite.scala | 67 ++++ .../ml/metrics/OrdinalMetricsTestSuite.scala | 182 +++++++++++ .../metrics/RegressionMetricsTestSuite.scala | 211 +++++++++++++ .../sparkling/ml/params/H2OMOJOModelParams.py | 15 + .../unit/with_runtime_sparkling/test_mojo.py | 26 +- .../ai/h2o/sparkling/ml/models/H2OMOJOModel.R | 9 + r/src/tests/testthat/testMojo.R | 28 ++ scoring/build.gradle | 5 + .../ml/metrics/GLRMMetricCalculation.scala | 33 ++ .../h2o/sparkling/ml/metrics/H2OMetrics.scala | 8 +- 
.../ml/metrics/KmeansMetricCalculation.scala | 42 +++ .../ml/metrics/MetricCalculation.scala | 207 ++++++++++++ .../sparkling/ml/models/H2OMOJOModel.scala | 9 +- .../ml/models/H2OSupervisedMOJOModel.scala | 10 + 38 files changed, 1930 insertions(+), 144 deletions(-) create mode 100644 extensions/src/main/resources/META-INF/services/water.TypeMapExtension create mode 100644 extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala create mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala index 9a6a44817b..ca843c4694 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala +++ 
b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala @@ -112,7 +112,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { type KMeansParamsV3 = KMeansV3.KMeansParametersV3 val explicitDefaultValues = - Map[String, Any]("max_w2" -> 3.402823e38f, "response_column" -> "label", "model_id" -> null, "lambda" -> null) + Map[String, Any]("max_w2" -> 3.402823e38f, "response_column" -> "label", "model_id" -> null) val noDeprecation = Seq.empty @@ -173,25 +173,34 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { type IFParameters = IsolationForestParameters - val algorithms = Seq[(String, Class[_], String, Seq[String], Option[String])]( - ("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), None), - ("H2OGBM", classOf[GBMParameters], treeSupervised, Seq(withDistribution), None), - ("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), None), - ("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), Some("H2OGLMMetrics")), - ("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), None), - ("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), None), - ("H2ORuleFit", classOf[RuleFitParameters], supervised, Seq(withDistribution), None), - ("H2OKMeans", classOf[KMeansParameters], unsupervised, Seq("H2OKMeansExtras"), Some("H2OClusteringMetrics")), - ("H2OCoxPH", classOf[CoxPHParameters], supervised, Seq.empty, Some("H2ORegressionCoxPHMetrics")), - ("H2OIsolationForest", classOf[IFParameters], treeUnsupervised, Seq.empty, Some("H2OAnomalyMetrics"))) - - for ((entityName, h2oParametersClass: Class[_], algorithmType, extraParents, metricsClass) <- algorithms) + val none = Seq.empty + + val algorithms = Seq[(String, Class[_], String, Seq[String], Seq[String], Option[String])]( + ("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), none, None), + ("H2OGBM", 
classOf[GBMParameters], treeSupervised, Seq(withDistribution), none, None), + ("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), none, None), + ("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), none, Some("H2OGLMMetrics")), + ("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), none, None), + ("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), none, None), + ("H2ORuleFit", classOf[RuleFitParameters], supervised, Seq(withDistribution), none, None), + ( + "H2OKMeans", + classOf[KMeansParameters], + unsupervised, + Seq("H2OKMeansExtras"), + Seq("KmeansMetricCalculation"), + Some("H2OClusteringMetrics")), + ("H2OCoxPH", classOf[CoxPHParameters], supervised, none, none, Some("H2ORegressionCoxPHMetrics")), + ("H2OIsolationForest", classOf[IFParameters], treeUnsupervised, none, none, Some("H2OAnomalyMetrics"))) + + for ((entityName, h2oParametersClass: Class[_], algorithmType, extraParents, extraMOJOParents, metricsClass) <- algorithms) yield AlgorithmSubstitutionContext( namespace = "ai.h2o.sparkling.ml.algos", entityName, h2oParametersClass, algorithmType, extraParents, + extraMOJOParents, specificMetricsClass = metricsClass) } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala index afb4975de2..a5ac7105e0 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala @@ -23,6 +23,7 @@ case class AlgorithmSubstitutionContext( h2oSchemaClass: Class[_], algorithmType: String, extraInheritedEntities: Seq[String] = Seq.empty, + extraInheritedEntitiesOnMOJO: Seq[String] = Seq.empty, constructorMethods: Boolean = true, specificMetricsClass: Option[String] = 
None) extends SubstitutionContextBase diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala index 724724e0f9..8a0822344e 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala @@ -52,7 +52,7 @@ class AutoMLConfiguration extends SingleAlgorithmConfiguration { null, "H2OSupervisedAlgorithmWithFoldColumn", Seq("H2OAutoMLExtras"), - false)) + constructorMethods = false)) } override def problemSpecificAlgorithmConfiguration: Seq[ProblemSpecificAlgorithmSubstitutionContext] = { diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala index b74ba73c43..0eeccb76ba 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala @@ -115,6 +115,7 @@ class FeatureEstimatorConfigurations extends MultipleAlgorithmsConfiguration { override def algorithmConfiguration: Seq[AlgorithmSubstitutionContext] = { + def none = Seq.empty[String] val algorithms = Seq[(String, Class[_], String, Option[String])]( ("H2OAutoEncoder", classOf[DeepLearningParameters], "H2OAutoEncoderBase", Some("H2OAutoEncoderMetrics")), ("H2OPCA", classOf[PCAParameters], "H2ODimReductionEstimator", Some("H2OPCAMetrics")), diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala index 2e5a0241b3..ba0e8ae10a 100644 --- 
a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala @@ -71,6 +71,6 @@ class GridSearchConfiguration extends SingleAlgorithmConfiguration { null, "H2OAlgorithm", Seq("H2OGridSearchExtras"), - false)) + constructorMethods = false)) } } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala index 9b925f77b9..39d783055a 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala @@ -46,6 +46,7 @@ object MOJOModelTemplate val imports = Seq( "com.google.gson.JsonObject", "ai.h2o.sparkling.ml.params.ParameterConstructorMethods", + "ai.h2o.sparkling.ml.metrics._", "hex.genmodel.MojoModel", "org.apache.spark.expose.Logging", "ai.h2o.sparkling.utils.DataFrameSerializationWrappers._") ++ @@ -59,7 +60,9 @@ object MOJOModelTemplate .replace("Estimator", "MOJOModel") .replaceFirst("Base$", "MOJOBase"), "ParameterConstructorMethods", - "Logging") ++ explicitFieldImplementations + "Logging") ++ + explicitFieldImplementations ++ + algorithmSubstitutionContext.extraInheritedEntitiesOnMOJO val entityName = algorithmSubstitutionContext.entityName val entityParameters = "(override val uid: String)" @@ -212,6 +215,11 @@ object MOJOModelTemplate | override def getCrossValidationMetricsObject(): $metrics = { | val value = super.getCrossValidationMetricsObject() | if (value == null) null else value.asInstanceOf[$metrics] + | } + | + | override def getMetricsObject(dataFrame: org.apache.spark.sql.DataFrame): $metrics = { + | val value = super.getMetricsObject(dataFrame) + | if (value == null) null else value.asInstanceOf[$metrics] | }""".stripMargin } } diff 
--git a/core/src/test/scala/ai/h2o/sparkling/TestUtils.scala b/core/src/test/scala/ai/h2o/sparkling/TestUtils.scala index feca5a8d02..f90c7d139f 100644 --- a/core/src/test/scala/ai/h2o/sparkling/TestUtils.scala +++ b/core/src/test/scala/ai/h2o/sparkling/TestUtils.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -import org.apache.spark.sql.functions.{lit, rand} +import org.apache.spark.sql.functions.{lit, rand, col, abs} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.scalatest.Matchers @@ -100,6 +100,76 @@ object TestUtils extends Matchers { """.stripMargin) }
+  /**
+   * Asserts that two data frames are equal, allowing the same absolute tolerance on every numeric column.
+   *
+   * @param expected       data frame holding the reference values
+   * @param produced       data frame under test
+   * @param identityColumn column used to pair the rows of both data frames
+   * @param tolerance      maximal absolute difference accepted for each numeric column of `expected`
+   */
+  def assertDataFramesAreEqual(
+      expected: DataFrame,
+      produced: DataFrame,
+      identityColumn: String,
+      tolerance: Double): Unit = {
+    val tolerances = expected.schema.fields
+      .filterNot(_.name == identityColumn)
+      .filter(_.dataType.isInstanceOf[NumericType])
+      .map(_.name -> tolerance)
+      .toMap
+    assertDataFramesAreEqual(expected, produced, identityColumn, tolerances)
+  }
+
+  /**
+   * Asserts that two data frames share the schema and, once paired on `identityColumn`, hold equal rows.
+   *
+   * Columns listed in `tolerances` may differ by at most the given absolute value, the remaining ones
+   * must match exactly. The comparison is null-safe: two nulls are equal, a null never equals a value.
+   * A plain `===` yields null for such pairs and the affected rows would silently pass the check.
+   *
+   * @param expected       data frame holding the reference values
+   * @param produced       data frame under test
+   * @param identityColumn column used to pair the rows of both data frames
+   * @param tolerances     maximal absolute difference accepted per column name
+   */
+  def assertDataFramesAreEqual(
+      expected: DataFrame,
+      produced: DataFrame,
+      identityColumn: String,
+      tolerances: Map[String, Double] = Map.empty): Unit = {
+    expected.schema shouldEqual produced.schema
+    val intersection = expected.as("expected").join(produced.as("produced"), identityColumn)
+    // A single count is enough; each input row has to find its counterpart in the join.
+    val intersectionCount = intersection.count()
+    intersectionCount shouldEqual expected.count()
+    intersectionCount shouldEqual produced.count()
+    val isEqualExpression = expected.columns.foldLeft(lit(true)) {
+      case (partialExpression, columnName) =>
+        val expectedValue = col(s"expected.$columnName")
+        val producedValue = col(s"produced.$columnName")
+        val columnComparison = tolerances.get(columnName) match {
+          case Some(tolerance) =>
+            val bothNull = expectedValue.isNull && producedValue.isNull
+            val bothPresent = expectedValue.isNotNull && producedValue.isNotNull
+            bothNull || (bothPresent && abs(expectedValue - producedValue) <= lit(tolerance))
+          case None if columnName == identityColumn => lit(true)
+          case None => expectedValue <=> producedValue
+        }
+        partialExpression && columnComparison
+    }
+    // Report every row that is not provably equal, including rows whose comparison evaluated to null.
+    val differentIds = intersection
+      .withColumn("isEqual", isEqualExpression)
+      .filter(!(col("isEqual") <=> lit(true)))
+      .select(col(s"expected.$identityColumn") as "id")
+      .collect()
+      .map(_.get(0))
+    assert(
+      differentIds.isEmpty,
+      s"The rows of ids($identityColumn) [${differentIds.mkString(", ")}] are not equal.")
+  }
+
 def assertDatasetBasicProperties[T <: Product]( ds: Dataset[T], df: H2OFrame, diff --git a/doc/src/main/scala/ai/h2o/sparkling/doc/generation/Runner.scala b/doc/src/main/scala/ai/h2o/sparkling/doc/generation/Runner.scala index 2b0d592ac8..29ded6c37c 100644 --- a/doc/src/main/scala/ai/h2o/sparkling/doc/generation/Runner.scala +++ b/doc/src/main/scala/ai/h2o/sparkling/doc/generation/Runner.scala @@ -98,6 +98,7 @@ object Runner { } } else { val metricClasses = getParamClasses("ai.h2o.sparkling.ml.metrics") + .filter(_.getSimpleName.endsWith("Metrics")) writeResultToFile(MetricsTocTreeTemplate(metricClasses), "metrics", destinationDir) for (metricClass <- metricClasses) { val content = MetricsTemplate(metricClass) diff --git a/doc/src/site/sphinx/deployment/load_mojo.rst b/doc/src/site/sphinx/deployment/load_mojo.rst index 1c549b61b3..e4b5377e07 100644 --- a/doc/src/site/sphinx/deployment/load_mojo.rst +++ b/doc/src/site/sphinx/deployment/load_mojo.rst @@ -362,8 +362,8 @@ Obtaining Scoring History The method ``getScoringHistory`` returns a data frame describing how the model evolved during the training process according to a certain training and validation metrics. -Obtaining Metrics -^^^^^^^^^^^^^^^^^ +Obtaining Pre-calculated Metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There are two sets of methods to obtain metrics from the MOJO model. @@ -389,6 +389,15 @@ the metrics could be also of a complex type.
(see :ref:`metrics` for details) There is also the method ``getCurrentMetricsObject()`` working a similar way as ``getCurrentMetrics()``. +Calculation of Metrics on Arbitrary Dataset +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The below two methods calculate metrics on a provided dataset. + +- ``getMetrics(dataFrame)`` - Returns a map with basic metrics of double type + +- ``getMetricsObject(dataFrame)`` - Returns an object with basic and more complex metrics available via getter methods. + (see :ref:`metrics` for details) + Obtaining Cross Validation Metrics Summary ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``getCrossValidationMetricsSummary`` method returns data frame with information about performance of individual folds diff --git a/extensions/build.gradle b/extensions/build.gradle index bfea309244..7acd163eca 100644 --- a/extensions/build.gradle +++ b/extensions/build.gradle @@ -5,6 +5,7 @@ dependencies { compileOnly("org.scala-lang:scala-library:${scalaVersion}") compileOnly("ai.h2o:h2o-core:${h2oVersion}") + compileOnly("ai.h2o:h2o-algos:${h2oVersion}") compileOnly("javax.servlet:servlet-api:2.5") testImplementation("org.scala-lang:scala-library:${scalaVersion}") diff --git a/extensions/src/main/resources/META-INF/services/water.TypeMapExtension b/extensions/src/main/resources/META-INF/services/water.TypeMapExtension new file mode 100644 index 0000000000..9cd8d7c16b --- /dev/null +++ b/extensions/src/main/resources/META-INF/services/water.TypeMapExtension @@ -0,0 +1 @@ +hex.MetricsCalculationTypeExtensions diff --git a/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java new file mode 100644 index 0000000000..bd1b472a4b --- /dev/null +++ b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java @@ -0,0 +1,82 @@ +package hex; + +import hex.glm.IndependentGLMMetricBuilder; +import hex.glrm.ModelMetricsGLRM; +import hex.pca.ModelMetricsPCA; +import 
hex.tree.isofor.ModelMetricsAnomaly; +import java.util.Arrays; +import water.TypeMapExtension; +import water.api.ModelMetricsPCAV3; +import water.api.schemas3.*; + +public class MetricsCalculationTypeExtensions implements TypeMapExtension { + public static final String[] MODEL_BUILDER_CLASSES = { + ModelMetrics.IndependentMetricBuilder.class.getName(), + ModelMetricsSupervised.IndependentMetricBuilderSupervised.class.getName(), + ModelMetricsUnsupervised.IndependentMetricBuilderUnsupervised.class.getName(), + ModelMetricsBinomial.IndependentMetricBuilderBinomial.class.getName(), + AUC2.AUCBuilder.class.getName(), + ModelMetricsRegression.IndependentMetricBuilderRegression.class.getName(), + Distribution.class.getName(), + GaussianDistribution.class.getName(), + BernoulliDistribution.class.getName(), + QuasibinomialDistribution.class.getName(), + ModifiedHuberDistribution.class.getName(), + MultinomialDistribution.class.getName(), + PoissonDistribution.class.getName(), + GammaDistribution.class.getName(), + TweedieDistribution.class.getName(), + HuberDistribution.class.getName(), + LaplaceDistribution.class.getName(), + QuantileDistribution.class.getName(), + CustomDistribution.class.getName(), + CustomDistributionWrapper.class.getName(), + LinkFunction.class.getName(), + IdentityFunction.class.getName(), + InverseFunction.class.getName(), + LogFunction.class.getName(), + LogitFunction.class.getName(), + OlogitFunction.class.getName(), + OloglogFunction.class.getName(), + OprobitFunction.class.getName(), + ModelMetricsMultinomial.IndependentMetricBuilderMultinomial.class.getName(), + ModelMetricsOrdinal.IndependentMetricBuilderOrdinal.class.getName(), + ModelMetricsClustering.IndependentMetricBuilderClustering.class.getName(), + ModelMetricsHGLM.IndependentMetricBuilderHGLM.class.getName(), + ModelMetricsGLRM.IndependentGLRMModelMetricsBuilder.class.getName(), + ModelMetricsAnomaly.IndependentMetricBuilderAnomaly.class.getName(), + 
IndependentGLMMetricBuilder.class.getName(), + hex.glm.GLMModel.GLMWeightsFun.class.getName(), + ModelMetricsAutoEncoder.IndependentAutoEncoderMetricBuilder.class.getName(), + ModelMetricsPCA.IndependentPCAMetricBuilder.class.getName() + }; + + public static final String[] SCHEMA_CLASSES = { + ModelMetricsBaseV3.class.getName(), + ModelMetricsBinomialGLMV3.class.getName(), + ModelMetricsBinomialV3.class.getName(), + ModelMetricsMultinomialGLMV3.class.getName(), + ModelMetricsMultinomialV3.class.getName(), + ModelMetricsOrdinalGLMV3.class.getName(), + ModelMetricsOrdinalV3.class.getName(), + ModelMetricsRegressionGLMV3.class.getName(), + ModelMetricsRegressionCoxPHV3.class.getName(), + ModelMetricsRegressionV3.class.getName(), + ModelMetricsAutoEncoderV3.class.getName(), + ModelMetricsPCAV3.class.getName(), + ModelMetricsHGLMV3.class.getName(), + ModelMetricsClusteringV3.class.getName(), + ConfusionMatrixV3.class.getName(), + TwoDimTableV3.class.getName(), + TwoDimTableV3.ColumnSpecsBase.class.getName() + }; + + @Override + public String[] getBoostrapClasses() { + String[] result = + Arrays.copyOf(MODEL_BUILDER_CLASSES, MODEL_BUILDER_CLASSES.length + SCHEMA_CLASSES.length); + System.arraycopy( + SCHEMA_CLASSES, 0, result, MODEL_BUILDER_CLASSES.length, SCHEMA_CLASSES.length); + return result; + } +} diff --git a/gradle.properties b/gradle.properties index 6055f1b8f3..b58887a6a1 100644 --- a/gradle.properties +++ b/gradle.properties @@ -29,11 +29,11 @@ pythonEnvironments=2.7 3.6 3.7 3.8 # Select for which Spark version is Sparkling Water built by default spark=3.2 # Sparkling Water Version -version=3.38.0.1-1-SNAPSHOT +version=3.38.0.1-199-SNAPSHOT # Spark version from which is Kubernetes Supported kubernetesSupportSinceSpark=2.4 databricksTestSinceSpark=2.4 spotlessModern=true -testH2OBranch=master +testH2OBranch=mn/PUBDEV-8373 makeBooklet=false testingBaseImage="harbor.h2o.ai/opsh2oai/h2o-3-hadoop-cdh-6.3:84" diff --git 
a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/BinomialPredictionTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/BinomialPredictionTestSuite.scala index 6706574b58..7254f223d0 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/BinomialPredictionTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/BinomialPredictionTestSuite.scala @@ -183,57 +183,4 @@ class BinomialPredictionTestSuite extends FunSuite with Matchers with SharedH2OT assert(schema == expectedSchema) assert(schema == expectedSchemaByTransform) } - - private def assertMetrics[T](model: H2OMOJOModel): Unit = { - assertMetrics[T](model.getTrainingMetricsObject(), model.getTrainingMetrics()) - assertMetrics[T](model.getValidationMetricsObject(), model.getValidationMetrics()) - assert(model.getCrossValidationMetricsObject() == null) - assert(model.getCrossValidationMetrics() == Map()) - } - - private def assertMetrics[T](metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { - metricsObject.isInstanceOf[T] should be(true) - MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) - val binomialObject = metricsObject.asInstanceOf[H2OBinomialMetrics] - binomialObject.getConfusionMatrix().count() > 0 - binomialObject.getConfusionMatrix().columns.length > 0 - binomialObject.getGainsLiftTable().count() > 0 - binomialObject.getGainsLiftTable().columns.length > 0 - binomialObject.getMaxCriteriaAndMetricScores().count() > 0 - binomialObject.getMaxCriteriaAndMetricScores().columns.length > 0 - binomialObject.getThresholdsAndMetricScores().count() > 0 - binomialObject.getThresholdsAndMetricScores().columns.length > 0 - } - - test("test binomial metric objects") { - val algo = new H2OGBM() - .setSplitRatio(0.8) - .setSeed(1) - .setFeaturesCols("sepal_len", "sepal_wid") - .setColumnsToCategorical("class") - .setLabelCol("class") - - val model = algo.fit(dataset) - assertMetrics[H2OBinomialMetrics](model) - - 
model.write.overwrite().save("ml/build/gbm_binomial_model_metrics") - val loadedModel = H2OGBMMOJOModel.load("ml/build/gbm_binomial_model_metrics") - assertMetrics[H2OBinomialMetrics](loadedModel) - } - - test("test binomial glm metric objects") { - val algo = new H2OGLM() - .setSplitRatio(0.8) - .setSeed(1) - .setFeaturesCols("sepal_len", "sepal_wid") - .setColumnsToCategorical("class") - .setLabelCol("class") - - val model = algo.fit(dataset) - assertMetrics[H2OBinomialGLMMetrics](model) - - model.write.overwrite().save("ml/build/glm_binomial_model_metrics") - val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_binomial_model_metrics") - assertMetrics[H2OBinomialGLMMetrics](loadedModel) - } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala index f5fd6e23f5..e44c79a6a1 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala @@ -18,11 +18,18 @@ package ai.h2o.sparkling.ml.algos import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import com.google.gson.GsonBuilder +import hex.genmodel.MojoModel +import hex.genmodel.easy.EasyPredictModelWrapper +import hex.kmeans.KMeansModel +import hex.schemas.KMeansModelV3 +import org.apache.commons.io.IOUtils import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.{Row, SparkSession} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FunSuite, Matchers} +import water.AutoBuffer @RunWith(classOf[JUnitRunner]) class H2OKMeansTestSuite extends FunSuite with Matchers with SharedH2OTestContext { diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala index 88ee00258b..70dba7889d 100644 --- 
a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala @@ -86,26 +86,4 @@ class OrdinalPredictionTestSuite extends FunSuite with Matchers with SharedH2OTe assert(schema == expectedSchema) assert(schema == expectedSchemaByTransform) } - - private def assertMetrics[T](model: H2OMOJOModel): Unit = { - assertMetrics(model.getTrainingMetricsObject(), model.getTrainingMetrics()) - assertMetrics(model.getValidationMetricsObject(), model.getValidationMetrics()) - assert(model.getCrossValidationMetricsObject() == null) - assert(model.getCrossValidationMetrics() == Map()) - } - - private def assertMetrics(metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { - metricsObject.isInstanceOf[H2OOrdinalGLMMetrics] should be(true) - MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) - } - - test("test ordinal glm metric objects") { - val algo = createAlgorithm() - val model = algo.fit(dataset) - assertMetrics[H2OOrdinalMetrics](model) - - model.write.overwrite().save("ml/build/glm_ordinal_model_metrics") - val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_ordinal_model_metrics") - assertMetrics[H2OOrdinalGLMMetrics](loadedModel) - } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/RegressionPredictionTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/RegressionPredictionTestSuite.scala index bfcdb5b3bc..637bae371f 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/RegressionPredictionTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/RegressionPredictionTestSuite.scala @@ -153,38 +153,4 @@ class RegressionPredictionTestSuite extends FunSuite with Matchers with SharedH2 metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) } - - test("test regression metric objects") { - val algo = new algos.H2OGBM() - .setSplitRatio(0.8) - 
.setSeed(1) - .setWithContributions(true) - .setWithLeafNodeAssignments(true) - .setWithStageResults(true) - .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") - .setLabelCol("AGE") - val model = algo.fit(dataset) - assertMetrics[H2ORegressionMetrics](model) - - model.write.overwrite().save("ml/build/gbm_regression_model_metrics") - val loadedModel = H2OGBMMOJOModel.load("ml/build/gbm_regression_model_metrics") - assertMetrics[H2ORegressionMetrics](loadedModel) - } - - test("test regression glm metric objects") { - val algo = new algos.H2OGLM() - .setSplitRatio(0.8) - .setSeed(1) - .setWithContributions(true) - .setWithLeafNodeAssignments(true) - .setWithStageResults(true) - .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") - .setLabelCol("AGE") - val model = algo.fit(dataset) - assertMetrics[H2ORegressionGLMMetrics](model) - - model.write.overwrite().save("ml/build/glm_regression_model_metrics") - val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_regression_model_metrics") - assertMetrics[H2ORegressionGLMMetrics](loadedModel) - } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala new file mode 100644 index 0000000000..54da3bf5ec --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos.H2OIsolationForest +import ai.h2o.sparkling.ml.features.H2OAutoEncoder +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.SparkSession +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class AnomalyDetectionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val trainingDataset = spark.read + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/anomaly/ecg_discord_train.csv")) + + private lazy val validationDataset = spark.read + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/anomaly/ecg_discord_test.csv")) + + ignore("test calculation of isolation forest metric objects on arbitrary dataset") { + val algorithm = new H2OIsolationForest() + .setSeed(42) +// .setValidationDataFrame(validationDataset) + + val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00001) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala new file mode 100644 index 0000000000..73b76b8798 --- /dev/null +++ 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.features.H2OAutoEncoder +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.SparkSession +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class AutoEncoderMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) + + test("test calculation of autoencoder metric objects on arbitrary dataset") { + val algorithm = new H2OAutoEncoder() + .setSeed(1) + .setInputCols("DCAPS", "PSA", "VOL") + .setValidationDataFrame(validationDataset) + .setOutputCol("Output") + .setHidden(Array(3)) + .setReproducible(true) + + 
val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00001) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala new file mode 100644 index 0000000000..f27cf81eb9 --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos._ +import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.functions.rand +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.types._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + import spark.implicits._ + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) + .withColumn("CAPSULE", 'CAPSULE.cast(StringType)) + .withColumn("RACE", 'RACE.cast(StringType)) + .withColumn("DCAPS", 'DCAPS.cast(StringType)) + .withColumn("WEIGHT", rand(42)) + .repartition(20) + + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), 1234L) + + private def assertMetrics[T: scala.reflect.ClassTag](model: H2OMOJOModel): Unit = { + assertMetrics[T](model.getTrainingMetricsObject(), model.getTrainingMetrics()) + assertMetrics[T](model.getValidationMetricsObject(), model.getValidationMetrics()) + assert(model.getCrossValidationMetricsObject() == null) + assert(model.getCrossValidationMetrics() == Map()) + } + + private def assertMetrics[T: scala.reflect.ClassTag](metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + scala.reflect.classTag[T].runtimeClass.isInstance(metricsObject) should be(true) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) + val binomialObject = metricsObject.asInstanceOf[H2OBinomialMetrics] + binomialObject.getConfusionMatrix().count() shouldBe >(0L) + binomialObject.getConfusionMatrix().columns.length shouldBe >(0) + binomialObject.getGainsLiftTable().count() shouldBe >(0L) + 
binomialObject.getGainsLiftTable().columns.length shouldBe >(0) + binomialObject.getMaxCriteriaAndMetricScores().count() shouldBe >(0L) + binomialObject.getMaxCriteriaAndMetricScores().columns.length shouldBe >(0) + binomialObject.getThresholdsAndMetricScores().count() shouldBe >(0L) + binomialObject.getThresholdsAndMetricScores().columns.length shouldBe >(0) + } + + private def assertMetrics( + model: H2OMOJOModel, + trainingDataset: DataFrame, + validationDataset: DataFrame, + trainingMetricsTolerance: Double = 0.0, + validationMetricsTolerance: Double = 0.0, + skipExtraMetrics: Boolean = false): Unit = { + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + + if (trainingMetricsTolerance < Double.PositiveInfinity) { + val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OBinomialMetrics] + val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OBinomialMetrics] + + // Confusion matrix is not correctly calculated in H2O-3 runtime. + val trainingConfusionMatrix = trainingMetricObject.getConfusionMatrix().count() + val expectedTrainingConfusionMatrix = expectedTrainingMetricObject.getConfusionMatrix().count() + trainingConfusionMatrix shouldBe >(0L) + trainingConfusionMatrix shouldEqual expectedTrainingConfusionMatrix + + val trainingMetricScores = trainingMetricObject.getThresholdsAndMetricScores().count() + val expectedTrainingMetricScores = expectedTrainingMetricObject.getThresholdsAndMetricScores().count() + trainingMetricScores shouldBe >(0L) + trainingMetricScores shouldEqual expectedTrainingMetricScores + TestUtils.assertDataFramesAreEqual( + trainingMetricObject.getMaxCriteriaAndMetricScores(), + expectedTrainingMetricObject.getMaxCriteriaAndMetricScores(), + "Metric", + trainingMetricsTolerance) + trainingMetricObject.getGainsLiftTable() shouldBe (null) // Gains-lift table is not supported yet. 
+ } + + if (validationMetricsTolerance < Double.PositiveInfinity) { + val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OBinomialMetrics] + val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OBinomialMetrics] + + // Confusion matrix is not correctly calculated in H2O-3 runtime. + val validationConfusionMatrix = validationMetricObject.getConfusionMatrix().count() + val expectedValidationConfusionMatrix = expectedValidationMetricObject.getConfusionMatrix().count() + validationConfusionMatrix shouldBe >(0L) + validationConfusionMatrix shouldEqual expectedValidationConfusionMatrix + + val validationMetricScores = validationMetricObject.getThresholdsAndMetricScores().count() + val expectedValidationMetricScores = expectedValidationMetricObject.getThresholdsAndMetricScores().count() + validationMetricScores shouldBe >(0L) + validationMetricScores shouldEqual expectedValidationMetricScores + TestUtils.assertDataFramesAreEqual( + validationMetricObject.getMaxCriteriaAndMetricScores(), + expectedValidationMetricObject.getMaxCriteriaAndMetricScores(), + "Metric", + validationMetricsTolerance) + validationMetricObject.getGainsLiftTable() shouldBe (null) // Gains-lift table is not supported yet. 
+ } + } + + test("test binomial metric objects") { + val algo = new H2OGBM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + + val model = algo.fit(dataset) + assertMetrics[H2OBinomialMetrics](model) + + model.write.overwrite().save("ml/build/gbm_binomial_model_metrics") + val loadedModel = H2OGBMMOJOModel.load("ml/build/gbm_binomial_model_metrics") + assertMetrics[H2OBinomialMetrics](loadedModel) + } + + test("test binomial glm metric objects") { + val algo = new H2OGLM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + + val model = algo.fit(dataset) + assertMetrics[H2OBinomialGLMMetrics](model) + + model.write.overwrite().save("ml/build/glm_binomial_model_metrics") + val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_binomial_model_metrics") + assertMetrics[H2OBinomialGLMMetrics](loadedModel) + } + + { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, Boolean)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.000001, false), + (() => new H2OXGBoost(), 0.0001, 0.0001, false), + (() => new H2OGBM(), 0.0001, 0.0001, false), + (() => new H2OGLM(), 0.00001, 0.000001, false), + (() => new H2ODRF(), Double.PositiveInfinity, 0.0001, false)) + // TODO: investigate differences - (() => new H2ORuleFit(), Double.PositiveInfinity, 0.0005, true)) + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmName = algorithmGetter().getClass.getSimpleName + + test(s"test calculation of binomial $algorithmName metrics on arbitrary dataset") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + val 
model = algorithm.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + } + + test(s"test calculation of binomial $algorithmName metrics with weightCol set on arbitrary dataset") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + .setWeightCol("WEIGHT") + val model = algorithm.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + } + } + } + { + val algorithmsAndTolerances: Seq[(H2OSupervisedAlgorithm[_], Double, Double)] = + Seq((new H2OXGBoost(), 0.00001, 0.00001), (new H2OGBM(), 1, 0.00001), (new H2OGLM(), 0.00001, 0.000001)) + + for ((algorithm, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { + val algorithmName = algorithm.getClass.getSimpleName + + test(s"test calculation of binomial $algorithmName metrics with offsetCol set on arbitrary dataset") { + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + + assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance, validationMetricsTolerance) + } + } + } + + { + // TODO: Investigate differences when data frames have more partitions + def gamTrainingDataset = trainingDataset.repartition(1) + def gamValidationDataset = validationDataset.repartition(1) + + test(s"test calculation of binomial H2OGAM metrics on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) + 
.setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setGamCols(Array("PSA")) + .setLabelCol("CAPSULE") + val model = algorithm.fit(gamTrainingDataset) + + assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + + // H2OGAM renames Gam cols when offset columns is set (PSA -> PSA_0_center__8) + ignore(s"test calculation of binomial H2OGAM metrics with offsetCol set on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setGamCols(Array("PSA")) + .setLabelCol("CAPSULE") + .setOffsetCol("ID") + val model = algorithm.fit(gamTrainingDataset) + + assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + + test(s"test calculation of binomial H2OGAM metrics with weightCol set on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setGamCols(Array("PSA")) + .setLabelCol("CAPSULE") + .setWeightCol("ID") + val model = algorithm.fit(gamTrainingDataset) + + assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala new file mode 100644 index 0000000000..0fdbea4625 --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos.H2OKMeans +import ai.h2o.sparkling.ml.models.{H2OKMeansMOJOModel, H2OMOJOModel} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class ClusteringMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/iris/iris_wheader.csv")) + + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) + + test("test calculation of kmeans metric objects on arbitrary dataset") { + val algorithm = new H2OKMeans() + .setValidationDataFrame(validationDataset) + .setSeed(1) + .setK(3) + .setUserPoints(Array(Array(4.9, 3.0, 1.4, 0.2), Array(5.6, 2.5, 3.9, 1.1), Array(6.5, 3.0, 5.2, 2.0))) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + + val model = algorithm.fit(trainingDataset) + + assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance = 0.00001) + } + + private def assertMetrics( + 
model: H2OKMeansMOJOModel, + trainingDataset: DataFrame, + validationDataset: DataFrame, + trainingMetricsTolerance: Double = 0.0, + validationMetricsTolerance: Double = 0.0): Unit = { + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance) + TestUtils.assertDataFramesAreEqual( + model.getTrainingMetricsObject().getCentroidStats(), + model.getMetricsObject(trainingDataset).getCentroidStats(), + "Centroid", + Map("Size" -> trainingMetricsTolerance, "Within Cluster Sum of Squares" -> trainingMetricsTolerance)) + TestUtils.assertDataFramesAreEqual( + model.getValidationMetricsObject().getCentroidStats(), + model.getMetricsObject(validationDataset).getCentroidStats(), + "Centroid", + Map("Size" -> validationMetricsTolerance, "Within Cluster Sum of Squares" -> validationMetricsTolerance)) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala new file mode 100644 index 0000000000..f61eac5cc4 --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.features.{H2OGLRM, H2OPCA} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.SparkSession +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class DimReductionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) + + test("test calculation of H2OPCA metrics on arbitrary dataset") { + val algorithm = new H2OPCA() + .setSeed(1) + .setInputCols("RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setOutputCol("Output") + .setValidationDataFrame(validationDataset) + .setImputeMissing(true) + .setPcaMethod("Power") + .setK(3) + + val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00001) + } + + ignore("test calculation of H2OGLRM metrics on arbitrary dataset") { + val Array(rawTrainingDataset, rawValidationDataset) = dataset.randomSplit(Array(0.5, 0.5), seed = 42) + val glrmTrainingDataset = rawTrainingDataset.limit(150) + val glrmValidationDataset = rawValidationDataset.limit(150) + + val algorithm = new H2OGLRM() + .setSeed(1) + .setValidationDataFrame(glrmValidationDataset) + .setInputCols("RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setOutputCol("Output") + .setWithReconstructedCol(true) + 
.setReconstructedCol("Reconstructed") + .setK(3) + + val model = algorithm.fit(glrmTrainingDataset) + println(model.getTrainingMetrics()) + println(model.getMetrics(glrmTrainingDataset)) + println(model.getValidationMetrics()) + println(model.getMetrics(glrmValidationDataset)) + MetricsAssertions.assertEssentialMetrics( + model, + glrmTrainingDataset, + glrmValidationDataset, + trainingMetricsTolerance = Double.PositiveInfinity, + validationMetricsTolerance = 0.00001) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala index 84f984a3e7..a9d2a901c0 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala @@ -17,13 +17,18 @@ package ai.h2o.sparkling.ml.metrics +import ai.h2o.sparkling.ml.models.H2OMOJOModel +import org.apache.spark.sql.DataFrame import org.scalatest.Matchers object MetricsAssertions extends Matchers { - def assertMetricsObjectAgainstMetricsMap(metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + def assertMetricsObjectAgainstMetricsMap( + metricsObject: H2OMetrics, + metrics: Map[String, Double], + ignoredGetters: Set[String] = Set("getCustomMetricValue")): Unit = { for (getter <- metricsObject.getClass.getMethods if getter.getName.startsWith("get") - if getter.getName != "getCustomMetricValue" + if !ignoredGetters.contains(getter.getName) if getter.getParameterCount == 0 if getter.getReturnType.isPrimitive) { val value = getter.invoke(metricsObject) @@ -37,4 +42,57 @@ object MetricsAssertions extends Matchers { } } } + + def assertEqual( + expected: Map[String, Double], + actual: Map[String, Double], + ignored: Set[String] = Set("ScoringTime"), + tolerance: Double = 0.0, + skipExtraMetrics: Boolean = false): Unit = { + val expectedKeys = expected.keySet + val actualKeys = actual.keySet + + if (!skipExtraMetrics) { + 
expectedKeys shouldEqual actualKeys + } + + for (key <- actualKeys.diff(ignored)) { + if (expected(key).isNaN && actual(key).isNaN) { + // Values are equal + } else if (tolerance > 0.0) { + expected(key) shouldBe (actual(key) +- tolerance) + } else { + expected(key) shouldBe actual(key) + } + } + } + + def assertEssentialMetrics( + model: H2OMOJOModel, + trainingDataset: DataFrame, + validationDataset: DataFrame, + trainingMetricsTolerance: Double = 0.0, + validationMetricsTolerance: Double = 0.0, + skipExtraMetrics: Boolean = false): Unit = { + val trainingMetrics = model.getMetrics(trainingDataset) + val trainingMetricsObject = model.getMetricsObject(trainingDataset) + val validationMetrics = model.getMetrics(validationDataset) + val validationMetricsObject = model.getMetricsObject(validationDataset) + val expectedTrainingMetrics = model.getTrainingMetrics() + val expectedValidationMetrics = model.getValidationMetrics() + + MetricsAssertions.assertEqual( + expectedTrainingMetrics, + trainingMetrics, + tolerance = trainingMetricsTolerance, + skipExtraMetrics = skipExtraMetrics) + MetricsAssertions.assertEqual( + expectedValidationMetrics, + validationMetrics, + tolerance = validationMetricsTolerance, + skipExtraMetrics = skipExtraMetrics) + val ignoredGetters = Set("getCustomMetricValue", "getScoringTime") + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(trainingMetricsObject, trainingMetrics, ignoredGetters) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(validationMetricsObject, validationMetrics, ignoredGetters) + } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala new file mode 100644 index 0000000000..ef0c47b2fd --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos._ +import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.functions.{monotonically_increasing_id, rand} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/iris/iris_wheader.csv")) + .withColumn("ID", monotonically_increasing_id) + .withColumn("WEIGHT", rand(42)) + .repartition(20) + + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), 42L) + + private def assertMetrics[T: scala.reflect.ClassTag](model: H2OMOJOModel): Unit = { + assertMetrics[T](model.getTrainingMetricsObject(), model.getTrainingMetrics()) + assertMetrics[T](model.getValidationMetricsObject(), model.getValidationMetrics()) + 
assert(model.getCrossValidationMetricsObject() == null) + assert(model.getCrossValidationMetrics() == Map()) + } + + private def assertMetrics[T: scala.reflect.ClassTag](metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + scala.reflect.classTag[T].runtimeClass.isInstance(metricsObject) should be(true) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) + val multinomialObject = metricsObject.asInstanceOf[H2OMultinomialMetrics] + multinomialObject.getConfusionMatrix().count() shouldBe >(0L) + multinomialObject.getConfusionMatrix().columns.length shouldBe >(0) + multinomialObject.getHitRatioTable().count() shouldBe >(0L) + multinomialObject.getHitRatioTable().columns.length shouldBe >(0) + } + + private def assertMetrics( + model: H2OMOJOModel, + trainingDataset: DataFrame, + validationDataset: DataFrame, + trainingMetricsTolerance: Double = 0.0, + validationMetricsTolerance: Double = 0.0, + skipExtraMetrics: Boolean = false): Unit = { + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + + if (trainingMetricsTolerance < Double.PositiveInfinity) { + val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OMultinomialMetrics] + val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OMultinomialMetrics] + TestUtils.assertDataFramesAreEqual( + trainingMetricObject.getMultinomialAUCTable(), + expectedTrainingMetricObject.getMultinomialAUCTable(), + "Type", + trainingMetricsTolerance) + TestUtils.assertDataFramesAreEqual( + trainingMetricObject.getMultinomialPRAUCTable(), + expectedTrainingMetricObject.getMultinomialPRAUCTable(), + "Type", + trainingMetricsTolerance) + TestUtils.assertDataFramesAreIdentical( + trainingMetricObject.getConfusionMatrix(), + expectedTrainingMetricObject.getConfusionMatrix()) + TestUtils.assertDataFramesAreEqual( + trainingMetricObject.getHitRatioTable(), + expectedTrainingMetricObject.getHitRatioTable(), + "K", + 
trainingMetricsTolerance) + } + + if (validationMetricsTolerance < Double.PositiveInfinity) { + val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OMultinomialMetrics] + val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OMultinomialMetrics] + TestUtils.assertDataFramesAreEqual( + validationMetricObject.getMultinomialAUCTable(), + expectedValidationMetricObject.getMultinomialAUCTable(), + "Type", + validationMetricsTolerance) + TestUtils.assertDataFramesAreEqual( + validationMetricObject.getMultinomialPRAUCTable(), + expectedValidationMetricObject.getMultinomialPRAUCTable(), + "Type", + validationMetricsTolerance) + TestUtils.assertDataFramesAreIdentical( + validationMetricObject.getConfusionMatrix(), + expectedValidationMetricObject.getConfusionMatrix()) + TestUtils.assertDataFramesAreEqual( + validationMetricObject.getHitRatioTable(), + expectedValidationMetricObject.getHitRatioTable(), + "K", + validationMetricsTolerance) + } + } + + test("test multinomial metric objects") { + val algo = new H2OGBM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .setLabelCol("class") + val model = algo.fit(dataset) + assertMetrics[H2OMultinomialMetrics](model) + + model.write.overwrite().save("ml/build/gbm_multinomial_model_metrics") + val loadedModel = H2OGBMMOJOModel.load("ml/build/gbm_multinomial_model_metrics") + assertMetrics[H2OMultinomialMetrics](loadedModel) + } + + test("test multinomial glm metric objects") { + val algo = new H2OGLM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .setLabelCol("class") + val model = algo.fit(dataset) + assertMetrics[H2OMultinomialGLMMetrics](model) + + model.write.overwrite().save("ml/build/glm_multinomial_model_metrics") + val loadedModel = 
H2OGLMMOJOModel.load("ml/build/glm_multinomial_model_metrics") + assertMetrics[H2OMultinomialGLMMetrics](loadedModel) + } + + { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, Boolean)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.00000001, false), + (() => new H2OXGBoost(), 0.00001, 0.00000001, false), + (() => new H2OGBM(), 0.00001, 0.00000001, false), + (() => new H2OGLM(), 0.00001, 0.00000001, false), + (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001, false), + (() => new H2ORuleFit(), 0.0001, 0.00001, true)) + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmName = algorithmGetter().getClass.getSimpleName + + test(s"test calculation of multinomial $algorithmName metrics on arbitrary dataset") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .set(algorithm.getParam("aucType"), "MACRO_OVR") + .setLabelCol("class") + val model = algorithm.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + } + + test(s"test calculation of multinomial $algorithmName metrics with weightCol set on arbitrary dataset") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .set(algorithm.getParam("aucType"), "MACRO_OVR") + .setLabelCol("class") + .setWeightCol("WEIGHT") + val model = algorithm.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + 
skipExtraMetrics) + } + } + } + { + val algorithmsAndTolerances: Seq[(H2OSupervisedAlgorithm[_], Double, Double)] = + Seq((new H2OXGBoost(), 0.00001, 0.00000001), (new H2OGLM(), 0.00001, 0.00000001)) + + for ((algorithm, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { + val algorithmName = algorithm.getClass.getSimpleName + + test(s"test calculation of multinomial $algorithmName metrics with offsetCol set on arbitrary dataset") { + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .set(algorithm.getParam("aucType"), "MACRO_OVR") + .setLabelCol("class") + .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + + assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance, validationMetricsTolerance) + } + } + } + { + // TODO: Investigate differences when data frames have more partitions + def gamTrainingDataset = trainingDataset.repartition(1) + def gamValidationDataset = validationDataset.repartition(1) + + test("test calculation of multinomial H2OGAM metrics on arbitrary dataset") { + val algo = new H2OGAM() + .setValidationDataFrame(gamValidationDataset) + .setSeed(1) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len") + .setGamCols(Array("petal_len")) + .setColumnsToCategorical("class") + .setAucType("MACRO_OVR") + .setLabelCol("class") + val model = algo.fit(gamTrainingDataset) + + assertMetrics( + model, + gamTrainingDataset, + gamValidationDataset, + trainingMetricsTolerance = 0.0001, + validationMetricsTolerance = 0.00000001) + } + + // H2OGAM renames Gam cols when offset columns is set (petal_len -> petal_len_0_center__8) + ignore("test calculation of multinomial H2OGAM metrics with offsetCol set on arbitrary dataset") { + val algo = new H2OGAM() + .setValidationDataFrame(gamValidationDataset) + .setSeed(1) + 
.setFeaturesCols("sepal_len", "sepal_wid", "petal_len") + .setGamCols(Array("petal_len")) + .setColumnsToCategorical("class") + .setAucType("MACRO_OVR") + .setLabelCol("class") + .setOffsetCol("ID") + val model = algo.fit(gamTrainingDataset) + + assertMetrics( + model, + gamTrainingDataset, + gamValidationDataset, + trainingMetricsTolerance = 0.0001, + validationMetricsTolerance = 0.00000001) + } + + test("test calculation of multinomial H2OGAM metrics with weightCol set on arbitrary dataset") { + val algo = new H2OGAM() + .setValidationDataFrame(gamValidationDataset) + .setSeed(1) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len") + .setGamCols(Array("petal_len")) + .setColumnsToCategorical("class") + .setAucType("MACRO_OVR") + .setLabelCol("class") + .setWeightCol("WEIGHT") + val model = algo.fit(gamTrainingDataset) + + assertMetrics( + model, + gamTrainingDataset, + gamValidationDataset, + trainingMetricsTolerance = 0.0001, + validationMetricsTolerance = 0.00000001) + } + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala new file mode 100644 index 0000000000..32ea2e46c6 --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.models.H2OMOJOModel +import ai.h2o.sparkling.SparkTestContext +import org.apache.spark.sql.SparkSession +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class NoRuntimeMetricsTestSuite extends FunSuite with Matchers with SparkTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val irisDataFrame = { + spark.read.option("header", "true").option("inferSchema", "true").csv("examples/smalldata/iris/iris_wheader.csv") + } + + private lazy val prostateDataFrame = { + spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv("examples/smalldata/prostate/prostate.csv") + .withColumnRenamed("CAPSULE", "capsule") + } + + test("Test calculation of metrics on saved binomial model") { + val mojo = H2OMOJOModel.createFromMojo( + this.getClass.getClassLoader.getResourceAsStream("binom_model_prostate.mojo"), + "binom_model_prostate.mojo") + mojo.getMetrics(prostateDataFrame) shouldNot be(null) + mojo.getMetricsObject(prostateDataFrame) shouldNot be(null) + } + + test("Test calculation of metrics on saved regression model") { + val mojo = H2OMOJOModel.createFromMojo( + this.getClass.getClassLoader.getResourceAsStream("regre_model_prostate.mojo"), + "regre_model_prostate.mojo") + mojo.getMetrics(prostateDataFrame) shouldNot be(null) + mojo.getMetricsObject(prostateDataFrame) shouldNot be(null) + } + + test("Test 
calculation of metrics on saved multinomial model") { + val mojo = H2OMOJOModel.createFromMojo( + this.getClass.getClassLoader.getResourceAsStream("multi_model_iris.mojo"), + "multi_model_iris.mojo") + mojo.getMetrics(irisDataFrame) shouldNot be(null) + mojo.getMetricsObject(irisDataFrame) shouldNot be(null) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala new file mode 100644 index 0000000000..2ee3f53ddd --- /dev/null +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos.{H2OGAM, H2OGLM} +import ai.h2o.sparkling.ml.models.{H2OGLMMOJOModel, H2OMOJOModel} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class OrdinalMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/insurance.csv")) + .withColumn("ID", monotonically_increasing_id) + .repartition(20) + + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), 42L) + + private def assertMetrics[T](model: H2OMOJOModel): Unit = { + assertMetrics(model.getTrainingMetricsObject(), model.getTrainingMetrics()) + assertMetrics(model.getValidationMetricsObject(), model.getValidationMetrics()) + assert(model.getCrossValidationMetricsObject() == null) + assert(model.getCrossValidationMetrics() == Map()) + } + + private def assertMetrics(metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + metricsObject.isInstanceOf[H2OOrdinalGLMMetrics] should be(true) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) + } + + private def assertMetrics( + model: H2OMOJOModel, + trainingDataset: DataFrame, + validationDataset: DataFrame, + trainingMetricsTolerance: Double = 0.0, + validationMetricsTolerance: Double = 0.0): Unit = { + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance) + + val trainingMetricObject = 
model.getMetricsObject(trainingDataset).asInstanceOf[H2OOrdinalGLMMetrics] + val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OOrdinalGLMMetrics] + TestUtils.assertDataFramesAreIdentical( + trainingMetricObject.getConfusionMatrix(), + expectedTrainingMetricObject.getConfusionMatrix()) + TestUtils.assertDataFramesAreEqual( + trainingMetricObject.getHitRatioTable(), + expectedTrainingMetricObject.getHitRatioTable(), + "K", + trainingMetricsTolerance) + + val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OOrdinalGLMMetrics] + val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OOrdinalGLMMetrics] + TestUtils.assertDataFramesAreIdentical( + validationMetricObject.getConfusionMatrix(), + expectedValidationMetricObject.getConfusionMatrix()) + TestUtils.assertDataFramesAreEqual( + validationMetricObject.getHitRatioTable(), + expectedValidationMetricObject.getHitRatioTable(), + "K", + validationMetricsTolerance) + } + + test("test ordinal glm metric objects") { + val algo = new H2OGLM() + .setSplitRatio(0.8) + .setFeaturesCols("District", "Group", "Claims") + .setLabelCol("Age") + .setSeed(1) + .setFamily("ordinal") + val model = algo.fit(dataset) + assertMetrics[H2OOrdinalMetrics](model) + + model.write.overwrite().save("ml/build/glm_ordinal_model_metrics") + val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_ordinal_model_metrics") + assertMetrics[H2OOrdinalGLMMetrics](loadedModel) + } + + test("test calculation of ordinal H2OGLM metrics on arbitrary dataset") { + val algo = new H2OGLM() + .setValidationDataFrame(validationDataset) + .setFeaturesCols("District", "Group", "Claims") + .setLabelCol("Age") + .setSeed(1) + .setFamily("ordinal") + val model = algo.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00000001) + } + + test("test calculation of ordinal 
H2OGLM metrics with weightCol set on arbitrary dataset") { + val algo = new H2OGLM() + .setValidationDataFrame(validationDataset) + .setFeaturesCols("District", "Group", "Claims") + .setLabelCol("Age") + .setSeed(1) + .setFamily("ordinal") + .setWeightCol("ID") + val model = algo.fit(trainingDataset) + + assertMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00000001) + } + + def gamTrainingDataset = trainingDataset.repartition(1) + def gamValidationDataset = validationDataset.repartition(1) + + // TODO: investigate why metrics are different + ignore("test calculation of ordinal H2OGAM metrics on arbitrary dataset") { + val algo = new H2OGAM() + .setValidationDataFrame(gamValidationDataset) + .setFeaturesCols("District", "Group") + .setGamCols(Array("Claims")) + .setLabelCol("Age") + .setSeed(1) + .setFamily("ordinal") + val model = algo.fit(gamTrainingDataset) + + assertMetrics( + model, + gamTrainingDataset, + gamValidationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00000001) + } + + // TODO: investigate why metrics are different + ignore("test calculation of ordinal H2OGAM metrics with weightCol set on arbitrary dataset") { + val algo = new H2OGAM() + .setValidationDataFrame(gamValidationDataset) + .setFeaturesCols("District", "Group") + .setGamCols(Array("Claims")) + .setLabelCol("Age") + .setSeed(1) + .setFamily("ordinal") + .setWeightCol("ID") + val model = algo.fit(gamTrainingDataset) + + assertMetrics( + model, + gamTrainingDataset, + gamValidationDataset, + trainingMetricsTolerance = 0.00001, + validationMetricsTolerance = 0.00000001) + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala new file mode 100644 index 0000000000..b84d260626 --- /dev/null +++ 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.algos +import ai.h2o.sparkling.ml.algos._ +import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} +import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.StringType +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{FunSuite, Matchers} + +@RunWith(classOf[JUnitRunner]) +class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { + + override def createSparkSession(): SparkSession = sparkSession("local[*]") + + import spark.implicits._ + + private lazy val dataset = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) + .withColumn("RACE", 'RACE.cast(StringType)) + .withColumn("DCAPS", 'DCAPS.cast(StringType)) + .repartition(20) + + private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), 42L) + + private 
def assertMetrics[T: scala.reflect.ClassTag](model: H2OMOJOModel): Unit = { + assertMetrics[T](model.getTrainingMetricsObject(), model.getTrainingMetrics()) + assertMetrics[T](model.getValidationMetricsObject(), model.getValidationMetrics()) + assert(model.getCrossValidationMetricsObject() == null) + assert(model.getCrossValidationMetrics() == Map()) + } + + private def assertMetrics[T: scala.reflect.ClassTag](metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + scala.reflect.classTag[T].runtimeClass.isInstance(metricsObject) should be(true) // isInstanceOf[T] is erased + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) + } + + test("test regression metric objects") { + val algo = new algos.H2OGBM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("AGE") + val model = algo.fit(dataset) + assertMetrics[H2ORegressionMetrics](model) + + model.write.overwrite().save("ml/build/gbm_regression_model_metrics") + val loadedModel = H2OGBMMOJOModel.load("ml/build/gbm_regression_model_metrics") + assertMetrics[H2ORegressionMetrics](loadedModel) + } + + test("test regression glm metric objects") { + val algo = new algos.H2OGLM() + .setSplitRatio(0.8) + .setSeed(1) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("AGE") + val model = algo.fit(dataset) + assertMetrics[H2ORegressionGLMMetrics](model) + + model.write.overwrite().save("ml/build/glm_regression_model_metrics") + val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_regression_model_metrics") + assertMetrics[H2ORegressionGLMMetrics](loadedModel) + } + + { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, Boolean)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.00000001, false), + (() => new H2OXGBoost(), 0.00001, 0.00000001, false), + (() => new H2OGBM(), 0.0001, 0.00000001, false), + (() => new H2OGLM(), 0.00001, 0.00000001, false), + (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001, false)) // ignore
comparison on the training dataset + // H2O runtime produces additional GLM metrics + // TODO: investigate differences (() => new H2ORuleFit(), 0.001, 0.00000001, true)) + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmName = algorithmGetter().getClass.getSimpleName + + test(s"test calculation of regression $algorithmName metrics on arbitrary dataset") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + } + + test(s"test calculation of regression $algorithmName metrics with weight column set on arbitrary dataset ") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + .setWeightCol("ID") + val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance, + skipExtraMetrics) + } + } + } + { + val algorithmsAndTolerances: Seq[(H2OSupervisedAlgorithm[_], Double, Double)] = Seq( + (new H2OXGBoost(), 0.00001, 0.00000001), + (new H2OGBM(), 0.001, 0.00000001), + (new H2OGLM(), 0.00001, 0.00000001)) // H2ORuleFit and H2ODRF don't support an offset column + + for ((algorithm, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { + val algorithmName = algorithm.getClass.getSimpleName + test(s"test calculation of regression $algorithmName metrics with offset
column set on arbitrary dataset ") { + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + + MetricsAssertions.assertEssentialMetrics( + model, + trainingDataset, + validationDataset, + trainingMetricsTolerance, + validationMetricsTolerance) + } + } + } + { + // TODO: investigate why there are differences for GAM when more partitions are used + def gamTrainingDataset = trainingDataset.repartition(1) + def gamValidationDataset = validationDataset.repartition(1) + + test(s"test calculation of regression H2OGAM metrics on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) + .setGamCols(Array(Array("PSA"))) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + val model = algorithm.fit(gamTrainingDataset) + + MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + + test(s"test calculation of regression H2OGAM metrics with weight column set on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) + .setGamCols(Array(Array("PSA"))) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + .setWeightCol("ID") + val model = algorithm.fit(gamTrainingDataset) + + MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + + // H2OGAM renames Gam cols when the offset column is set (PSA -> PSA_0_center__8) + ignore(s"test calculation of regression H2OGAM metrics with offset column set on arbitrary dataset") { + val algorithm = new H2OGAM() + algorithm + .setValidationDataFrame(gamValidationDataset) + .setSeed(1L) +
.setGamCols(Array(Array("PSA"))) + .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") + .setLabelCol("AGE") + .setOffsetCol("ID") + val model = algorithm.fit(gamTrainingDataset) + + MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) + } + } +} diff --git a/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py b/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py index 8a323cef77..c5fe9c9c07 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py @@ -33,6 +33,21 @@ def getModelSummary(self): def getDomainValues(self): return H2OTypeConverters.scalaMapStringDictStringToStringDictString(self._java_obj.getDomainValues()) + def getMetrics(self, dataFrame): + """ + :return: A map of all metrics of the float type calculated on a data frame passed as a parameter. + """ + self._transfer_params_to_java() + return H2OTypeConverters.scalaMapStringStringToDictStringAny(self._java_obj.getMetrics(dataFrame._jdf)) + + def getMetricsObject(self, dataFrame): + """ + :return: An object holding all metrics of the float type and also more complex performance information + calculated on a data frame passed as a parameter. + """ + self._transfer_params_to_java() + return H2OMetricsFactory.fromJavaObject(self._java_obj.getMetricsObject(dataFrame._jdf)) + def getTrainingMetrics(self): """ :return: A map of all metrics of the float type calculated on the training dataset. 
diff --git a/py/tests/unit/with_runtime_sparkling/test_mojo.py b/py/tests/unit/with_runtime_sparkling/test_mojo.py index c916fb8f8f..a2fc18bef6 100644 --- a/py/tests/unit/with_runtime_sparkling/test_mojo.py +++ b/py/tests/unit/with_runtime_sparkling/test_mojo.py @@ -31,7 +31,7 @@ @pytest.fixture(scope="module") def gbmModel(prostateDataset): - gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule") + gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="CAPSULE") return gbm.fit(prostateDataset) @@ -42,7 +42,7 @@ def testDomainColumns(gbmModel): assert domainValues["VOL"] is None assert domainValues["AGE"] is None assert domainValues["PSA"] is None - assert domainValues["capsule"] == ["0", "1"] + assert domainValues["CAPSULE"] == ["0", "1"] assert domainValues["RACE"] is None assert domainValues["ID"] is None @@ -74,7 +74,7 @@ def testFeatureTypes(gbmModel): assert types["VOL"] == "Numeric" assert types["AGE"] == "Numeric" assert types["PSA"] == "Numeric" - assert types["capsule"] == "Enum" + assert types["CAPSULE"] == "Enum" assert types["RACE"] == "Numeric" assert types["ID"] == "Numeric" assert len(types) == 9 @@ -208,7 +208,7 @@ def testGetCrossValidationSummary(): def testCrossValidationModelsAreAvailableAfterSavingAndLoading(prostateDataset): path = "file://" + os.path.abspath("build/testCrossValidationModelsAreAvialableAfterSavingAndLoading") nfolds = 3 - gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule", + gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="CAPSULE", nfolds=nfolds, keepCrossValidationModels=True) model = gbm.fit(prostateDataset) model.write().overwrite().save(path) @@ -229,7 +229,7 @@ def testCrossValidationModelsAreAvailableAfterSavingAndLoading(prostateDataset): def testCrossValidationModelsAreNoneIfKeepCrossValidationModelsIsFalse(prostateDataset): - gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule", + gbm = H2OGBM(ntrees=2, seed=42, 
distribution="bernoulli", labelCol="CAPSULE", nfolds=3, keepCrossValidationModels=False) model = gbm.fit(prostateDataset) @@ -237,7 +237,7 @@ def testCrossValidationModelsAreNoneIfKeepCrossValidationModelsIsFalse(prostateD def testMetricObjects(prostateDataset): - gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule", + gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="CAPSULE", nfolds=3, keepCrossValidationModels=False) model = gbm.fit(prostateDataset) @@ -255,9 +255,23 @@ def compareMetricValues(metricsObject, metricsMap): assert metricsObject.getThresholdsAndMetricScores().count() > 0 assert len(metricsObject.getThresholdsAndMetricScores().columns) > 0 + def compareCalculatedMetricValues(metricsObject, metricsMap): + for metric in metricsMap: + if metric != "ScoringTime": + metricValue = metricsMap[metric] + objectValue = getattr(metricsObject, "get" + metric)() + assert(metricValue == objectValue) + assert metricsObject.getConfusionMatrix().count() > 0 + assert len(metricsObject.getConfusionMatrix().columns) > 0 + assert metricsObject.getMaxCriteriaAndMetricScores().count() > 0 + assert len(metricsObject.getMaxCriteriaAndMetricScores().columns) > 0 + assert metricsObject.getThresholdsAndMetricScores().count() > 0 + assert len(metricsObject.getThresholdsAndMetricScores().columns) > 0 + compareMetricValues(model.getTrainingMetricsObject(), model.getTrainingMetrics()) compareMetricValues(model.getCrossValidationMetricsObject(), model.getCrossValidationMetrics()) compareMetricValues(model.getCurrentMetricsObject(), model.getCurrentMetrics()) + compareCalculatedMetricValues(model.getMetricsObject(prostateDataset), model.getMetrics(prostateDataset)) assert model.getValidationMetricsObject() is None assert model.getValidationMetrics() == {} diff --git a/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R b/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R index 8b1d7fab73..1bd3c4de58 100644 --- 
a/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R +++ b/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R @@ -49,6 +49,15 @@ H2OMOJOModel <- setRefClass("H2OMOJOModel", methods = list( getDomainValues = function() { invoke(.self$jmojo, "getDomainValues") }, + getMetrics = function(sparkFrame) { + sparkFrame <- spark_dataframe(sparkFrame) + invoke(.self$jmojo, "getMetrics", sparkFrame) + }, + getMetricsObject = function(sparkFrame) { + sparkFrame <- spark_dataframe(sparkFrame) + metrics <- invoke(.self$jmojo, "getMetricsObject", sparkFrame) + H2OMetricsFactory.fromJavaObject(metrics) + }, getTrainingMetrics = function() { invoke(.self$jmojo, "getTrainingMetrics") }, diff --git a/r/src/tests/testthat/testMojo.R b/r/src/tests/testthat/testMojo.R index 3c830c8e5e..85aa61855b 100644 --- a/r/src/tests/testthat/testMojo.R +++ b/r/src/tests/testthat/testMojo.R @@ -115,6 +115,34 @@ test_that("test model category", { expect_equal(category, "Binomial") }) +test_that("test metrics calculation", { + path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) + dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) + dataset <- dplyr::rename(dataset, capsule = CAPSULE) + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + metrics <- model$getMetrics(dataset) + expect_equal(as.character(metrics[["AUC"]]), "0.896878869021911") + expect_equal(length(metrics), 10) +}) + +test_that("test metrics object calculation", { + path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) + dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) + dataset <- dplyr::rename(dataset, capsule = CAPSULE) + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + metrics <- model$getMetricsObject(dataset) + aucValue <- metrics$getAUC() + scoringTime <- 
metrics$getScoringTime() + + thresholdsAndScores <- metrics$getThresholdsAndMetricScores() + thresholdsAndScoresFrame <- dplyr::tally(thresholdsAndScores) + thresholdsAndScoresCount <- as.double(dplyr::collect(thresholdsAndScoresFrame)[[1]]) + + expect_equal(as.character(aucValue), "0.896878869021911") + expect_true(scoringTime > 0) + expect_true(thresholdsAndScoresCount > 0) +}) + test_that("test training metrics", { model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) metrics <- model$getTrainingMetrics() diff --git a/scoring/build.gradle b/scoring/build.gradle index 5fa9131295..9bf91fd0da 100644 --- a/scoring/build.gradle +++ b/scoring/build.gradle @@ -26,6 +26,11 @@ dependencies { api("ai.h2o:h2o-genmodel-ext-xgboost:${h2oVersion}") api(project(":sparkling-water-utils")) + // Required for model metrics calculation + implementation("ai.h2o:h2o-core:${h2oVersion}") + implementation("ai.h2o:h2o-algos:${h2oVersion}") + implementation(project(":sparkling-water-extensions")) + compileOnly(project(':sparkling-water-api-generation')) compileOnly("ai.h2o:h2o-ext-xgboost:${h2oVersion}") compileOnly("org.apache.spark:spark-core_${scalaBaseVersion}:${sparkVersion}") diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala new file mode 100644 index 0000000000..fa5e81996d --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.models.H2OGLRMMOJOModel +import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} + +trait GLRMMetricCalculation { + self: H2OGLRMMOJOModel => + + private[sparkling] def getPredictionGetter(): (EasyPredictModelWrapper, RowData, Double) => Array[Double] = { + (wrapper: EasyPredictModelWrapper, rowData: RowData, offset: Double) => + { + val vpa = wrapper.predictDimReduction(rowData) + vpa.reconstructed + } + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMetrics.scala index 3c6a696881..7e145aaece 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMetrics.scala @@ -55,13 +55,13 @@ object H2OMetrics { dataFrameSerializerGetter: () => String): H2OMetrics = { val metricsObject = modelCategory match { - case H2OModelCategory.Binomial if algoName == "glm" => new H2OBinomialGLMMetrics() + case H2OModelCategory.Binomial if Set("glm", "gam").contains(algoName) => new H2OBinomialGLMMetrics() case H2OModelCategory.Binomial => new H2OBinomialMetrics() - case H2OModelCategory.Multinomial if algoName == "glm" => new H2OMultinomialGLMMetrics() + case H2OModelCategory.Multinomial if Set("glm", "gam").contains(algoName) => new H2OMultinomialGLMMetrics() case H2OModelCategory.Multinomial => new H2OMultinomialMetrics() - case H2OModelCategory.Ordinal if algoName == "glm" => new H2OOrdinalGLMMetrics() + case 
H2OModelCategory.Ordinal if Set("glm", "gam").contains(algoName) => new H2OOrdinalGLMMetrics() case H2OModelCategory.Ordinal => new H2OOrdinalMetrics() - case H2OModelCategory.Regression if algoName == "glm" => new H2ORegressionGLMMetrics() + case H2OModelCategory.Regression if Set("glm", "gam").contains(algoName) => new H2ORegressionGLMMetrics() case H2OModelCategory.Regression => new H2ORegressionMetrics() case H2OModelCategory.Clustering => new H2OClusteringMetrics() case H2OModelCategory.AnomalyDetection => new H2OAnomalyMetrics() diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala new file mode 100644 index 0000000000..427ea0213d --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.models.H2OKMeansMOJOModel +import hex.ModelMetrics.IndependentMetricBuilder +import hex.ModelMetricsClustering.IndependentMetricBuilderClustering +import hex.genmodel.GenModel +import hex.genmodel.algos.kmeans.KMeansMojoModel +import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} + +trait KmeansMetricCalculation { + self: H2OKMeansMOJOModel => + + override private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { + (rowData: RowData, wrapper: EasyPredictModelWrapper) => + { + val model = wrapper.m.asInstanceOf[KMeansMojoModel] + val rawData = new Array[Double](wrapper.m.nfeatures()) + wrapper.fillRawData(rowData, rawData) + if (model._standardize) { + GenModel.Kmeans_preprocessData(rawData, model._means, model._mults, model._modes) + } + rawData + } + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala new file mode 100644 index 0000000000..79157d8209 --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import java.io.File + +import ai.h2o.sparkling.ml.internals.H2OModelCategory +import ai.h2o.sparkling.ml.models.{H2OMOJOModel, RowConverter} +import ai.h2o.sparkling.ml.utils.{DatasetShape, SchemaUtils} +import com.google.gson.{GsonBuilder, JsonObject} +import hex._ +import hex.ModelMetrics.IndependentMetricBuilder +import hex.ModelMetricsBinomialGLM.{ModelMetricsMultinomialGLM, ModelMetricsOrdinalGLM} +import hex.genmodel.MojoModel +import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} +import org.apache.spark.SparkFiles +import org.apache.spark.sql.DataFrame +import water.api.{Schema, SchemaServer} +import water.api.schemas3._ +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types.DoubleType + +trait MetricCalculation { + self: H2OMOJOModel => + + /** + * Returns an object holding all metrics of the Double type and also more complex performance information + * calculated on a data frame passed as a parameter. + */ + def getMetricsObject(dataFrame: DataFrame): H2OMetrics = { + val gson = getMetricGson(dataFrame) + + val h2oMojo = unwrapMojoModel() + val modelCategory = H2OModelCategory.fromString(getModelCategory()) + + H2OMetrics.loadMetrics(gson, "realtime_metrics", h2oMojo._algoName, modelCategory, getDataFrameSerializer) + } + + /** + * Returns a map of all metrics of the Double type calculated on a data frame passed as a parameter. 
+ */ + def getMetrics(dataFrame: DataFrame): Map[String, Double] = { + val gson = getMetricGson(dataFrame) + val conversionInput = new JsonObject() + conversionInput.add("realtime_metrics", gson) + + extractMetrics(conversionInput, "realtime_metrics") + } + + private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { + (rowData: RowData, wrapper: EasyPredictModelWrapper) => + { + val rawData = new Array[Double](wrapper.m.nfeatures()) + wrapper.fillRawData(rowData, rawData) + } + } + + private[sparkling] def getPredictionGetter(): (EasyPredictModelWrapper, RowData, Double) => Array[Double] = { + (wrapper: EasyPredictModelWrapper, rowData: RowData, offset: Double) => + { + wrapper.preamble(wrapper.m.getModelCategory, rowData, offset) + } + } + + private[sparkling] def getMetricGson(dataFrame: DataFrame): JsonObject = { + val (preparedDF, offsetColOption, weightColOption) = validateAndPrepareDataFrameForMetricCalculation(dataFrame) + val configInitializers = getEasyPredictModelWrapperConfigurationInitializers() + MetricCalculationClosure.getMetricGson( + uid, + mojoFileName, + preparedDF, + offsetColOption, + weightColOption, + configInitializers, + getActualValuesExtractor(), + getPredictionGetter()) + } + + private[sparkling] def validateAndPrepareDataFrameForMetricCalculation( + dataFrame: DataFrame): (DataFrame, Option[String], Option[String]) = { + val flatDataFrame = DatasetShape.getDatasetShape(dataFrame.schema) match { + case DatasetShape.Flat => dataFrame + case DatasetShape.StructsOnly | DatasetShape.Nested => + SchemaUtils.appendFlattenedStructsToDataFrame(dataFrame, RowConverter.temporaryColumnPrefix) + } + + if (hasParam("labelCol")) { + val labelCol = getOrDefault(getParam("labelCol")).toString + if (labelCol != null && !flatDataFrame.columns.contains(labelCol)) { + throw new IllegalArgumentException( + s"DataFrame passed as a parameter does not contain label column '$labelCol'.") + } + } + val 
(offsetColCastedDF, offsetColOption) = + if (hasParam("offsetCol") && getOrDefault(getParam("offsetCol")) != null) { + val offsetCol = getOrDefault(getParam("offsetCol")).toString + if (!flatDataFrame.columns.contains(offsetCol)) { + throw new IllegalArgumentException( + s"DataFrame passed as a parameter does not contain offset column '$offsetCol'.") + } + (flatDataFrame.withColumn(offsetCol, col(offsetCol).cast(DoubleType)), Some(offsetCol)) + } else { + (flatDataFrame, None) + } + + val weightColTuple = if (hasParam("weightCol") && getOrDefault(getParam("weightCol")) != null) { + val weightCol = getOrDefault(getParam("weightCol")).toString + if (!flatDataFrame.columns.contains(weightCol)) { + throw new IllegalArgumentException( + s"DataFrame passed as a parameter does not contain weight column '$weightCol'.") + } + (offsetColCastedDF.withColumn(weightCol, col(weightCol).cast(DoubleType)), offsetColOption, Some(weightCol)) + } else { + (offsetColCastedDF, offsetColOption, None) + } + weightColTuple + } + +} + +object MetricCalculationClosure { + + private[sparkling] def makeMetricBuilder(mojoModel: MojoModel, mojoFileName: String): IndependentMetricBuilder[_] = { + val mojoFile = new File(SparkFiles.get(mojoFileName)) + MojoModel.loadMetricBuilder(mojoModel, mojoFile).asInstanceOf[IndependentMetricBuilder[_]] + } + + private[sparkling] def metricsToSchema(metrics: ModelMetrics): Schema[_, _] = { + val schemas = + MetricsCalculationTypeExtensions.SCHEMA_CLASSES.map(c => + Class.forName(c).getConstructor().newInstance().asInstanceOf[Schema[Nothing, Nothing]]) + schemas.foreach(SchemaServer.register) + val schema = SchemaServer.schema(3, metrics) + schema match { + case s: ModelMetricsBinomialGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsBinomialGLM]) + case s: ModelMetricsBinomialV3[ModelMetricsBinomial, _] => + s.fillFromImpl(metrics.asInstanceOf[ModelMetricsBinomial]) + case s: ModelMetricsMultinomialGLMV3 => 
s.fillFromImpl(metrics.asInstanceOf[ModelMetricsMultinomialGLM]) + case s: ModelMetricsMultinomialV3[ModelMetricsMultinomial, _] => + s.fillFromImpl(metrics.asInstanceOf[ModelMetricsMultinomial]) + case s: ModelMetricsOrdinalGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsOrdinalGLM]) + case s: ModelMetricsOrdinalV3[ModelMetricsOrdinal, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsOrdinal]) + case s: ModelMetricsRegressionCoxPHV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegressionCoxPH]) + case s: ModelMetricsRegressionGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegressionGLM]) + case s: ModelMetricsRegressionV3[ModelMetricsRegression, _] => + s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegression]) + case s: ModelMetricsClusteringV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsClustering]) + case s: ModelMetricsHGLMV3[ModelMetricsHGLM, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsHGLM]) + case s: ModelMetricsAutoEncoderV3 => s.fillFromImpl(metrics) + case s: ModelMetricsBaseV3[_, _] => s.fillFromImpl(metrics) + } + schema + } + + private[sparkling] def getMetricGson( + uid: String, + mojoFileName: String, + preparedDF: DataFrame, + offsetColOption: Option[String], + weightColOption: Option[String], + configInitializers: Seq[H2OMOJOModel.EasyPredictModelWrapperConfigurationInitializer], + extractActualValues: (RowData, EasyPredictModelWrapper) => Array[Double], + getPrediction: (EasyPredictModelWrapper, RowData, Double) => Array[Double]): JsonObject = { + val filledMetricsBuilder = preparedDF.rdd + .mapPartitions[IndependentMetricBuilder[_]] { rows => + val wrapper = H2OMOJOModel.loadEasyPredictModelWrapper(uid, mojoFileName, configInitializers) + val metricBuilder = makeMetricBuilder(wrapper.getModel.asInstanceOf[MojoModel], mojoFileName) + while (rows.hasNext) { + val row = rows.next() + val rowData = RowConverter.toH2ORowData(row) + val offset = offsetColOption match { + case Some(offsetCol) => 
row.getDouble(row.fieldIndex(offsetCol)) + case None => 0.0d + } + val weight = weightColOption match { + case Some(weightCol) => row.getDouble(row.fieldIndex(weightCol)) + case None => 1.0d + } + val prediction = getPrediction(wrapper, rowData, offset) + val actualValues = extractActualValues(rowData, wrapper) + metricBuilder.perRow(prediction, actualValues, weight, offset) + } + Iterator.single(metricBuilder) + } + .reduce((f, s) => { f.reduce(s); f }) + + val metrics = filledMetricsBuilder.makeModelMetrics() + val schema = metricsToSchema(metrics) + val json = schema.toJsonString + new GsonBuilder().create().fromJson(json, classOf[JsonObject]) + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala index ec32a099a2..996bb49d1c 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala @@ -17,7 +17,7 @@ package ai.h2o.sparkling.ml.models -import java.io.{File, InputStream} +import java.io.{BufferedReader, File, InputStream} import _root_.hex.genmodel.attributes.ModelJsonReader import _root_.hex.genmodel.easy.EasyPredictModelWrapper import _root_.hex.genmodel.{MojoModel, MojoReaderBackendFactory} @@ -35,6 +35,7 @@ import _root_.hex.genmodel.attributes.Table.ColumnType import ai.h2o.sparkling.api.generation.common.MetricNameConverter import ai.h2o.sparkling.ml.metrics.H2OMetrics import org.apache.spark.SparkFiles +import ai.h2o.sparkling.ml.metrics.{H2OMetrics, MetricCalculation} import org.apache.spark.expose.Logging import org.apache.spark.ml.Model import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -55,6 +56,7 @@ abstract class H2OMOJOModel with SpecificMOJOParameters with H2OBaseMOJOParams with HasFeatureTypesOnMOJO + with MetricCalculation with Logging { H2OMOJOCache.startCleanupThread() @@ -234,6 +236,11 @@ abstract class H2OMOJOModel 
private[sparkling] def getCrossValidationModelsAsArray(): Array[H2OMOJOModel] = crossValidationModels + private[sparkling] def getDetailsReader(): BufferedReader = { + val reader = MojoReaderBackendFactory.createReaderBackend(getMojo().getAbsolutePath) + reader.getTextFile(ModelJsonReader.MODEL_DETAILS_FILE) + } + private[sparkling] def setCrossValidationModels(models: Array[H2OMOJOModel]): this.type = { crossValidationModels = models if (models != null) { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala index 38af0a18dc..08f1be1689 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala @@ -20,6 +20,7 @@ package ai.h2o.sparkling.ml.models import ai.h2o.sparkling.ml.params.H2OSupervisedMOJOParams import hex.ModelCategory import hex.genmodel.MojoModel +import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.DataFrame import org.apache.spark.sql.expressions.UserDefinedFunction @@ -67,6 +68,15 @@ class H2OSupervisedMOJOModel(override val uid: String) extends H2OAlgorithmMOJOM flatDataFrame.withColumn(outputColumnName, udf(struct(args: _*))) } } + + override private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { + (rowData: RowData, wrapper: EasyPredictModelWrapper) => + { + val responseColumn = wrapper.m._responseColumn + val encodedActualValue = wrapper.extractRawDataValue(rowData, responseColumn) + Array[Double](encodedActualValue) + } + } } object H2OSupervisedMOJOModel extends H2OSpecificMOJOLoader[H2OSupervisedMOJOModel] From d8af501c0353fde13f0b1d0284c22cc9f9fd313d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Fri, 18 Mar 2022 19:58:39 +0100 Subject: [PATCH 02/37] 
Update api --- .../ml/metrics/H2OBinomialMetrics.scala | 67 +++++++ .../ml/metrics/H2OMultinomialMetrics.scala | 77 ++++++++ .../ml/metrics/H2ORegressionMetrics.scala | 67 +++++++ .../ml/metrics/MetricCalculation.scala | 174 ++++++------------ .../sparkling/ml/models/H2OMOJOModel.scala | 1 - 5 files changed, 266 insertions(+), 120 deletions(-) create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala create mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala new file mode 100644 index 0000000000..ff61cf90d2 --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.h2o.sparkling.ml.metrics + +import hex.ModelMetricsBinomial.IndependentMetricBuilderBinomial +import hex.genmodel.utils.DistributionFamily +import org.apache.spark.sql.DataFrame + +object H2OBinomialMetrics extends MetricCalculation { + + def calculate( + dataFrame: DataFrame, + domain: Array[String], + predictionProbabilitiesCol: String = "detailed_prediction.probabilities", + labelCol: String = "label", + weightColOption: Option[String] = None, + offsetColOption: Option[String] = None, + distributionFamily: String = "AUTO"): H2OBinomialMetrics = { + val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) + val getMetricBuilder = () => new IndependentMetricBuilderBinomial[_](domain, domainFamilyEnum) + + val gson = getMetricGson( + getMetricBuilder, + dataFrame, + predictionProbabilitiesCol, + labelCol, + offsetColOption, + weightColOption, + domain) + val result = new H2OBinomialMetrics() + result.setMetrics(gson, "H2OBinomialMetrics.calculate") + result + } + + def calculate( + dataFrame: DataFrame, + domain: Array[String], + predictionProbabilitiesCol: String, + labelCol: String, + weightCol: String, + offsetCol: String, + distributionFamily: String): Unit = { + calculate( + dataFrame, + domain, + predictionProbabilitiesCol, + labelCol, + Option(weightCol), + Option(offsetCol), + distributionFamily) + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala new file mode 100644 index 0000000000..0d7da7d268 --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson +import hex.ModelMetricsMultinomial.IndependentMetricBuilderMultinomial +import hex.MultinomialAucType +import org.apache.spark.sql.DataFrame + +object H2OMultinomialMetrics { + def calculate( + dataFrame: DataFrame, + domain: Array[String], + predictionProbabilitiesCol: String = "detailed_prediction.probabilities", + labelCol: String = "label", + weightColOption: Option[String] = None, + offsetColOption: Option[String] = None, + priorDistributionOption: Option[Array[Double]] = None, + aucType: String = "AUTO"): H2OMultinomialMetrics = { + + val aucTypeEnum = MultinomialAucType.valueOf(aucType) + val nclasses = domain.length + val priorDistribution = priorDistributionOption match { + case Some(x) => x + case None => null + } + val getMetricBuilder = + () => new IndependentMetricBuilderMultinomial[_](nclasses, domain, aucTypeEnum, priorDistribution) + + val gson = getMetricGson( + getMetricBuilder, + dataFrame, + predictionProbabilitiesCol, + labelCol, + offsetColOption, + weightColOption, + domain) + val result = new H2OMultinomialMetrics() + result.setMetrics(gson, "H2OMultinomialMetrics.calculate") + result + } + + def calculate( + dataFrame: DataFrame, + domain: Array[String], + predictionProbabilitiesCol: String, + labelCol: String, + weightCol: String, + offsetCol: 
String, + priorDistribution: Array[Double], + aucType: String): H2OMultinomialMetrics = { + calculate( + dataFrame, + domain, + predictionProbabilitiesCol, + labelCol, + Option(weightCol), + Option(offsetCol), + Option(priorDistribution), + aucType) + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala new file mode 100644 index 0000000000..98552725a6 --- /dev/null +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.h2o.sparkling.ml.metrics + +import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson +import hex.DistributionFactory +import hex.ModelMetricsRegression.IndependentMetricBuilderRegression +import hex.genmodel.utils.DistributionFamily +import org.apache.spark.sql.DataFrame + +object H2ORegressionMetrics { + + def calculate( + dataFrame: DataFrame, + predictionCol: String = "prediction", + labelCol: String = "label", + weightColOption: Option[String] = None, + offsetColOption: Option[String] = None, + distributionFamily: String = "AUTO"): H2ORegressionMetrics = { + val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) + val distribution= DistributionFactory.getDistribution(domainFamilyEnum) + val getMetricBuilder = () => new IndependentMetricBuilderRegression[_](distribution) + + val gson = getMetricGson( + getMetricBuilder, + dataFrame, + predictionCol, + labelCol, + offsetColOption, + weightColOption, + null) + val result = new H2ORegressionMetrics() + result.setMetrics(gson, "H2ORegressionMetrics.calculate") + result + } + + def calculate( + dataFrame: DataFrame, + predictionCol: String, + labelCol: String, + weightCol: String, + offsetCol: String, + distributionFamily: String): H2ORegressionMetrics = { + calculate( + dataFrame, + predictionCol, + labelCol, + Option(weightCol), + Option(offsetCol), + distributionFamily) + } +} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 79157d8209..ec02bdb4d3 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -17,127 +17,54 @@ package ai.h2o.sparkling.ml.metrics -import java.io.File - -import ai.h2o.sparkling.ml.internals.H2OModelCategory -import ai.h2o.sparkling.ml.models.{H2OMOJOModel, RowConverter} +import 
ai.h2o.sparkling.ml.models.RowConverter import ai.h2o.sparkling.ml.utils.{DatasetShape, SchemaUtils} import com.google.gson.{GsonBuilder, JsonObject} import hex._ import hex.ModelMetrics.IndependentMetricBuilder -import hex.ModelMetricsBinomialGLM.{ModelMetricsMultinomialGLM, ModelMetricsOrdinalGLM} -import hex.genmodel.MojoModel -import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} -import org.apache.spark.SparkFiles +import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.sql.DataFrame import water.api.{Schema, SchemaServer} import water.api.schemas3._ -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, StringType} trait MetricCalculation { - self: H2OMOJOModel => - - /** - * Returns an object holding all metrics of the Double type and also more complex performance information - * calculated on a data frame passed as a parameter. - */ - def getMetricsObject(dataFrame: DataFrame): H2OMetrics = { - val gson = getMetricGson(dataFrame) - - val h2oMojo = unwrapMojoModel() - val modelCategory = H2OModelCategory.fromString(getModelCategory()) - - H2OMetrics.loadMetrics(gson, "realtime_metrics", h2oMojo._algoName, modelCategory, getDataFrameSerializer) - } - - /** - * Returns a map of all metrics of the Double type calculated on a data frame passed as a parameter. 
- */ - def getMetrics(dataFrame: DataFrame): Map[String, Double] = { - val gson = getMetricGson(dataFrame) - val conversionInput = new JsonObject() - conversionInput.add("realtime_metrics", gson) - - extractMetrics(conversionInput, "realtime_metrics") - } - - private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { - (rowData: RowData, wrapper: EasyPredictModelWrapper) => - { - val rawData = new Array[Double](wrapper.m.nfeatures()) - wrapper.fillRawData(rowData, rawData) - } - } - - private[sparkling] def getPredictionGetter(): (EasyPredictModelWrapper, RowData, Double) => Array[Double] = { - (wrapper: EasyPredictModelWrapper, rowData: RowData, offset: Double) => - { - wrapper.preamble(wrapper.m.getModelCategory, rowData, offset) - } - } - - private[sparkling] def getMetricGson(dataFrame: DataFrame): JsonObject = { - val (preparedDF, offsetColOption, weightColOption) = validateAndPrepareDataFrameForMetricCalculation(dataFrame) - val configInitializers = getEasyPredictModelWrapperConfigurationInitializers() - MetricCalculationClosure.getMetricGson( - uid, - mojoFileName, - preparedDF, - offsetColOption, - weightColOption, - configInitializers, - getActualValuesExtractor(), - getPredictionGetter()) - } - private[sparkling] def validateAndPrepareDataFrameForMetricCalculation( - dataFrame: DataFrame): (DataFrame, Option[String], Option[String]) = { + private[sparkling] def getFlattenDataFrame(dataFrame: DataFrame): DataFrame = { val flatDataFrame = DatasetShape.getDatasetShape(dataFrame.schema) match { case DatasetShape.Flat => dataFrame case DatasetShape.StructsOnly | DatasetShape.Nested => SchemaUtils.appendFlattenedStructsToDataFrame(dataFrame, RowConverter.temporaryColumnPrefix) } + flatDataFrame + } + + private[sparkling] def validateDataFrameForMetricCalculation( + flatDataFrame: DataFrame, + labelCol: String, + offsetColOption: Option[String], + weightColOption: Option[String]): Unit = { + + if (labelCol != null 
&& !flatDataFrame.columns.contains(labelCol)) { + throw new IllegalArgumentException( + s"DataFrame passed as a parameter does not contain label column '$labelCol'.") + } - if (hasParam("labelCol")) { - val labelCol = getOrDefault(getParam("labelCol")).toString - if (labelCol != null && !flatDataFrame.columns.contains(labelCol)) { + if (offsetColOption.isDefined) { + val offsetCol = offsetColOption.get + if (!flatDataFrame.columns.contains(offsetCol)) { throw new IllegalArgumentException( - s"DataFrame passed as a parameter does not contain label column '$labelCol'.") + s"DataFrame passed as a parameter does not contain offset column '$offsetCol'.") } } - val (offsetColCastedDF, offsetColOption) = - if (hasParam("offsetCol") && getOrDefault(getParam("offsetCol")) != null) { - val offsetCol = getOrDefault(getParam("offsetCol")).toString - if (!flatDataFrame.columns.contains(offsetCol)) { - throw new IllegalArgumentException( - s"DataFrame passed as a parameter does not contain offset column '$offsetCol'.") - } - (flatDataFrame.withColumn(offsetCol, col(offsetCol).cast(DoubleType)), Some(offsetCol)) - } else { - (flatDataFrame, None) - } - val weightColTuple = if (hasParam("weightCol") && getOrDefault(getParam("weightCol")) != null) { - val weightCol = getOrDefault(getParam("weightCol")).toString + if (weightColOption.isDefined) { + val weightCol = weightColOption.get if (!flatDataFrame.columns.contains(weightCol)) { throw new IllegalArgumentException( s"DataFrame passed as a parameter does not contain weight column '$weightCol'.") } - (offsetColCastedDF.withColumn(weightCol, col(weightCol).cast(DoubleType)), offsetColOption, Some(weightCol)) - } else { - (offsetColCastedDF, offsetColOption, None) } - weightColTuple - } - -} - -object MetricCalculationClosure { - - private[sparkling] def makeMetricBuilder(mojoModel: MojoModel, mojoFileName: String): IndependentMetricBuilder[_] = { - val mojoFile = new File(SparkFiles.get(mojoFileName)) - 
MojoModel.loadMetricBuilder(mojoModel, mojoFile).asInstanceOf[IndependentMetricBuilder[_]] } private[sparkling] def metricsToSchema(metrics: ModelMetrics): Schema[_, _] = { @@ -147,42 +74,34 @@ object MetricCalculationClosure { schemas.foreach(SchemaServer.register) val schema = SchemaServer.schema(3, metrics) schema match { - case s: ModelMetricsBinomialGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsBinomialGLM]) case s: ModelMetricsBinomialV3[ModelMetricsBinomial, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsBinomial]) - case s: ModelMetricsMultinomialGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsMultinomialGLM]) case s: ModelMetricsMultinomialV3[ModelMetricsMultinomial, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsMultinomial]) - case s: ModelMetricsOrdinalGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsOrdinalGLM]) - case s: ModelMetricsOrdinalV3[ModelMetricsOrdinal, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsOrdinal]) - case s: ModelMetricsRegressionCoxPHV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegressionCoxPH]) - case s: ModelMetricsRegressionGLMV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegressionGLM]) case s: ModelMetricsRegressionV3[ModelMetricsRegression, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsRegression]) - case s: ModelMetricsClusteringV3 => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsClustering]) - case s: ModelMetricsHGLMV3[ModelMetricsHGLM, _] => s.fillFromImpl(metrics.asInstanceOf[ModelMetricsHGLM]) - case s: ModelMetricsAutoEncoderV3 => s.fillFromImpl(metrics) - case s: ModelMetricsBaseV3[_, _] => s.fillFromImpl(metrics) } schema } private[sparkling] def getMetricGson( - uid: String, - mojoFileName: String, - preparedDF: DataFrame, + createMetricBuilder: () => IndependentMetricBuilder[_], + dataFrame: DataFrame, + predictionCol: String, + labelCol: String, offsetColOption: Option[String], weightColOption: Option[String], - configInitializers: 
Seq[H2OMOJOModel.EasyPredictModelWrapperConfigurationInitializer], - extractActualValues: (RowData, EasyPredictModelWrapper) => Array[Double], - getPrediction: (EasyPredictModelWrapper, RowData, Double) => Array[Double]): JsonObject = { - val filledMetricsBuilder = preparedDF.rdd + domain: Array[String]): JsonObject = { + val flatDF = getFlattenDataFrame(dataFrame) + val predictionType = flatDF.schema.fields.find(f => f.name == predictionCol).get.dataType + val predictionColIndex = flatDF.schema.indexOf(predictionCol) + val actualType = flatDF.schema.fields.find(f => f.name == labelCol).get.dataType + val actualColIndex = flatDF.schema.indexOf(labelCol) + val filledMetricsBuilder = flatDF.rdd .mapPartitions[IndependentMetricBuilder[_]] { rows => - val wrapper = H2OMOJOModel.loadEasyPredictModelWrapper(uid, mojoFileName, configInitializers) - val metricBuilder = makeMetricBuilder(wrapper.getModel.asInstanceOf[MojoModel], mojoFileName) + val metricBuilder = createMetricBuilder() while (rows.hasNext) { val row = rows.next() - val rowData = RowConverter.toH2ORowData(row) val offset = offsetColOption match { case Some(offsetCol) => row.getDouble(row.fieldIndex(offsetCol)) case None => 0.0d @@ -191,9 +110,26 @@ object MetricCalculationClosure { case Some(weightCol) => row.getDouble(row.fieldIndex(weightCol)) case None => 1.0d } - val prediction = getPrediction(wrapper, rowData, offset) - val actualValues = extractActualValues(rowData, wrapper) - metricBuilder.perRow(prediction, actualValues, weight, offset) + val prediction = predictionType match { + case ArrayType(DoubleType, _) => row.getSeq[Double](predictionColIndex).toArray + case ArrayType(FloatType, _) => row.getSeq[Float](predictionColIndex).map(_.toDouble).toArray + case DoubleType => Array(row.getDouble(predictionColIndex)) + case FloatType => Array(row.getFloat(predictionColIndex).toDouble) + case v if ExposeUtils.isMLVectorUDT(v) => + val vector = row.getAs[ml.linalg.Vector](predictionColIndex) + 
vector.toDense.values + case _: mllib.linalg.VectorUDT => + val vector = row.getAs[mllib.linalg.Vector](predictionColIndex) + vector.toDense.values + } + val actualValue = actualType match { + case StringType => + val label = row.getString(actualColIndex) + domain.indexOf(label).toDouble + case DoubleType => row.getDouble(actualColIndex) + case FloatType => row.getFloat(actualColIndex) + } + metricBuilder.perRow(prediction, Array(actualValue), weight, offset) } Iterator.single(metricBuilder) } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala index 996bb49d1c..dedf8445f4 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala @@ -56,7 +56,6 @@ abstract class H2OMOJOModel with SpecificMOJOParameters with H2OBaseMOJOParams with HasFeatureTypesOnMOJO - with MetricCalculation with Logging { H2OMOJOCache.startCleanupThread() From 5ed9b346569932dc6c30f792199c56f2adaef193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 12:26:43 +0100 Subject: [PATCH 03/37] Remove unrelated tests --- .../AnomalyDetectionMetricsTestSuite.scala | 55 ------ .../metrics/AutoEncoderMetricsTestSuite.scala | 56 ------ .../metrics/ClusteringMetricsTestSuite.scala | 76 -------- .../DimReductionMetricsTestSuite.scala | 84 -------- .../ml/metrics/OrdinalMetricsTestSuite.scala | 182 ------------------ 5 files changed, 453 deletions(-) delete mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala delete mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala delete mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala delete mode 100644 ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala delete mode 100644 
ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala deleted file mode 100644 index 54da3bf5ec..0000000000 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AnomalyDetectionMetricsTestSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.algos.H2OIsolationForest -import ai.h2o.sparkling.ml.features.H2OAutoEncoder -import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.SparkSession -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FunSuite, Matchers} - -@RunWith(classOf[JUnitRunner]) -class AnomalyDetectionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { - - override def createSparkSession(): SparkSession = sparkSession("local[*]") - - private lazy val trainingDataset = spark.read - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/anomaly/ecg_discord_train.csv")) - - private lazy val validationDataset = spark.read - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/anomaly/ecg_discord_test.csv")) - - ignore("test calculation of isolation forest metric objects on arbitrary dataset") { - val algorithm = new H2OIsolationForest() - .setSeed(42) -// .setValidationDataFrame(validationDataset) - - val model = algorithm.fit(trainingDataset) - - MetricsAssertions.assertEssentialMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00001) - } -} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala deleted file mode 100644 index 73b76b8798..0000000000 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/AutoEncoderMetricsTestSuite.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.features.H2OAutoEncoder -import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.SparkSession -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FunSuite, Matchers} - -@RunWith(classOf[JUnitRunner]) -class AutoEncoderMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { - - override def createSparkSession(): SparkSession = sparkSession("local[*]") - - private lazy val dataset = spark.read - .option("header", "true") - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) - private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) - - test("test calculation of autoencoder metric objects on arbitrary dataset") { - val algorithm = new H2OAutoEncoder() - .setSeed(1) - .setInputCols("DCAPS", "PSA", "VOL") - .setValidationDataFrame(validationDataset) - .setOutputCol("Output") - .setHidden(Array(3)) - .setReproducible(true) - - val model = algorithm.fit(trainingDataset) - - MetricsAssertions.assertEssentialMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00001) - } -} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala deleted file mode 100644 index 0fdbea4625..0000000000 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/ClusteringMetricsTestSuite.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.algos.H2OKMeans -import ai.h2o.sparkling.ml.models.{H2OKMeansMOJOModel, H2OMOJOModel} -import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FunSuite, Matchers} - -@RunWith(classOf[JUnitRunner]) -class ClusteringMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { - - override def createSparkSession(): SparkSession = sparkSession("local[*]") - - private lazy val dataset = spark.read - .option("header", "true") - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/iris/iris_wheader.csv")) - - private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) - - test("test calculation of kmeans metric objects on arbitrary dataset") { - val algorithm = new H2OKMeans() - .setValidationDataFrame(validationDataset) - .setSeed(1) - .setK(3) - .setUserPoints(Array(Array(4.9, 3.0, 1.4, 0.2), Array(5.6, 2.5, 3.9, 1.1), Array(6.5, 3.0, 5.2, 2.0))) - .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") - - val model = algorithm.fit(trainingDataset) - - assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance = 0.00001) - } - - private def assertMetrics( - model: H2OKMeansMOJOModel, - trainingDataset: DataFrame, - validationDataset: DataFrame, - trainingMetricsTolerance: Double = 0.0, - validationMetricsTolerance: Double = 0.0): Unit = { - MetricsAssertions.assertEssentialMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance, - validationMetricsTolerance) - TestUtils.assertDataFramesAreEqual( - model.getTrainingMetricsObject().getCentroidStats(), - model.getMetricsObject(trainingDataset).getCentroidStats(), - "Centroid", - Map("Size" -> trainingMetricsTolerance, "Within Cluster Sum of Squares" -> 
trainingMetricsTolerance)) - TestUtils.assertDataFramesAreEqual( - model.getValidationMetricsObject().getCentroidStats(), - model.getMetricsObject(validationDataset).getCentroidStats(), - "Centroid", - Map("Size" -> validationMetricsTolerance, "Within Cluster Sum of Squares" -> validationMetricsTolerance)) - } -} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala deleted file mode 100644 index f61eac5cc4..0000000000 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/DimReductionMetricsTestSuite.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.features.{H2OGLRM, H2OPCA} -import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.SparkSession -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FunSuite, Matchers} - -@RunWith(classOf[JUnitRunner]) -class DimReductionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { - - override def createSparkSession(): SparkSession = sparkSession("local[*]") - - private lazy val dataset = spark.read - .option("header", "true") - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/prostate/prostate.csv")) - private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), seed = 42L) - - test("test calculation of H2OPCA metrics on arbitrary dataset") { - val algorithm = new H2OPCA() - .setSeed(1) - .setInputCols("RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") - .setOutputCol("Output") - .setValidationDataFrame(validationDataset) - .setImputeMissing(true) - .setPcaMethod("Power") - .setK(3) - - val model = algorithm.fit(trainingDataset) - - MetricsAssertions.assertEssentialMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00001) - } - - ignore("test calculation of H2OGLRM metrics on arbitrary dataset") { - val Array(rawTrainingDataset, rawValidationDataset) = dataset.randomSplit(Array(0.5, 0.5), seed = 42) - val glrmTrainingDataset = rawTrainingDataset.limit(150) - val glrmValidationDataset = rawValidationDataset.limit(150) - - val algorithm = new H2OGLRM() - .setSeed(1) - .setValidationDataFrame(glrmValidationDataset) - .setInputCols("RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") - .setOutputCol("Output") - .setWithReconstructedCol(true) - .setReconstructedCol("Reconstructed") - .setK(3) - - val model = algorithm.fit(glrmTrainingDataset) - println(model.getTrainingMetrics()) 
- println(model.getMetrics(glrmTrainingDataset)) - println(model.getValidationMetrics()) - println(model.getMetrics(glrmValidationDataset)) - MetricsAssertions.assertEssentialMetrics( - model, - glrmTrainingDataset, - glrmValidationDataset, - trainingMetricsTolerance = Double.PositiveInfinity, - validationMetricsTolerance = 0.00001) - } -} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala deleted file mode 100644 index 2ee3f53ddd..0000000000 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/OrdinalMetricsTestSuite.scala +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.algos.{H2OGAM, H2OGLM} -import ai.h2o.sparkling.ml.models.{H2OGLMMOJOModel, H2OMOJOModel} -import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.functions.monotonically_increasing_id -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FunSuite, Matchers} - -@RunWith(classOf[JUnitRunner]) -class OrdinalMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { - - override def createSparkSession(): SparkSession = sparkSession("local[*]") - - private lazy val dataset = spark.read - .option("header", "true") - .option("inferSchema", "true") - .csv(TestUtils.locate("smalldata/insurance.csv")) - .withColumn("ID", monotonically_increasing_id) - .repartition(20) - - private lazy val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.8, 0.2), 42L) - - private def assertMetrics[T](model: H2OMOJOModel): Unit = { - assertMetrics(model.getTrainingMetricsObject(), model.getTrainingMetrics()) - assertMetrics(model.getValidationMetricsObject(), model.getValidationMetrics()) - assert(model.getCrossValidationMetricsObject() == null) - assert(model.getCrossValidationMetrics() == Map()) - } - - private def assertMetrics(metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { - metricsObject.isInstanceOf[H2OOrdinalGLMMetrics] should be(true) - MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) - } - - private def assertMetrics( - model: H2OMOJOModel, - trainingDataset: DataFrame, - validationDataset: DataFrame, - trainingMetricsTolerance: Double = 0.0, - validationMetricsTolerance: Double = 0.0): Unit = { - MetricsAssertions.assertEssentialMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance, - validationMetricsTolerance) - - val trainingMetricObject = 
model.getMetricsObject(trainingDataset).asInstanceOf[H2OOrdinalGLMMetrics] - val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OOrdinalGLMMetrics] - TestUtils.assertDataFramesAreIdentical( - trainingMetricObject.getConfusionMatrix(), - expectedTrainingMetricObject.getConfusionMatrix()) - TestUtils.assertDataFramesAreEqual( - trainingMetricObject.getHitRatioTable(), - expectedTrainingMetricObject.getHitRatioTable(), - "K", - trainingMetricsTolerance) - - val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OOrdinalGLMMetrics] - val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OOrdinalGLMMetrics] - TestUtils.assertDataFramesAreIdentical( - validationMetricObject.getConfusionMatrix(), - expectedValidationMetricObject.getConfusionMatrix()) - TestUtils.assertDataFramesAreEqual( - validationMetricObject.getHitRatioTable(), - expectedValidationMetricObject.getHitRatioTable(), - "K", - validationMetricsTolerance) - } - - test("test ordinal glm metric objects") { - val algo = new H2OGLM() - .setSplitRatio(0.8) - .setFeaturesCols("District", "Group", "Claims") - .setLabelCol("Age") - .setSeed(1) - .setFamily("ordinal") - val model = algo.fit(dataset) - assertMetrics[H2OOrdinalMetrics](model) - - model.write.overwrite().save("ml/build/glm_ordinal_model_metrics") - val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_ordinal_model_metrics") - assertMetrics[H2OOrdinalGLMMetrics](loadedModel) - } - - test("test calculation of ordinal H2OGLM metrics on arbitrary dataset") { - val algo = new H2OGLM() - .setValidationDataFrame(validationDataset) - .setFeaturesCols("District", "Group", "Claims") - .setLabelCol("Age") - .setSeed(1) - .setFamily("ordinal") - val model = algo.fit(trainingDataset) - - assertMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00000001) - } - - test("test calculation of ordinal 
H2OGLM metrics with weightCol set on arbitrary dataset") { - val algo = new H2OGLM() - .setValidationDataFrame(validationDataset) - .setFeaturesCols("District", "Group", "Claims") - .setLabelCol("Age") - .setSeed(1) - .setFamily("ordinal") - .setWeightCol("ID") - val model = algo.fit(trainingDataset) - - assertMetrics( - model, - trainingDataset, - validationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00000001) - } - - def gamTrainingDataset = trainingDataset.repartition(1) - def gamValidationDataset = validationDataset.repartition(1) - - // TODO: investigate why metrics are different - ignore("test calculation of ordinal H2OGAM metrics on arbitrary dataset") { - val algo = new H2OGAM() - .setValidationDataFrame(gamValidationDataset) - .setFeaturesCols("District", "Group") - .setGamCols(Array("Claims")) - .setLabelCol("Age") - .setSeed(1) - .setFamily("ordinal") - val model = algo.fit(gamTrainingDataset) - - assertMetrics( - model, - gamTrainingDataset, - gamValidationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00000001) - } - - // TODO: investigate why metrics are different - ignore("test calculation of ordinal H2OGAM metrics with weightCol set on arbitrary dataset") { - val algo = new H2OGAM() - .setValidationDataFrame(gamValidationDataset) - .setFeaturesCols("District", "Group") - .setGamCols(Array("Claims")) - .setLabelCol("Age") - .setSeed(1) - .setFamily("ordinal") - .setWeightCol("ID") - val model = algo.fit(gamTrainingDataset) - - assertMetrics( - model, - gamTrainingDataset, - gamValidationDataset, - trainingMetricsTolerance = 0.00001, - validationMetricsTolerance = 0.00000001) - } -} From c50fffe7a28240ac1ec54da6caeb2a64c6d591b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 12:56:17 +0100 Subject: [PATCH 04/37] Remove unrelated logic --- doc/src/site/sphinx/deployment/load_mojo.rst | 13 +----- .../ml/algos/H2OKMeansTestSuite.scala | 7 ---- 
.../sparkling/ml/params/H2OMOJOModelParams.py | 15 ------- .../ai/h2o/sparkling/ml/models/H2OMOJOModel.R | 9 ---- r/src/tests/testthat/testMojo.R | 28 ------------- .../ml/metrics/GLRMMetricCalculation.scala | 33 --------------- .../ml/metrics/KmeansMetricCalculation.scala | 42 ------------------- .../sparkling/ml/models/H2OMOJOModel.scala | 10 +---- .../ml/models/H2OSupervisedMOJOModel.scala | 10 ----- 9 files changed, 4 insertions(+), 163 deletions(-) delete mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala delete mode 100644 scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala diff --git a/doc/src/site/sphinx/deployment/load_mojo.rst b/doc/src/site/sphinx/deployment/load_mojo.rst index e4b5377e07..1c549b61b3 100644 --- a/doc/src/site/sphinx/deployment/load_mojo.rst +++ b/doc/src/site/sphinx/deployment/load_mojo.rst @@ -362,8 +362,8 @@ Obtaining Scoring History The method ``getScoringHistory`` returns a data frame describing how the model evolved during the training process according to a certain training and validation metrics. -Obtaining Pre-calculated Metrics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Obtaining Metrics +^^^^^^^^^^^^^^^^^ There are two sets of methods to obtain metrics from the MOJO model. @@ -389,15 +389,6 @@ the metrics could be also of a complex type. (see :ref:`metrics` for details) There is also the method ``getCurrentMetricsObject()`` working a similar way as ``getCurrentMetrics()``. -Calculation of Metrics on Arbitrary Dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The below two methods calculate metrics on a provided dataset. - -- ``getMetrics(dataFrame)`` - Returns a map with basic metrics of double type - -- ``getMetricsObject(dataFrame)`` - Returns an object with basic and more complex metrics available via getter methods. 
- (see :ref:`metrics` for details) - Obtaining Cross Validation Metrics Summary ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``getCrossValidationMetricsSummary`` method returns data frame with information about performance of individual folds diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala index e44c79a6a1..f5fd6e23f5 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OKMeansTestSuite.scala @@ -18,18 +18,11 @@ package ai.h2o.sparkling.ml.algos import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import com.google.gson.GsonBuilder -import hex.genmodel.MojoModel -import hex.genmodel.easy.EasyPredictModelWrapper -import hex.kmeans.KMeansModel -import hex.schemas.KMeansModelV3 -import org.apache.commons.io.IOUtils import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.{Row, SparkSession} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FunSuite, Matchers} -import water.AutoBuffer @RunWith(classOf[JUnitRunner]) class H2OKMeansTestSuite extends FunSuite with Matchers with SharedH2OTestContext { diff --git a/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py b/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py index c5fe9c9c07..8a323cef77 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/params/H2OMOJOModelParams.py @@ -33,21 +33,6 @@ def getModelSummary(self): def getDomainValues(self): return H2OTypeConverters.scalaMapStringDictStringToStringDictString(self._java_obj.getDomainValues()) - def getMetrics(self, dataFrame): - """ - :return: A map of all metrics of the float type calculated on a data frame passed as a parameter. 
- """ - self._transfer_params_to_java() - return H2OTypeConverters.scalaMapStringStringToDictStringAny(self._java_obj.getMetrics(dataFrame._jdf)) - - def getMetricsObject(self, dataFrame): - """ - :return: An object holding all metrics of the float type and also more complex performance information - calculated on a data frame passed as a parameter. - """ - self._transfer_params_to_java() - return H2OMetricsFactory.fromJavaObject(self._java_obj.getMetricsObject(dataFrame._jdf)) - def getTrainingMetrics(self): """ :return: A map of all metrics of the float type calculated on the training dataset. diff --git a/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R b/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R index 1bd3c4de58..8b1d7fab73 100644 --- a/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R +++ b/r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R @@ -49,15 +49,6 @@ H2OMOJOModel <- setRefClass("H2OMOJOModel", methods = list( getDomainValues = function() { invoke(.self$jmojo, "getDomainValues") }, - getMetrics = function(sparkFrame) { - sparkFrame <- spark_dataframe(sparkFrame) - invoke(.self$jmojo, "getMetrics", sparkFrame) - }, - getMetricsObject = function(sparkFrame) { - sparkFrame <- spark_dataframe(sparkFrame) - metrics <- invoke(.self$jmojo, "getMetricsObject", sparkFrame) - H2OMetricsFactory.fromJavaObject(metrics) - }, getTrainingMetrics = function() { invoke(.self$jmojo, "getTrainingMetrics") }, diff --git a/r/src/tests/testthat/testMojo.R b/r/src/tests/testthat/testMojo.R index 85aa61855b..3c830c8e5e 100644 --- a/r/src/tests/testthat/testMojo.R +++ b/r/src/tests/testthat/testMojo.R @@ -115,34 +115,6 @@ test_that("test model category", { expect_equal(category, "Binomial") }) -test_that("test metrics calculation", { - path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) - dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) - dataset <- dplyr::rename(dataset, capsule = CAPSULE) - model <- 
H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - metrics <- model$getMetrics(dataset) - expect_equal(as.character(metrics[["AUC"]]), "0.896878869021911") - expect_equal(length(metrics), 10) -}) - -test_that("test metrics object calculation", { - path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) - dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) - dataset <- dplyr::rename(dataset, capsule = CAPSULE) - model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - metrics <- model$getMetricsObject(dataset) - aucValue <- metrics$getAUC() - scoringTime <- metrics$getScoringTime() - - thresholdsAndScores <- metrics$getThresholdsAndMetricScores() - thresholdsAndScoresFrame <- dplyr::tally(thresholdsAndScores) - thresholdsAndScoresCount <- as.double(dplyr::collect(thresholdsAndScoresFrame)[[1]]) - - expect_equal(as.character(aucValue), "0.896878869021911") - expect_true(scoringTime > 0) - expect_true(thresholdsAndScoresCount > 0) -}) - test_that("test training metrics", { model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) metrics <- model$getTrainingMetrics() diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala deleted file mode 100644 index fa5e81996d..0000000000 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/GLRMMetricCalculation.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.models.H2OGLRMMOJOModel -import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} - -trait GLRMMetricCalculation { - self: H2OGLRMMOJOModel => - - private[sparkling] def getPredictionGetter(): (EasyPredictModelWrapper, RowData, Double) => Array[Double] = { - (wrapper: EasyPredictModelWrapper, rowData: RowData, offset: Double) => - { - val vpa = wrapper.predictDimReduction(rowData) - vpa.reconstructed - } - } -} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala deleted file mode 100644 index 427ea0213d..0000000000 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/KmeansMetricCalculation.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ai.h2o.sparkling.ml.metrics - -import ai.h2o.sparkling.ml.models.H2OKMeansMOJOModel -import hex.ModelMetrics.IndependentMetricBuilder -import hex.ModelMetricsClustering.IndependentMetricBuilderClustering -import hex.genmodel.GenModel -import hex.genmodel.algos.kmeans.KMeansMojoModel -import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} - -trait KmeansMetricCalculation { - self: H2OKMeansMOJOModel => - - override private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { - (rowData: RowData, wrapper: EasyPredictModelWrapper) => - { - val model = wrapper.m.asInstanceOf[KMeansMojoModel] - val rawData = new Array[Double](wrapper.m.nfeatures()) - wrapper.fillRawData(rowData, rawData) - if (model._standardize) { - GenModel.Kmeans_preprocessData(rawData, model._means, model._mults, model._modes) - } - rawData - } - } -} diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala index dedf8445f4..32fed08ea2 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala @@ -17,7 +17,7 @@ package ai.h2o.sparkling.ml.models -import java.io.{BufferedReader, File, InputStream} +import java.io.{File, InputStream} import _root_.hex.genmodel.attributes.ModelJsonReader import _root_.hex.genmodel.easy.EasyPredictModelWrapper import _root_.hex.genmodel.{MojoModel, MojoReaderBackendFactory} @@ -33,9 +33,8 @@ import 
org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ import _root_.hex.genmodel.attributes.Table.ColumnType import ai.h2o.sparkling.api.generation.common.MetricNameConverter -import ai.h2o.sparkling.ml.metrics.H2OMetrics import org.apache.spark.SparkFiles -import ai.h2o.sparkling.ml.metrics.{H2OMetrics, MetricCalculation} +import ai.h2o.sparkling.ml.metrics.H2OMetrics import org.apache.spark.expose.Logging import org.apache.spark.ml.Model import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -235,11 +234,6 @@ abstract class H2OMOJOModel private[sparkling] def getCrossValidationModelsAsArray(): Array[H2OMOJOModel] = crossValidationModels - private[sparkling] def getDetailsReader(): BufferedReader = { - val reader = MojoReaderBackendFactory.createReaderBackend(getMojo().getAbsolutePath) - reader.getTextFile(ModelJsonReader.MODEL_DETAILS_FILE) - } - private[sparkling] def setCrossValidationModels(models: Array[H2OMOJOModel]): this.type = { crossValidationModels = models if (models != null) { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala index 08f1be1689..38af0a18dc 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OSupervisedMOJOModel.scala @@ -20,7 +20,6 @@ package ai.h2o.sparkling.ml.models import ai.h2o.sparkling.ml.params.H2OSupervisedMOJOParams import hex.ModelCategory import hex.genmodel.MojoModel -import hex.genmodel.easy.{EasyPredictModelWrapper, RowData} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.DataFrame import org.apache.spark.sql.expressions.UserDefinedFunction @@ -68,15 +67,6 @@ class H2OSupervisedMOJOModel(override val uid: String) extends H2OAlgorithmMOJOM flatDataFrame.withColumn(outputColumnName, udf(struct(args: _*))) } } - - override 
private[sparkling] def getActualValuesExtractor(): (RowData, EasyPredictModelWrapper) => Array[Double] = { - (rowData: RowData, wrapper: EasyPredictModelWrapper) => - { - val responseColumn = wrapper.m._responseColumn - val encodedActualValue = wrapper.extractRawDataValue(rowData, responseColumn) - Array[Double](encodedActualValue) - } - } } object H2OSupervisedMOJOModel extends H2OSpecificMOJOLoader[H2OSupervisedMOJOModel] From aeb48eb085232b80118145f2f991f1c5c186d40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 15:00:00 +0100 Subject: [PATCH 05/37] Remove unrelated logic --- .../common/AlgorithmConfigurations.scala | 37 +++++++------------ .../common/AlgorithmSubstitutionContext.scala | 1 - .../common/AutoMLConfiguration.scala | 2 +- .../FeatureEstimatorConfigurations.scala | 1 - .../common/GridSearchConfiguration.scala | 2 +- .../generation/scala/MOJOModelTemplate.scala | 10 +---- .../sparkling/ml/models/H2OMOJOModel.scala | 2 +- 7 files changed, 18 insertions(+), 37 deletions(-) diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala index ca843c4694..9a6a44817b 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala @@ -112,7 +112,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { type KMeansParamsV3 = KMeansV3.KMeansParametersV3 val explicitDefaultValues = - Map[String, Any]("max_w2" -> 3.402823e38f, "response_column" -> "label", "model_id" -> null) + Map[String, Any]("max_w2" -> 3.402823e38f, "response_column" -> "label", "model_id" -> null, "lambda" -> null) val noDeprecation = Seq.empty @@ -173,34 +173,25 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration 
{ type IFParameters = IsolationForestParameters - val none = Seq.empty - - val algorithms = Seq[(String, Class[_], String, Seq[String], Seq[String], Option[String])]( - ("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), none, None), - ("H2OGBM", classOf[GBMParameters], treeSupervised, Seq(withDistribution), none, None), - ("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), none, None), - ("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), none, Some("H2OGLMMetrics")), - ("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), none, None), - ("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), none, None), - ("H2ORuleFit", classOf[RuleFitParameters], supervised, Seq(withDistribution), none, None), - ( - "H2OKMeans", - classOf[KMeansParameters], - unsupervised, - Seq("H2OKMeansExtras"), - Seq("KmeansMetricCalculation"), - Some("H2OClusteringMetrics")), - ("H2OCoxPH", classOf[CoxPHParameters], supervised, none, none, Some("H2ORegressionCoxPHMetrics")), - ("H2OIsolationForest", classOf[IFParameters], treeUnsupervised, none, none, Some("H2OAnomalyMetrics"))) - - for ((entityName, h2oParametersClass: Class[_], algorithmType, extraParents, extraMOJOParents, metricsClass) <- algorithms) + val algorithms = Seq[(String, Class[_], String, Seq[String], Option[String])]( + ("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), None), + ("H2OGBM", classOf[GBMParameters], treeSupervised, Seq(withDistribution), None), + ("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), None), + ("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), Some("H2OGLMMetrics")), + ("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), None), + ("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), None), + ("H2ORuleFit", classOf[RuleFitParameters], supervised, Seq(withDistribution), 
None), + ("H2OKMeans", classOf[KMeansParameters], unsupervised, Seq("H2OKMeansExtras"), Some("H2OClusteringMetrics")), + ("H2OCoxPH", classOf[CoxPHParameters], supervised, Seq.empty, Some("H2ORegressionCoxPHMetrics")), + ("H2OIsolationForest", classOf[IFParameters], treeUnsupervised, Seq.empty, Some("H2OAnomalyMetrics"))) + + for ((entityName, h2oParametersClass: Class[_], algorithmType, extraParents, metricsClass) <- algorithms) yield AlgorithmSubstitutionContext( namespace = "ai.h2o.sparkling.ml.algos", entityName, h2oParametersClass, algorithmType, extraParents, - extraMOJOParents, specificMetricsClass = metricsClass) } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala index a5ac7105e0..afb4975de2 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmSubstitutionContext.scala @@ -23,7 +23,6 @@ case class AlgorithmSubstitutionContext( h2oSchemaClass: Class[_], algorithmType: String, extraInheritedEntities: Seq[String] = Seq.empty, - extraInheritedEntitiesOnMOJO: Seq[String] = Seq.empty, constructorMethods: Boolean = true, specificMetricsClass: Option[String] = None) extends SubstitutionContextBase diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala index 8a0822344e..724724e0f9 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala @@ -52,7 +52,7 @@ class AutoMLConfiguration extends SingleAlgorithmConfiguration { null, 
"H2OSupervisedAlgorithmWithFoldColumn", Seq("H2OAutoMLExtras"), - constructorMethods = false)) + false)) } override def problemSpecificAlgorithmConfiguration: Seq[ProblemSpecificAlgorithmSubstitutionContext] = { diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala index 0eeccb76ba..b74ba73c43 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/FeatureEstimatorConfigurations.scala @@ -115,7 +115,6 @@ class FeatureEstimatorConfigurations extends MultipleAlgorithmsConfiguration { override def algorithmConfiguration: Seq[AlgorithmSubstitutionContext] = { - def none = Seq.empty[String] val algorithms = Seq[(String, Class[_], String, Option[String])]( ("H2OAutoEncoder", classOf[DeepLearningParameters], "H2OAutoEncoderBase", Some("H2OAutoEncoderMetrics")), ("H2OPCA", classOf[PCAParameters], "H2ODimReductionEstimator", Some("H2OPCAMetrics")), diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala index ba0e8ae10a..2e5a0241b3 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/GridSearchConfiguration.scala @@ -71,6 +71,6 @@ class GridSearchConfiguration extends SingleAlgorithmConfiguration { null, "H2OAlgorithm", Seq("H2OGridSearchExtras"), - constructorMethods = false)) + false)) } } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala 
b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala index 39d783055a..9b925f77b9 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/scala/MOJOModelTemplate.scala @@ -46,7 +46,6 @@ object MOJOModelTemplate val imports = Seq( "com.google.gson.JsonObject", "ai.h2o.sparkling.ml.params.ParameterConstructorMethods", - "ai.h2o.sparkling.ml.metrics._", "hex.genmodel.MojoModel", "org.apache.spark.expose.Logging", "ai.h2o.sparkling.utils.DataFrameSerializationWrappers._") ++ @@ -60,9 +59,7 @@ object MOJOModelTemplate .replace("Estimator", "MOJOModel") .replaceFirst("Base$", "MOJOBase"), "ParameterConstructorMethods", - "Logging") ++ - explicitFieldImplementations ++ - algorithmSubstitutionContext.extraInheritedEntitiesOnMOJO + "Logging") ++ explicitFieldImplementations val entityName = algorithmSubstitutionContext.entityName val entityParameters = "(override val uid: String)" @@ -215,11 +212,6 @@ object MOJOModelTemplate | override def getCrossValidationMetricsObject(): $metrics = { | val value = super.getCrossValidationMetricsObject() | if (value == null) null else value.asInstanceOf[$metrics] - | } - | - | override def getMetricsObject(dataFrame: org.apache.spark.sql.DataFrame): $metrics = { - | val value = super.getMetricsObject(dataFrame) - | if (value == null) null else value.asInstanceOf[$metrics] | }""".stripMargin } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala index 32fed08ea2..ec32a099a2 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/models/H2OMOJOModel.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ import 
_root_.hex.genmodel.attributes.Table.ColumnType import ai.h2o.sparkling.api.generation.common.MetricNameConverter -import org.apache.spark.SparkFiles import ai.h2o.sparkling.ml.metrics.H2OMetrics +import org.apache.spark.SparkFiles import org.apache.spark.expose.Logging import org.apache.spark.ml.Model import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema From e25801a47d99542774d1c4b35a2516406bba841f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 15:04:45 +0100 Subject: [PATCH 06/37] Remove extra dependency --- extensions/build.gradle | 1 - 1 file changed, 1 deletion(-) diff --git a/extensions/build.gradle b/extensions/build.gradle index 7acd163eca..bfea309244 100644 --- a/extensions/build.gradle +++ b/extensions/build.gradle @@ -5,7 +5,6 @@ dependencies { compileOnly("org.scala-lang:scala-library:${scalaVersion}") compileOnly("ai.h2o:h2o-core:${h2oVersion}") - compileOnly("ai.h2o:h2o-algos:${h2oVersion}") compileOnly("javax.servlet:servlet-api:2.5") testImplementation("org.scala-lang:scala-library:${scalaVersion}") From 32db73d340a26741f117ea0376eca3ed613f4419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 15:17:57 +0100 Subject: [PATCH 07/37] Remove extra dependency and return test back --- .../ml/algos/OrdinalPredictionTestSuite.scala | 22 +++++++++++++++++++ scoring/build.gradle | 1 - 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala index 70dba7889d..88ee00258b 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/OrdinalPredictionTestSuite.scala @@ -86,4 +86,26 @@ class OrdinalPredictionTestSuite extends FunSuite with Matchers with SharedH2OTe assert(schema == expectedSchema) assert(schema == 
expectedSchemaByTransform) } + + private def assertMetrics[T](model: H2OMOJOModel): Unit = { + assertMetrics(model.getTrainingMetricsObject(), model.getTrainingMetrics()) + assertMetrics(model.getValidationMetricsObject(), model.getValidationMetrics()) + assert(model.getCrossValidationMetricsObject() == null) + assert(model.getCrossValidationMetrics() == Map()) + } + + private def assertMetrics(metricsObject: H2OMetrics, metrics: Map[String, Double]): Unit = { + metricsObject.isInstanceOf[H2OOrdinalGLMMetrics] should be(true) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) + } + + test("test ordinal glm metric objects") { + val algo = createAlgorithm() + val model = algo.fit(dataset) + assertMetrics[H2OOrdinalMetrics](model) + + model.write.overwrite().save("ml/build/glm_ordinal_model_metrics") + val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_ordinal_model_metrics") + assertMetrics[H2OOrdinalGLMMetrics](loadedModel) + } } diff --git a/scoring/build.gradle b/scoring/build.gradle index 9bf91fd0da..ddc8c277df 100644 --- a/scoring/build.gradle +++ b/scoring/build.gradle @@ -28,7 +28,6 @@ dependencies { // Required for model metrics calculation implementation("ai.h2o:h2o-core:${h2oVersion}") - implementation("ai.h2o:h2o-algos:${h2oVersion}") implementation(project(":sparkling-water-extensions")) compileOnly(project(':sparkling-water-api-generation')) From a777a7d8b027e2a5385747805a3f484ca6992e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 16:24:14 +0100 Subject: [PATCH 08/37] Remove extra tests --- .../ml/metrics/BinomialMetricsTestSuite.scala | 77 +++------------ .../metrics/MultinomialMetricsTestSuite.scala | 94 +++---------------- .../metrics/RegressionMetricsTestSuite.scala | 73 +++----------- 3 files changed, 35 insertions(+), 209 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index f27cf81eb9..e061827113 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -72,15 +72,13 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest trainingDataset: DataFrame, validationDataset: DataFrame, trainingMetricsTolerance: Double = 0.0, - validationMetricsTolerance: Double = 0.0, - skipExtraMetrics: Boolean = false): Unit = { + validationMetricsTolerance: Double = 0.0): Unit = { MetricsAssertions.assertEssentialMetrics( model, trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) if (trainingMetricsTolerance < Double.PositiveInfinity) { val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OBinomialMetrics] @@ -158,15 +156,14 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest } { - val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, Boolean)] = Seq( - (() => new H2ODeepLearning(), 0.00001, 0.000001, false), - (() => new H2OXGBoost(), 0.0001, 0.0001, false), - (() => new H2OGBM(), 0.0001, 0.0001, false), - (() => new H2OGLM(), 0.00001, 0.000001, false), - (() => new H2ODRF(), Double.PositiveInfinity, 0.0001, false)) - // TODO: investigate differences - (() => new H2ORuleFit(), Double.PositiveInfinity, 0.0005, true)) - - for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.000001), + (() => new H2OXGBoost(), 0.0001, 0.0001), + (() => new H2OGBM(), 0.0001, 0.0001), + (() => new H2OGLM(), 0.00001, 0.000001), + (() => new H2ODRF(), 
Double.PositiveInfinity, 0.0001)) + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { val algorithmName = algorithmGetter().getClass.getSimpleName test(s"test calculation of binomial $algorithmName metrics on arbitrary dataset") { @@ -183,8 +180,7 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } test(s"test calculation of binomial $algorithmName metrics with weightCol set on arbitrary dataset") { @@ -202,8 +198,7 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } } } @@ -227,52 +222,4 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest } } } - - { - // TODO: Investigate differences when data frames have more partitions - def gamTrainingDataset = trainingDataset.repartition(1) - def gamValidationDataset = validationDataset.repartition(1) - - test(s"test calculation of binomial H2OGAM metrics on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") - .setGamCols(Array("PSA")) - .setLabelCol("CAPSULE") - val model = algorithm.fit(gamTrainingDataset) - - assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - - // H2OGAM renames Gam cols when offset columns is set (PSA -> PSA_0_center__8) - ignore(s"test calculation of binomial H2OGAM metrics with offsetCol set on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", 
"GLEASON") - .setGamCols(Array("PSA")) - .setLabelCol("CAPSULE") - .setOffsetCol("ID") - val model = algorithm.fit(gamTrainingDataset) - - assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - - test(s"test calculation of binomial H2OGAM metrics with weightCol set on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") - .setGamCols(Array("PSA")) - .setLabelCol("CAPSULE") - .setWeightCol("ID") - val model = algorithm.fit(gamTrainingDataset) - - assertMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index ef0c47b2fd..ffaeb74b32 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -63,15 +63,13 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT trainingDataset: DataFrame, validationDataset: DataFrame, trainingMetricsTolerance: Double = 0.0, - validationMetricsTolerance: Double = 0.0, - skipExtraMetrics: Boolean = false): Unit = { + validationMetricsTolerance: Double = 0.0): Unit = { MetricsAssertions.assertEssentialMetrics( model, trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) if (trainingMetricsTolerance < Double.PositiveInfinity) { val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OMultinomialMetrics] @@ -151,15 +149,14 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT } { - val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, 
Boolean)] = Seq( - (() => new H2ODeepLearning(), 0.00001, 0.00000001, false), - (() => new H2OXGBoost(), 0.00001, 0.00000001, false), - (() => new H2OGBM(), 0.00001, 0.00000001, false), - (() => new H2OGLM(), 0.00001, 0.00000001, false), - (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001, false), - (() => new H2ORuleFit(), 0.0001, 0.00001, true)) - - for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.00000001), + (() => new H2OXGBoost(), 0.00001, 0.00000001), + (() => new H2OGBM(), 0.00001, 0.00000001), + (() => new H2OGLM(), 0.00001, 0.00000001), + (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001)) + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { val algorithmName = algorithmGetter().getClass.getSimpleName test(s"test calculation of multinomial $algorithmName metrics on arbitrary dataset") { @@ -178,8 +175,7 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } test(s"test calculation of multinomial $algorithmName metrics with weightCol set on arbitrary dataset") { @@ -199,8 +195,7 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } } } @@ -226,69 +221,4 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT } } } - { - // TODO: Investigate differences when data frames have more partitions - def gamTrainingDataset = trainingDataset.repartition(1) - def gamValidationDataset = validationDataset.repartition(1) - 
- test("test calculation of multinomial H2OGAM metrics on arbitrary dataset") { - val algo = new H2OGAM() - .setValidationDataFrame(gamValidationDataset) - .setSeed(1) - .setFeaturesCols("sepal_len", "sepal_wid", "petal_len") - .setGamCols(Array("petal_len")) - .setColumnsToCategorical("class") - .setAucType("MACRO_OVR") - .setLabelCol("class") - val model = algo.fit(gamTrainingDataset) - - assertMetrics( - model, - gamTrainingDataset, - gamValidationDataset, - trainingMetricsTolerance = 0.0001, - validationMetricsTolerance = 0.00000001) - } - - // H2OGAM renames Gam cols when offset columns is set (petal_len -> petal_len_0_center__8) - ignore("test calculation of multinomial H2OGAM metrics with offsetCol set on arbitrary dataset") { - val algo = new H2OGAM() - .setValidationDataFrame(gamValidationDataset) - .setSeed(1) - .setFeaturesCols("sepal_len", "sepal_wid", "petal_len") - .setGamCols(Array("petal_len")) - .setColumnsToCategorical("class") - .setAucType("MACRO_OVR") - .setLabelCol("class") - .setOffsetCol("ID") - val model = algo.fit(gamTrainingDataset) - - assertMetrics( - model, - gamTrainingDataset, - gamValidationDataset, - trainingMetricsTolerance = 0.0001, - validationMetricsTolerance = 0.00000001) - } - - test("test calculation of multinomial H2OGAM metrics with weightCol set on arbitrary dataset") { - val algo = new H2OGAM() - .setValidationDataFrame(gamValidationDataset) - .setSeed(1) - .setFeaturesCols("sepal_len", "sepal_wid", "petal_len") - .setGamCols(Array("petal_len")) - .setColumnsToCategorical("class") - .setAucType("MACRO_OVR") - .setLabelCol("class") - .setWeightCol("WEIGHT") - val model = algo.fit(gamTrainingDataset) - - assertMetrics( - model, - gamTrainingDataset, - gamValidationDataset, - trainingMetricsTolerance = 0.0001, - validationMetricsTolerance = 0.00000001) - } - } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala index b84d260626..b64b43f887 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala @@ -85,16 +85,14 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe } { - val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double, Boolean)] = Seq( - (() => new H2ODeepLearning(), 0.00001, 0.00000001, false), - (() => new H2OXGBoost(), 0.00001, 0.00000001, false), - (() => new H2OGBM(), 0.0001, 0.00000001, false), - (() => new H2OGLM(), 0.00001, 0.00000001, false), - (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001, false)) // ignore comparision on the training dataset - // H2O runtime produces additional GLM metrics - // TODO: investigate differences (() => new H2ORuleFit(), 0.001, 0.00000001, true)) - - for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance, skipExtraMetrics) <- algorithmsAndTolerances) { + val algorithmsAndTolerances: Seq[(() => H2OSupervisedAlgorithm[_], Double, Double)] = Seq( + (() => new H2ODeepLearning(), 0.00001, 0.00000001), + (() => new H2OXGBoost(), 0.00001, 0.00000001), + (() => new H2OGBM(), 0.0001, 0.00000001), + (() => new H2OGLM(), 0.00001, 0.00000001), + (() => new H2ODRF(), Double.PositiveInfinity, 0.00000001)) // ignore comparision on the training dataset + + for ((algorithmGetter, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { val algorithmName = algorithmGetter().getClass.getSimpleName test(s"test calculation of regression $algorithmName metrics on arbitrary dataset") { @@ -111,8 +109,7 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } test(s"test 
calculation of regression $algorithmName metrics with weight column set on arbitrary dataset ") { @@ -130,8 +127,7 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe trainingDataset, validationDataset, trainingMetricsTolerance, - validationMetricsTolerance, - skipExtraMetrics) + validationMetricsTolerance) } } } @@ -139,7 +135,7 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe val algorithmsAndTolerances: Seq[(H2OSupervisedAlgorithm[_], Double, Double)] = Seq( (new H2OXGBoost(), 0.00001, 0.00000001), (new H2OGBM(), 0.001, 0.00000001), - (new H2OGLM(), 0.00001, 0.00000001)) // H2ORuleFit and H2ODRF doesn't support offset column + (new H2OGLM(), 0.00001, 0.00000001)) for ((algorithm, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { val algorithmName = algorithm.getClass.getSimpleName @@ -161,51 +157,4 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe } } } - { - // TODO: investigate why GAM there are differences when more partitions are used - def gamTrainingDataset = trainingDataset.repartition(1) - def gamValidationDataset = validationDataset.repartition(1) - - test(s"test calculation of regression H2OGAM metrics on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setGamCols(Array(Array("PSA"))) - .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") - .setLabelCol("AGE") - val model = algorithm.fit(gamTrainingDataset) - - MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - - test(s"test calculation of regression H2OGAM metrics with weight column set on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setGamCols(Array(Array("PSA"))) - .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", 
"VOL", "GLEASON") - .setLabelCol("AGE") - .setWeightCol("ID") - val model = algorithm.fit(gamTrainingDataset) - - MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - - // H2OGAM renames Gam cols when offset columns is set (PSA -> PSA_0_center__8) - ignore(s"test calculation of regression H2OGAM metrics with offset column set on arbitrary dataset") { - val algorithm = new H2OGAM() - algorithm - .setValidationDataFrame(gamValidationDataset) - .setSeed(1L) - .setGamCols(Array(Array("PSA"))) - .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") - .setLabelCol("AGE") - .setOffsetCol("ID") - val model = algorithm.fit(gamTrainingDataset) - - MetricsAssertions.assertEssentialMetrics(model, gamTrainingDataset, gamValidationDataset, 0.00001, 0.00000001) - } - } } From 66f0451941cb3259701d45d2218f38b98f101c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 21 Mar 2022 19:36:36 +0100 Subject: [PATCH 09/37] Update tests --- .../ml/metrics/BinomialMetricsTestSuite.scala | 60 +++++++++++++++---- .../ml/metrics/MetricsAssertions.scala | 38 ++++++------ .../metrics/MultinomialMetricsTestSuite.scala | 59 ++++++++++++++---- .../metrics/NoRuntimeMetricsTestSuite.scala | 17 ++++-- .../metrics/RegressionMetricsTestSuite.scala | 37 ++++++++++-- 5 files changed, 156 insertions(+), 55 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index e061827113..882424d9aa 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -69,19 +69,18 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest private def assertMetrics( model: H2OMOJOModel, - trainingDataset: DataFrame, - validationDataset: DataFrame, + 
trainingMetricObject: H2OBinomialMetrics, + validationMetricObject: H2OBinomialMetrics, trainingMetricsTolerance: Double = 0.0, validationMetricsTolerance: Double = 0.0): Unit = { MetricsAssertions.assertEssentialMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) if (trainingMetricsTolerance < Double.PositiveInfinity) { - val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OBinomialMetrics] val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OBinomialMetrics] // Confusion matrix is not correctly calculated in H2O-3 runtime. @@ -103,7 +102,6 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest } if (validationMetricsTolerance < Double.PositiveInfinity) { - val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OBinomialMetrics] val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OBinomialMetrics] // Confusion matrix is not correctly calculated in H2O-3 runtime. 
@@ -173,12 +171,23 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest .set(algorithm.getParam("seed"), 1L) .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") .setLabelCol("CAPSULE") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("CAPSULE") + val trainingMetricObject = H2OBinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "CAPSULE") + val validationMetricObject = H2OBinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "CAPSULE") + assertMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + trainingMetricObject, trainingMetricsTolerance, validationMetricsTolerance) } @@ -191,12 +200,24 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") .setLabelCol("CAPSULE") .setWeightCol("WEIGHT") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("CAPSULE") + val trainingMetricObject = H2OBinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "CAPSULE", + weightColOption = Some("WEIGHT")) + val validationMetricObject = H2OBinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "CAPSULE", + weightColOption = Some("WEIGHT")) assertMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) } @@ -216,9 +237,26 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") .setLabelCol("CAPSULE") .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("CAPSULE") + val trainingMetricObject = H2OBinomialMetrics.calculate( + model.transform(trainingDataset), + 
domain, + labelCol = "CAPSULE", + offsetColOption = Some("ID")) + val validationMetricObject = H2OBinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "CAPSULE", + offsetColOption = Some("ID")) - assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance, validationMetricsTolerance) + assertMetrics( + model, + trainingMetricObject, + validationMetricObject, + trainingMetricsTolerance, + validationMetricsTolerance) } } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala index a9d2a901c0..5c94b84316 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MetricsAssertions.scala @@ -25,7 +25,8 @@ object MetricsAssertions extends Matchers { def assertMetricsObjectAgainstMetricsMap( metricsObject: H2OMetrics, metrics: Map[String, Double], - ignoredGetters: Set[String] = Set("getCustomMetricValue")): Unit = { + ignoredGetters: Set[String] = Set("getCustomMetricValue"), + tolerance: Double = 0.0): Unit = { for (getter <- metricsObject.getClass.getMethods if getter.getName.startsWith("get") if !ignoredGetters.contains("getCustomMetricValue") @@ -37,8 +38,10 @@ object MetricsAssertions extends Matchers { val metricValue = metrics.get(metricName).get if (metricValue.isNaN) { assert(value.asInstanceOf[Double].isNaN) + } else if (tolerance > 0.0) { + metricValue shouldBe (asInstanceOf[Double] +- tolerance) } else { - value shouldEqual metricValue + metricValue shouldBe value } } } @@ -69,30 +72,23 @@ object MetricsAssertions extends Matchers { def assertEssentialMetrics( model: H2OMOJOModel, - trainingDataset: DataFrame, - validationDataset: DataFrame, + trainingMetricsObject: H2OMetrics, + validationMetricsObject: H2OMetrics, trainingMetricsTolerance: Double = 0.0, - validationMetricsTolerance: Double = 0.0, - skipExtraMetrics: Boolean = 
false): Unit = { - val trainingMetrics = model.getMetrics(trainingDataset) - val trainingMetricsObject = model.getMetricsObject(trainingDataset) - val validationMetrics = model.getMetrics(validationDataset) - val validationMetricsObject = model.getMetricsObject(validationDataset) + validationMetricsTolerance: Double = 0.0): Unit = { val expectedTrainingMetrics = model.getTrainingMetrics() val expectedValidationMetrics = model.getValidationMetrics() + val ignoredGetters = Set("getCustomMetricValue", "getScoringTime") - MetricsAssertions.assertEqual( + MetricsAssertions.assertMetricsObjectAgainstMetricsMap( + trainingMetricsObject, expectedTrainingMetrics, - trainingMetrics, - tolerance = trainingMetricsTolerance, - skipExtraMetrics = skipExtraMetrics) - MetricsAssertions.assertEqual( + ignoredGetters, + trainingMetricsTolerance) + MetricsAssertions.assertMetricsObjectAgainstMetricsMap( + validationMetricsObject, expectedValidationMetrics, - validationMetrics, - tolerance = validationMetricsTolerance, - skipExtraMetrics = skipExtraMetrics) - val ignoredGetters = Set("getCustomMetricValue", "getScoringTime") - MetricsAssertions.assertMetricsObjectAgainstMetricsMap(trainingMetricsObject, trainingMetrics, ignoredGetters) - MetricsAssertions.assertMetricsObjectAgainstMetricsMap(validationMetricsObject, validationMetrics, ignoredGetters) + ignoredGetters, + validationMetricsTolerance) } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index ffaeb74b32..31cab44069 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -60,19 +60,18 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT private def assertMetrics( model: H2OMOJOModel, - trainingDataset: DataFrame, - validationDataset: DataFrame, 
+ trainingMetricObject: H2OMultinomialMetrics, + validationMetricObject: H2OMultinomialMetrics, trainingMetricsTolerance: Double = 0.0, validationMetricsTolerance: Double = 0.0): Unit = { MetricsAssertions.assertEssentialMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) if (trainingMetricsTolerance < Double.PositiveInfinity) { - val trainingMetricObject = model.getMetricsObject(trainingDataset).asInstanceOf[H2OMultinomialMetrics] val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OMultinomialMetrics] TestUtils.assertDataFramesAreEqual( trainingMetricObject.getMultinomialAUCTable(), @@ -95,7 +94,6 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT } if (validationMetricsTolerance < Double.PositiveInfinity) { - val validationMetricObject = model.getMetricsObject(validationDataset).asInstanceOf[H2OMultinomialMetrics] val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OMultinomialMetrics] TestUtils.assertDataFramesAreEqual( validationMetricObject.getMultinomialAUCTable(), @@ -168,12 +166,22 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT .setColumnsToCategorical("class") .set(algorithm.getParam("aucType"), "MACRO_OVR") .setLabelCol("class") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("class") + val trainingMetricObject = H2OMultinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "class") + val validationMetricObject = H2OMultinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "class") assertMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) } @@ -188,12 +196,24 @@ class MultinomialMetricsTestSuite extends FunSuite 
with Matchers with SharedH2OT .set(algorithm.getParam("aucType"), "MACRO_OVR") .setLabelCol("class") .setWeightCol("WEIGHT") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("class") + val trainingMetricObject = H2OMultinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "class", + weightColOption = Some("WEIGHT")) + val validationMetricObject = H2OMultinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "class", + weightColOption = Some("WEIGHT")) assertMetrics( model, - trainingDataset, - validationDataset, + trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) } @@ -215,9 +235,26 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT .set(algorithm.getParam("aucType"), "MACRO_OVR") .setLabelCol("class") .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("class") + val trainingMetricObject = H2OMultinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "class", + offsetColOption = Some("ID")) + val validationMetricObject = H2OMultinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "class", + offsetColOption = Some("ID")) - assertMetrics(model, trainingDataset, validationDataset, trainingMetricsTolerance, validationMetricsTolerance) + assertMetrics( + model, + trainingMetricObject, + validationMetricObject, + trainingMetricsTolerance, + validationMetricsTolerance) } } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala index 32ea2e46c6..77da2f8547 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala @@ -45,23 +45,28 @@ class 
NoRuntimeMetricsTestSuite extends FunSuite with Matchers with SparkTestCon val mojo = H2OMOJOModel.createFromMojo( this.getClass.getClassLoader.getResourceAsStream("binom_model_prostate.mojo"), "binom_model_prostate.mojo") - mojo.getMetrics(prostateDataFrame) shouldNot be(null) - mojo.getMetricsObject(prostateDataFrame) shouldNot be(null) + + val domain = mojo.getDomainValues()("capsule") + val metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataFrame), domain, labelCol = "capsule") + metrics shouldNot be(null) } test("Test calculation of metrics on saved regression model") { val mojo = H2OMOJOModel.createFromMojo( this.getClass.getClassLoader.getResourceAsStream("regre_model_prostate.mojo"), "regre_model_prostate.mojo") - mojo.getMetrics(prostateDataFrame) shouldNot be(null) - mojo.getMetricsObject(prostateDataFrame) shouldNot be(null) + + val metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataFrame), labelCol = "capsule") + metrics shouldNot be(null) } test("Test calculation of metrics on saved multinomial model") { val mojo = H2OMOJOModel.createFromMojo( this.getClass.getClassLoader.getResourceAsStream("multi_model_iris.mojo"), "multi_model_iris.mojo") - mojo.getMetrics(irisDataFrame) shouldNot be(null) - mojo.getMetricsObject(irisDataFrame) shouldNot be(null) + + val domain = mojo.getDomainValues()("capsule") + val metrics = H2OMultinomialMetrics.calculate(mojo.transform(prostateDataFrame), domain, labelCol = "capsule") + metrics shouldNot be(null) } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala index b64b43f887..de9c1fa3f2 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala @@ -102,12 +102,19 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe 
.set(algorithm.getParam("seed"), 1L) .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") .setLabelCol("AGE") + val model = algorithm.fit(trainingDataset) + val trainingMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(trainingDataset), + labelCol = "AGE") + val validationMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(validationDataset), + labelCol = "AGE") MetricsAssertions.assertEssentialMetrics( model, - trainingDataset, - validationDataset, + trainingMetrics, + validationMetrics, trainingMetricsTolerance, validationMetricsTolerance) } @@ -120,12 +127,21 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") .setLabelCol("AGE") .setWeightCol("ID") + val model = algorithm.fit(trainingDataset) + val trainingMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(trainingDataset), + labelCol = "AGE", + weightColOption = Some("ID")) + val validationMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(validationDataset), + labelCol = "AGE", + weightColOption = Some("ID")) MetricsAssertions.assertEssentialMetrics( model, - trainingDataset, - validationDataset, + trainingMetrics, + validationMetrics, trainingMetricsTolerance, validationMetricsTolerance) } @@ -146,12 +162,21 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe .setFeaturesCols("CAPSULE", "RACE", "DPROS", "DCAPS", "VOL", "GLEASON") .setLabelCol("AGE") .setOffsetCol("ID") + val model = algorithm.fit(trainingDataset) + val trainingMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(trainingDataset), + labelCol = "AGE", + offsetColOption = Some("ID")) + val validationMetrics = H2ORegressionMetrics.calculate( + dataFrame = model.transform(validationDataset), + labelCol = "AGE", + offsetColOption = Some("ID")) MetricsAssertions.assertEssentialMetrics( model, - 
trainingDataset, - validationDataset, + trainingMetrics, + validationMetrics, trainingMetricsTolerance, validationMetricsTolerance) } From 38e5ab478b38cffb6fa52373d43bc5d4f5a30edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 22 Mar 2022 17:30:46 +0100 Subject: [PATCH 10/37] Fix tests --- .../common/MetricsConfigurations.scala | 6 +- .../hex/MetricsCalculationTypeExtensions.java | 27 +-------- gradle.properties | 2 +- .../ml/metrics/H2OBinomialMetrics.scala | 9 ++- .../ml/metrics/H2OMultinomialMetrics.scala | 9 ++- .../ml/metrics/H2ORegressionMetrics.scala | 9 ++- .../ml/metrics/MetricCalculation.scala | 55 ++++++++++--------- 7 files changed, 57 insertions(+), 60 deletions(-) diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/MetricsConfigurations.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/MetricsConfigurations.scala index 3edeac9e80..37d95b1311 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/MetricsConfigurations.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/MetricsConfigurations.scala @@ -30,7 +30,7 @@ trait MetricsConfigurations { "The class makes available all metrics that shared across all algorithms, and ML problems." 
+ " (classification, regression, dimension reduction)."), ModelMetricsSubstitutionContext( - "H2OBinomialMetrics", + "H2OBinomialMetricsBase", classOf[ModelMetricsBinomialV3[_, _]], Seq("H2OCommonMetrics"), "The class makes available all metrics that shared across all algorithms supporting binomial classification."), @@ -40,7 +40,7 @@ trait MetricsConfigurations { Seq("H2OBinomialMetrics", "H2OGLMMetrics"), "The class makes available all binomial metrics supported by GLM algorithm."), ModelMetricsSubstitutionContext( - "H2ORegressionMetrics", + "H2ORegressionMetricsBase", classOf[ModelMetricsRegressionV3[_, _]], Seq("H2OCommonMetrics"), "The class makes available all metrics that shared across all algorithms supporting regression."), @@ -55,7 +55,7 @@ trait MetricsConfigurations { Seq("H2ORegressionMetrics"), "The class makes available all regression metrics supported by CoxPH algorithm."), ModelMetricsSubstitutionContext( - "H2OMultinomialMetrics", + "H2OMultinomialMetricsBase", classOf[ModelMetricsMultinomialV3[_, _]], Seq("H2OCommonMetrics"), "The class makes available all metrics that shared across all algorithms supporting multinomial classification."), diff --git a/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java index bd1b472a4b..54a58fa9e4 100644 --- a/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java +++ b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java @@ -1,19 +1,13 @@ package hex; -import hex.glm.IndependentGLMMetricBuilder; -import hex.glrm.ModelMetricsGLRM; -import hex.pca.ModelMetricsPCA; -import hex.tree.isofor.ModelMetricsAnomaly; import java.util.Arrays; import water.TypeMapExtension; -import water.api.ModelMetricsPCAV3; import water.api.schemas3.*; public class MetricsCalculationTypeExtensions implements TypeMapExtension { public static final String[] MODEL_BUILDER_CLASSES = { 
ModelMetrics.IndependentMetricBuilder.class.getName(), ModelMetricsSupervised.IndependentMetricBuilderSupervised.class.getName(), - ModelMetricsUnsupervised.IndependentMetricBuilderUnsupervised.class.getName(), ModelMetricsBinomial.IndependentMetricBuilderBinomial.class.getName(), AUC2.AUCBuilder.class.getName(), ModelMetricsRegression.IndependentMetricBuilderRegression.class.getName(), @@ -39,33 +33,14 @@ public class MetricsCalculationTypeExtensions implements TypeMapExtension { OlogitFunction.class.getName(), OloglogFunction.class.getName(), OprobitFunction.class.getName(), - ModelMetricsMultinomial.IndependentMetricBuilderMultinomial.class.getName(), - ModelMetricsOrdinal.IndependentMetricBuilderOrdinal.class.getName(), - ModelMetricsClustering.IndependentMetricBuilderClustering.class.getName(), - ModelMetricsHGLM.IndependentMetricBuilderHGLM.class.getName(), - ModelMetricsGLRM.IndependentGLRMModelMetricsBuilder.class.getName(), - ModelMetricsAnomaly.IndependentMetricBuilderAnomaly.class.getName(), - IndependentGLMMetricBuilder.class.getName(), - hex.glm.GLMModel.GLMWeightsFun.class.getName(), - ModelMetricsAutoEncoder.IndependentAutoEncoderMetricBuilder.class.getName(), - ModelMetricsPCA.IndependentPCAMetricBuilder.class.getName() + ModelMetricsMultinomial.IndependentMetricBuilderMultinomial.class.getName() }; public static final String[] SCHEMA_CLASSES = { ModelMetricsBaseV3.class.getName(), - ModelMetricsBinomialGLMV3.class.getName(), ModelMetricsBinomialV3.class.getName(), - ModelMetricsMultinomialGLMV3.class.getName(), ModelMetricsMultinomialV3.class.getName(), - ModelMetricsOrdinalGLMV3.class.getName(), - ModelMetricsOrdinalV3.class.getName(), - ModelMetricsRegressionGLMV3.class.getName(), - ModelMetricsRegressionCoxPHV3.class.getName(), ModelMetricsRegressionV3.class.getName(), - ModelMetricsAutoEncoderV3.class.getName(), - ModelMetricsPCAV3.class.getName(), - ModelMetricsHGLMV3.class.getName(), - ModelMetricsClusteringV3.class.getName(), 
ConfusionMatrixV3.class.getName(), TwoDimTableV3.class.getName(), TwoDimTableV3.ColumnSpecsBase.class.getName() diff --git a/gradle.properties b/gradle.properties index b58887a6a1..adc9eefdee 100644 --- a/gradle.properties +++ b/gradle.properties @@ -34,6 +34,6 @@ version=3.38.0.1-199-SNAPSHOT kubernetesSupportSinceSpark=2.4 databricksTestSinceSpark=2.4 spotlessModern=true -testH2OBranch=mn/PUBDEV-8373 +testH2OBranch=mn/PUBDEV-8373b makeBooklet=false testingBaseImage="harbor.h2o.ai/opsh2oai/h2o-3-hadoop-cdh-6.3:84" diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index ff61cf90d2..cdfadd8d6e 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -19,8 +19,15 @@ package ai.h2o.sparkling.ml.metrics import hex.ModelMetricsBinomial.IndependentMetricBuilderBinomial import hex.genmodel.utils.DistributionFamily +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame +@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting binomial classification.") +class H2OBinomialMetrics(override val uid: String) extends H2OBinomialMetricsBase(uid) { + + def this() = this(Identifiable.randomUID("H2OBinomialMetrics")) +} + object H2OBinomialMetrics extends MetricCalculation { def calculate( @@ -32,7 +39,7 @@ object H2OBinomialMetrics extends MetricCalculation { offsetColOption: Option[String] = None, distributionFamily: String = "AUTO"): H2OBinomialMetrics = { val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) - val getMetricBuilder = () => new IndependentMetricBuilderBinomial[_](domain, domainFamilyEnum) + val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, domainFamilyEnum) val gson = getMetricGson( getMetricBuilder, diff 
--git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index 0d7da7d268..e24a0fd04e 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -20,8 +20,15 @@ package ai.h2o.sparkling.ml.metrics import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson import hex.ModelMetricsMultinomial.IndependentMetricBuilderMultinomial import hex.MultinomialAucType +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame +@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting multinomial classification.") +class H2OMultinomialMetrics(override val uid: String) extends H2OMultinomialMetricsBase(uid) { + + def this() = this(Identifiable.randomUID("H2OBinomialMetrics")) +} + object H2OMultinomialMetrics { def calculate( dataFrame: DataFrame, @@ -40,7 +47,7 @@ object H2OMultinomialMetrics { case None => null } val getMetricBuilder = - () => new IndependentMetricBuilderMultinomial[_](nclasses, domain, aucTypeEnum, priorDistribution) + () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, priorDistribution) val gson = getMetricGson( getMetricBuilder, diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 98552725a6..e5fb79580b 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -21,8 +21,15 @@ import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson import hex.DistributionFactory import hex.ModelMetricsRegression.IndependentMetricBuilderRegression import 
hex.genmodel.utils.DistributionFamily +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame +@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting regression.") +class H2ORegressionMetrics(override val uid: String) extends H2ORegressionMetricsBase(uid) { + + def this() = this(Identifiable.randomUID("H2ORegressionMetrics")) +} + object H2ORegressionMetrics { def calculate( @@ -34,7 +41,7 @@ object H2ORegressionMetrics { distributionFamily: String = "AUTO"): H2ORegressionMetrics = { val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) val distribution= DistributionFactory.getDistribution(domainFamilyEnum) - val getMetricBuilder = () => new IndependentMetricBuilderRegression[_](distribution) + val getMetricBuilder = () => new IndependentMetricBuilderRegression(distribution) val gson = getMetricGson( getMetricBuilder, diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index ec02bdb4d3..6fcc16e7c3 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -26,7 +26,8 @@ import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.sql.DataFrame import water.api.{Schema, SchemaServer} import water.api.schemas3._ -import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, StringType} +import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, StringType, StructType} +import org.apache.spark.sql.functions.{col, lit} trait MetricCalculation { @@ -92,43 +93,43 @@ trait MetricCalculation { offsetColOption: Option[String], weightColOption: Option[String], domain: Array[String]): JsonObject = { - val flatDF = getFlattenDataFrame(dataFrame) - val predictionType = flatDF.schema.fields.find(f => f.name == 
predictionCol).get.dataType - val predictionColIndex = flatDF.schema.indexOf(predictionCol) - val actualType = flatDF.schema.fields.find(f => f.name == labelCol).get.dataType - val actualColIndex = flatDF.schema.indexOf(labelCol) + val basicDF = dataFrame.select(col(predictionCol) as "prediction", col(labelCol) as "label") + val withWeightColDF = weightColOption match { + case Some(weightCol) => basicDF.withColumn("weight", col(weightCol)) + case None => basicDF.withColumn("weight", lit(0.0d)) + } + val flatDF = weightColOption match { + case Some(offsetCol) => withWeightColDF.withColumn("offset", col(offsetCol)) + case None => withWeightColDF.withColumn("offset", lit(1.0d)) + } + val predictionType = flatDF.schema.fields(0).dataType + val actualType = flatDF.schema.fields(1).dataType val filledMetricsBuilder = flatDF.rdd .mapPartitions[IndependentMetricBuilder[_]] { rows => val metricBuilder = createMetricBuilder() while (rows.hasNext) { val row = rows.next() - val offset = offsetColOption match { - case Some(offsetCol) => row.getDouble(row.fieldIndex(offsetCol)) - case None => 0.0d - } - val weight = weightColOption match { - case Some(weightCol) => row.getDouble(row.fieldIndex(weightCol)) - case None => 1.0d - } val prediction = predictionType match { - case ArrayType(DoubleType, _) => row.getSeq[Double](predictionColIndex).toArray - case ArrayType(FloatType, _) => row.getSeq[Float](predictionColIndex).map(_.toDouble).toArray - case DoubleType => Array(row.getDouble(predictionColIndex)) - case FloatType => Array(row.getFloat(predictionColIndex).toDouble) - case v if ExposeUtils.isMLVectorUDT(v) => - val vector = row.getAs[ml.linalg.Vector](predictionColIndex) - vector.toDense.values - case _: mllib.linalg.VectorUDT => - val vector = row.getAs[mllib.linalg.Vector](predictionColIndex) - vector.toDense.values + case StructType(fields) if fields.forall(_.dataType == DoubleType) => + row.getStruct(0).toSeq.map(_.asInstanceOf[Double]).toArray + case StructType(fields) 
if fields.forall(_.dataType == FloatType) => + row.getStruct(0).toSeq.map(_.asInstanceOf[Float].toDouble).toArray + case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray + case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray + case DoubleType => Array(row.getDouble(0)) + case FloatType => Array(row.getFloat(0).toDouble) + case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values + case _: mllib.linalg.VectorUDT => row.getAs[mllib.linalg.Vector](0).toDense.values } val actualValue = actualType match { case StringType => - val label = row.getString(actualColIndex) + val label = row.getString(1) domain.indexOf(label).toDouble - case DoubleType => row.getDouble(actualColIndex) - case FloatType => row.getFloat(actualColIndex) + case DoubleType => row.getDouble(1) + case FloatType => row.getFloat(1) } + val weight = row.getDouble(2) + val offset = row.getDouble(3) metricBuilder.perRow(prediction, Array(actualValue), weight, offset) } Iterator.single(metricBuilder) From d118f747d73b46d941f34dc51d4c0e383cf64522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Fri, 25 Mar 2022 19:30:00 +0100 Subject: [PATCH 11/37] Fix tests --- .../ml/metrics/BinomialMetricsTestSuite.scala | 42 +++++++------ .../metrics/MultinomialMetricsTestSuite.scala | 13 ++-- .../metrics/NoRuntimeMetricsTestSuite.scala | 4 +- .../metrics/RegressionMetricsTestSuite.scala | 10 ++- r/src/R/ai/h2o/sparkling/H2OConf.R | 2 +- .../ml/metrics/H2OBinomialMetrics.scala | 37 +++++++++-- .../ml/metrics/H2OMultinomialMetrics.scala | 40 ++++++++++-- .../ml/metrics/H2ORegressionMetrics.scala | 48 +++++++------- .../ml/metrics/MetricCalculation.scala | 63 ++++++------------- 9 files changed, 146 insertions(+), 113 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index 882424d9aa..dbd1ce0916 100644 --- 
a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -21,7 +21,7 @@ import ai.h2o.sparkling.ml.algos._ import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} import org.apache.spark.sql.functions.rand -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -84,10 +84,14 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest val expectedTrainingMetricObject = model.getTrainingMetricsObject().asInstanceOf[H2OBinomialMetrics] // Confusion matrix is not correctly calculated in H2O-3 runtime. - val trainingConfusionMatrix = trainingMetricObject.getConfusionMatrix().count() - val expectedTrainingConfusionMatrix = expectedTrainingMetricObject.getConfusionMatrix().count() - trainingConfusionMatrix shouldBe >(0L) - trainingConfusionMatrix shouldEqual expectedTrainingConfusionMatrix + val trainingConfusionMatrix = trainingMetricObject.getConfusionMatrix() + val expectedTrainingConfusionMatrix = expectedTrainingMetricObject.getConfusionMatrix() + if (expectedTrainingConfusionMatrix == null) { + trainingConfusionMatrix should be(null) + } else { + trainingConfusionMatrix.count() shouldBe >(0L) + trainingConfusionMatrix.count() shouldEqual expectedTrainingConfusionMatrix.count() + } val trainingMetricScores = trainingMetricObject.getThresholdsAndMetricScores().count() val expectedTrainingMetricScores = expectedTrainingMetricObject.getThresholdsAndMetricScores().count() @@ -105,10 +109,14 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest val expectedValidationMetricObject = model.getValidationMetricsObject().asInstanceOf[H2OBinomialMetrics] // Confusion matrix is not 
correctly calculated in H2O-3 runtime. - val validationConfusionMatrix = validationMetricObject.getConfusionMatrix().count() - val expectedValidationConfusionMatrix = expectedValidationMetricObject.getConfusionMatrix().count() - validationConfusionMatrix shouldBe >(0L) - validationConfusionMatrix shouldEqual expectedValidationConfusionMatrix + val validationConfusionMatrix = validationMetricObject.getConfusionMatrix() + val expectedValidationConfusionMatrix = expectedValidationMetricObject.getConfusionMatrix() + if (expectedValidationConfusionMatrix == null) { + validationConfusionMatrix should be(null) + } else { + validationConfusionMatrix.count() shouldBe >(0L) + validationConfusionMatrix.count() shouldEqual expectedValidationConfusionMatrix.count() + } val validationMetricScores = validationMetricObject.getThresholdsAndMetricScores().count() val expectedValidationMetricScores = expectedValidationMetricObject.getThresholdsAndMetricScores().count() @@ -173,21 +181,17 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest .setLabelCol("CAPSULE") val model = algorithm.fit(trainingDataset) - val domain = model.getDomainValues()("CAPSULE") - val trainingMetricObject = H2OBinomialMetrics.calculate( - model.transform(trainingDataset), - domain, - labelCol = "CAPSULE") - val validationMetricObject = H2OBinomialMetrics.calculate( - model.transform(validationDataset), - domain, - labelCol = "CAPSULE") + val domain = model.getDomainValues()("CAPSULE") + val trainingMetricObject = + H2OBinomialMetrics.calculate(model.transform(trainingDataset), domain, labelCol = "CAPSULE") + val validationMetricObject = + H2OBinomialMetrics.calculate(model.transform(validationDataset), domain, labelCol = "CAPSULE") assertMetrics( model, trainingMetricObject, - trainingMetricObject, + validationMetricObject, trainingMetricsTolerance, validationMetricsTolerance) } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala 
b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index 31cab44069..ce9fd00c89 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -140,7 +140,6 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT .setLabelCol("class") val model = algo.fit(dataset) assertMetrics[H2OMultinomialGLMMetrics](model) - model.write.overwrite().save("ml/build/glm_multinomial_model_metrics") val loadedModel = H2OGLMMOJOModel.load("ml/build/glm_multinomial_model_metrics") assertMetrics[H2OMultinomialGLMMetrics](loadedModel) @@ -169,14 +168,10 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT val model = algorithm.fit(trainingDataset) val domain = model.getDomainValues()("class") - val trainingMetricObject = H2OMultinomialMetrics.calculate( - model.transform(trainingDataset), - domain, - labelCol = "class") - val validationMetricObject = H2OMultinomialMetrics.calculate( - model.transform(validationDataset), - domain, - labelCol = "class") + val trainingMetricObject = + H2OMultinomialMetrics.calculate(model.transform(trainingDataset), domain, labelCol = "class") + val validationMetricObject = + H2OMultinomialMetrics.calculate(model.transform(validationDataset), domain, labelCol = "class") assertMetrics( model, diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala index 77da2f8547..de8123d065 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/NoRuntimeMetricsTestSuite.scala @@ -65,8 +65,8 @@ class NoRuntimeMetricsTestSuite extends FunSuite with Matchers with SparkTestCon this.getClass.getClassLoader.getResourceAsStream("multi_model_iris.mojo"), 
"multi_model_iris.mojo") - val domain = mojo.getDomainValues()("capsule") - val metrics = H2OMultinomialMetrics.calculate(mojo.transform(prostateDataFrame), domain, labelCol = "capsule") + val domain = mojo.getDomainValues()("class") + val metrics = H2OMultinomialMetrics.calculate(mojo.transform(irisDataFrame), domain, labelCol = "class") metrics shouldNot be(null) } } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala index de9c1fa3f2..2992fa04f1 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/RegressionMetricsTestSuite.scala @@ -104,12 +104,10 @@ class RegressionMetricsTestSuite extends FunSuite with Matchers with SharedH2OTe .setLabelCol("AGE") val model = algorithm.fit(trainingDataset) - val trainingMetrics = H2ORegressionMetrics.calculate( - dataFrame = model.transform(trainingDataset), - labelCol = "AGE") - val validationMetrics = H2ORegressionMetrics.calculate( - dataFrame = model.transform(validationDataset), - labelCol = "AGE") + val trainingMetrics = + H2ORegressionMetrics.calculate(dataFrame = model.transform(trainingDataset), labelCol = "AGE") + val validationMetrics = + H2ORegressionMetrics.calculate(dataFrame = model.transform(validationDataset), labelCol = "AGE") MetricsAssertions.assertEssentialMetrics( model, diff --git a/r/src/R/ai/h2o/sparkling/H2OConf.R b/r/src/R/ai/h2o/sparkling/H2OConf.R index 669e981368..7ab9256ab8 100644 --- a/r/src/R/ai/h2o/sparkling/H2OConf.R +++ b/r/src/R/ai/h2o/sparkling/H2OConf.R @@ -1,4 +1,4 @@ -# + # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index cdfadd8d6e..6f96713cab 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -19,10 +19,15 @@ package ai.h2o.sparkling.ml.metrics import hex.ModelMetricsBinomial.IndependentMetricBuilderBinomial import hex.genmodel.utils.DistributionFamily +import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.col -@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting binomial classification.") +@MetricsDescription( + description = + "The class makes available all metrics that shared across all algorithms supporting binomial classification.") class H2OBinomialMetrics(override val uid: String) extends H2OBinomialMetricsBase(uid) { def this() = this(Identifiable.randomUID("H2OBinomialMetrics")) @@ -33,17 +38,18 @@ object H2OBinomialMetrics extends MetricCalculation { def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String = "detailed_prediction.probabilities", + predictionProbabilitiesCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None, distributionFamily: String = "AUTO"): H2OBinomialMetrics = { val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, domainFamilyEnum) + val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = getMetricGson( getMetricBuilder, - dataFrame, + 
castedLabelDF, predictionProbabilitiesCol, labelCol, offsetColOption, @@ -71,4 +77,27 @@ object H2OBinomialMetrics extends MetricCalculation { Option(offsetCol), distributionFamily) } + + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { + dataType match { + case StructType(fields) + if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + val predictionStructure = row.getStruct(0) + val prediction = predictionStructure.getString(0) + val index = domain.indexOf(prediction).toDouble + val probabilities = predictionStructure.getStruct(1) + + Array(index) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) + case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray + case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray + case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values + case _: mllib.linalg.VectorUDT => row.getAs[mllib.linalg.Vector](0).toDense.values + } + } + + override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = { + val label = row.getString(1) + domain.indexOf(label).toDouble + } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index e24a0fd04e..fe6db1db42 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -17,23 +17,27 @@ package ai.h2o.sparkling.ml.metrics -import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson import hex.ModelMetricsMultinomial.IndependentMetricBuilderMultinomial import hex.MultinomialAucType +import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable 
-import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType} -@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting multinomial classification.") +@MetricsDescription( + description = + "The class makes available all metrics that shared across all algorithms supporting multinomial classification.") class H2OMultinomialMetrics(override val uid: String) extends H2OMultinomialMetricsBase(uid) { def this() = this(Identifiable.randomUID("H2OBinomialMetrics")) } -object H2OMultinomialMetrics { +object H2OMultinomialMetrics extends MetricCalculation { def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String = "detailed_prediction.probabilities", + predictionProbabilitiesCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None, @@ -48,10 +52,11 @@ object H2OMultinomialMetrics { } val getMetricBuilder = () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, priorDistribution) + val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = getMetricGson( getMetricBuilder, - dataFrame, + castedLabelDF, predictionProbabilitiesCol, labelCol, offsetColOption, @@ -81,4 +86,27 @@ object H2OMultinomialMetrics { Option(priorDistribution), aucType) } + + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { + dataType match { + case StructType(fields) + if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + val predictionStructure = row.getStruct(0) + val prediction = 
predictionStructure.getString(0) + val index = domain.indexOf(prediction).toDouble + val probabilities = predictionStructure.getStruct(1) + + Array(index) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) + case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray + case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray + case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values + case _: mllib.linalg.VectorUDT => row.getAs[mllib.linalg.Vector](0).toDense.values + } + } + + override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = { + val label = row.getString(1) + domain.indexOf(label).toDouble + } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index e5fb79580b..50f585f7f5 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -17,20 +17,21 @@ package ai.h2o.sparkling.ml.metrics -import ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.getMetricGson import hex.DistributionFactory import hex.ModelMetricsRegression.IndependentMetricBuilderRegression import hex.genmodel.utils.DistributionFamily import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types._ -@MetricsDescription(description = "The class makes available all metrics that shared across all algorithms supporting regression.") +@MetricsDescription( + description = "The class makes available all metrics that shared across all algorithms supporting regression.") class H2ORegressionMetrics(override val uid: String) extends H2ORegressionMetricsBase(uid) { def this() = this(Identifiable.randomUID("H2ORegressionMetrics")) } -object H2ORegressionMetrics { +object 
H2ORegressionMetrics extends MetricCalculation { def calculate( dataFrame: DataFrame, @@ -40,17 +41,11 @@ object H2ORegressionMetrics { offsetColOption: Option[String] = None, distributionFamily: String = "AUTO"): H2ORegressionMetrics = { val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) - val distribution= DistributionFactory.getDistribution(domainFamilyEnum) - val getMetricBuilder = () => new IndependentMetricBuilderRegression(distribution) + val getMetricBuilder = + () => new IndependentMetricBuilderRegression(DistributionFactory.getDistribution(domainFamilyEnum)) - val gson = getMetricGson( - getMetricBuilder, - dataFrame, - predictionCol, - labelCol, - offsetColOption, - weightColOption, - null) + val gson = + getMetricGson(getMetricBuilder, dataFrame, predictionCol, labelCol, offsetColOption, weightColOption, null) val result = new H2ORegressionMetrics() result.setMetrics(gson, "H2ORegressionMetrics.calculate") result @@ -63,12 +58,23 @@ object H2ORegressionMetrics { weightCol: String, offsetCol: String, distributionFamily: String): H2ORegressionMetrics = { - calculate( - dataFrame, - predictionCol, - labelCol, - Option(weightCol), - Option(offsetCol), - distributionFamily) + calculate(dataFrame, predictionCol, labelCol, Option(weightCol), Option(offsetCol), distributionFamily) + } + + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { + dataType match { + case StructType(fields) if fields.head.dataType == DoubleType => Array(row.getStruct(0).getDouble(0)) + case DoubleType => Array(row.getDouble(0)) + case FloatType => Array(row.getFloat(0).toDouble) + } + } + + override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = dataType match { + case DoubleType => row.getDouble(1) + case FloatType => row.getFloat(1).toDouble + case LongType => row.getLong(1).toDouble + case IntegerType => row.getInt(1).toDouble + case ShortType => 
row.getShort(1).toDouble + case ByteType => row.getByte(1).toDouble } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 6fcc16e7c3..9e870b12a6 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -17,29 +17,17 @@ package ai.h2o.sparkling.ml.metrics -import ai.h2o.sparkling.ml.models.RowConverter -import ai.h2o.sparkling.ml.utils.{DatasetShape, SchemaUtils} import com.google.gson.{GsonBuilder, JsonObject} import hex._ import hex.ModelMetrics.IndependentMetricBuilder -import org.apache.spark.{ExposeUtils, ml, mllib} -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} import water.api.{Schema, SchemaServer} import water.api.schemas3._ -import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, StringType, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{col, lit} trait MetricCalculation { - private[sparkling] def getFlattenDataFrame(dataFrame: DataFrame): DataFrame = { - val flatDataFrame = DatasetShape.getDatasetShape(dataFrame.schema) match { - case DatasetShape.Flat => dataFrame - case DatasetShape.StructsOnly | DatasetShape.Nested => - SchemaUtils.appendFlattenedStructsToDataFrame(dataFrame, RowConverter.temporaryColumnPrefix) - } - flatDataFrame - } - private[sparkling] def validateDataFrameForMetricCalculation( flatDataFrame: DataFrame, labelCol: String, @@ -47,8 +35,7 @@ trait MetricCalculation { weightColOption: Option[String]): Unit = { if (labelCol != null && !flatDataFrame.columns.contains(labelCol)) { - throw new IllegalArgumentException( - s"DataFrame passed as a parameter does not contain label column '$labelCol'.") + throw new IllegalArgumentException(s"DataFrame passed as a parameter does not contain label column '$labelCol'.") } if 
(offsetColOption.isDefined) { @@ -85,7 +72,11 @@ trait MetricCalculation { schema } - private[sparkling] def getMetricGson( + protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] + + protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double + + protected def getMetricGson( createMetricBuilder: () => IndependentMetricBuilder[_], dataFrame: DataFrame, predictionCol: String, @@ -93,15 +84,13 @@ trait MetricCalculation { offsetColOption: Option[String], weightColOption: Option[String], domain: Array[String]): JsonObject = { - val basicDF = dataFrame.select(col(predictionCol) as "prediction", col(labelCol) as "label") - val withWeightColDF = weightColOption match { - case Some(weightCol) => basicDF.withColumn("weight", col(weightCol)) - case None => basicDF.withColumn("weight", lit(0.0d)) - } - val flatDF = weightColOption match { - case Some(offsetCol) => withWeightColDF.withColumn("offset", col(offsetCol)) - case None => withWeightColDF.withColumn("offset", lit(1.0d)) - } + val flatDF = dataFrame.select(col(predictionCol) as "prediction", col(labelCol) as "label", weightColOption match { + case Some(weightCol) => col(weightCol) cast DoubleType as "weight" + case None => lit(1.0d) as "weight" + }, offsetColOption match { + case Some(offsetCol) => col(offsetCol) cast DoubleType as "offset" + case None => lit(0.0d) as "offset" + }) val predictionType = flatDF.schema.fields(0).dataType val actualType = flatDF.schema.fields(1).dataType val filledMetricsBuilder = flatDF.rdd @@ -109,25 +98,8 @@ trait MetricCalculation { val metricBuilder = createMetricBuilder() while (rows.hasNext) { val row = rows.next() - val prediction = predictionType match { - case StructType(fields) if fields.forall(_.dataType == DoubleType) => - row.getStruct(0).toSeq.map(_.asInstanceOf[Double]).toArray - case StructType(fields) if fields.forall(_.dataType == FloatType) => - 
row.getStruct(0).toSeq.map(_.asInstanceOf[Float].toDouble).toArray - case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray - case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray - case DoubleType => Array(row.getDouble(0)) - case FloatType => Array(row.getFloat(0).toDouble) - case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values - case _: mllib.linalg.VectorUDT => row.getAs[mllib.linalg.Vector](0).toDense.values - } - val actualValue = actualType match { - case StringType => - val label = row.getString(1) - domain.indexOf(label).toDouble - case DoubleType => row.getDouble(1) - case FloatType => row.getFloat(1) - } + val prediction = getPredictionValues(predictionType, domain, row) + val actualValue: Double = getActualValue(actualType, domain, row) val weight = row.getDouble(2) val offset = row.getDouble(3) metricBuilder.perRow(prediction, Array(actualValue), weight, offset) @@ -136,6 +108,7 @@ trait MetricCalculation { } .reduce((f, s) => { f.reduce(s); f }) + filledMetricsBuilder.postGlobal() val metrics = filledMetricsBuilder.makeModelMetrics() val schema = metricsToSchema(metrics) val json = schema.toJsonString From 3f4cf5630c465c4b0fba0f29809af0420277ded9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 28 Mar 2022 19:58:47 +0200 Subject: [PATCH 12/37] Python wrappers --- .../ml/metrics/H2OBinomialMetrics.py | 43 +++++++++++++++++ .../ml/metrics/H2OMultinomialMetrics.py | 45 ++++++++++++++++++ .../ml/metrics/H2ORegressionMetrics.py | 41 ++++++++++++++++ .../ml/metrics/H2OBinomialMetrics.scala | 47 +++++++++++-------- .../ml/metrics/H2OMultinomialMetrics.scala | 40 +++++++++++----- .../ml/metrics/H2ORegressionMetrics.scala | 23 ++++++--- 6 files changed, 202 insertions(+), 37 deletions(-) create mode 100644 py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py create mode 100644 py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py create mode 
100644 py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py new file mode 100644 index 0000000000..399325921d --- /dev/null +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark.ml.param import * +from ai.h2o.sparkling.ml.metrics.H2OBinomialMetricsBase import H2OBinomialMetricsBase +from ai.h2o.sparkling.Initializer import Initializer +from pyspark.ml.util import _jvm + + +class H2OBinomialMetrics(H2OBinomialMetricsBase): + + @staticmethod + def calculate(dataFrame, + domain, + predictionCol = "detailed_prediction", + labelCol = "label", + weightCol = None, + offsetCol = None, + distributionFamily = "binomial"): + # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths + Initializer.load_sparkling_jar() + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calclate(dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol, + distributionFamily) + return H2OBinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py new file mode 100644 index 0000000000..f8ac9484fc --- /dev/null +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark.ml.param import * +from ai.h2o.sparkling.ml.metrics.H2OMultinomialMetricsBase import H2OMultinomialMetricsBase +from ai.h2o.sparkling.Initializer import Initializer +from pyspark.ml.util import _jvm + + +class H2OMultinomialMetrics(H2OMultinomialMetricsBase): + + @staticmethod + def calculate(dataFrame, + domain, + predictionCol = "detailed_prediction", + labelCol = "label", + weightCol = None, + offsetCol = None, + priorDistribution = None, + aucType = "AUTO"): + # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths + Initializer.load_sparkling_jar() + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calclate(dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol, + priorDistribution, + aucType) + return H2OMultinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py new file mode 100644 index 0000000000..0eb2dca203 --- /dev/null +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark.ml.param import * +from ai.h2o.sparkling.ml.metrics.H2ORegressionMetricsBase import H2ORegressionMetricsBase +from ai.h2o.sparkling.Initializer import Initializer +from pyspark.ml.util import _jvm + + +class H2ORegressionMetrics(H2ORegressionMetricsBase): + + @staticmethod + def calculate(dataFrame, + domain, + predictionCol = "detailed_prediction", + labelCol = "label", + weightCol = None, + offsetCol = None): + # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths + Initializer.load_sparkling_jar() + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calclate(dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol) + return H2ORegressionMetrics(javaMetrics) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 6f96713cab..5acd82814a 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -35,26 +35,42 @@ class H2OBinomialMetrics(override val uid: String) extends H2OBinomialMetricsBas object H2OBinomialMetrics extends MetricCalculation { + /** + * The method calculates binomial metrics on a provided data frame with predictions and actual values. + * + * @param dataFrame A data frame with predictions and actual values + * @param domain Array of classes representing negative and positive response. Negative class must at position 0 and + * positive at 1. + * @param predictionCol The name of prediction column. The prediction column must have the same type as + * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + * a array type or vector of doubles. First item is must be 0.0 or 1.0 representing + * negative or positive response. 
The other items must be probabilities to predict given probability + * classes. + * @param labelCol The name of label column that contains actual values. + * @param weightColOption The name of a weight column. + * @param offsetColOption The name of a offset column. + * @param distributionFamily The name of distribution family. Possible values: bernoulli, quasibinomial + * @return Calculated binomial metrics + */ def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String = "detailed_prediction", + predictionCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None, - distributionFamily: String = "AUTO"): H2OBinomialMetrics = { + distributionFamily: String = "bernoulli"): H2OBinomialMetrics = { + if (!Set("bernoulli", "quasibinomial").contains(distributionFamily)) { + throw new IllegalArgumentException( + s"Passed value of distributionFamily is $distributionFamily. " + + "Possible values are 'bernoulli', 'quasibinomial'") + } val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, domainFamilyEnum) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) - val gson = getMetricGson( - getMetricBuilder, - castedLabelDF, - predictionProbabilitiesCol, - labelCol, - offsetColOption, - weightColOption, - domain) + val gson = + getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, offsetColOption, weightColOption, domain) val result = new H2OBinomialMetrics() result.setMetrics(gson, "H2OBinomialMetrics.calculate") result @@ -63,19 +79,12 @@ object H2OBinomialMetrics extends MetricCalculation { def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String, + predictionCol: String, labelCol: String, weightCol: String, offsetCol: String, distributionFamily: String): Unit = { - calculate( - 
dataFrame, - domain, - predictionProbabilitiesCol, - labelCol, - Option(weightCol), - Option(offsetCol), - distributionFamily) + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol), distributionFamily) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index fe6db1db42..420427a54c 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -34,10 +34,34 @@ class H2OMultinomialMetrics(override val uid: String) extends H2OMultinomialMetr } object H2OMultinomialMetrics extends MetricCalculation { + + /** + * The method calculates multinomial metrics on a provided data frame with predictions and actual values. + * + * @param dataFrame A data frame with predictions and actual values + * @param domain Array of response classes. + * @param predictionCol The name of prediction column. The prediction column must have the same type as + * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + * a array type or vector of doubles. First item is must be 0.0, 1.0, 2.0 representing + * indexes of response classes. The other items must be probabilities to predict given probability + * classes. + * @param labelCol The name of label column that contains actual values. + * @param weightColOption The name of a weight column. + * @param offsetColOption The name of a offset column. + * @param priorDistributionOption Prior class probabilities needed for calculation of hit ratio table + * @param aucType Type of multinomial AUC/AUCPR calculation. 
Possible values: + * - AUTO, + * - NONE, + * - MACRO_OVR, + * - WEIGHTED_OVR, + * - MACRO_OVO, + * - WEIGHTED_OVO + * @return Calculated multinomial metrics + */ def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String = "detailed_prediction", + predictionCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None, @@ -54,14 +78,8 @@ object H2OMultinomialMetrics extends MetricCalculation { () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, priorDistribution) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) - val gson = getMetricGson( - getMetricBuilder, - castedLabelDF, - predictionProbabilitiesCol, - labelCol, - offsetColOption, - weightColOption, - domain) + val gson = + getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, offsetColOption, weightColOption, domain) val result = new H2OMultinomialMetrics() result.setMetrics(gson, "H2OMultinomialMetrics.calculate") result @@ -70,7 +88,7 @@ object H2OMultinomialMetrics extends MetricCalculation { def calculate( dataFrame: DataFrame, domain: Array[String], - predictionProbabilitiesCol: String, + predictionCol: String, labelCol: String, weightCol: String, offsetCol: String, @@ -79,7 +97,7 @@ object H2OMultinomialMetrics extends MetricCalculation { calculate( dataFrame, domain, - predictionProbabilitiesCol, + predictionCol, labelCol, Option(weightCol), Option(offsetCol), diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 50f585f7f5..48eeb58b48 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -33,16 +33,26 @@ class H2ORegressionMetrics(override val uid: String) extends 
H2ORegressionMetric object H2ORegressionMetrics extends MetricCalculation { + /** + * The method calculates regression metrics on a provided data frame with predictions and actual values. + * + * @param dataFrame A data frame with predictions and actual values + * @param predictionCol The name of prediction column. The prediction column must have the same type as + * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + * it must be of DoubleType or FloatType. + * @param labelCol The name of label column that contains actual values. + * @param weightColOption The name of a weight column. + * @param offsetColOption The name of a offset column. + * @return Calculated regression metrics + */ def calculate( dataFrame: DataFrame, predictionCol: String = "prediction", labelCol: String = "label", weightColOption: Option[String] = None, - offsetColOption: Option[String] = None, - distributionFamily: String = "AUTO"): H2ORegressionMetrics = { - val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) + offsetColOption: Option[String] = None): H2ORegressionMetrics = { val getMetricBuilder = - () => new IndependentMetricBuilderRegression(DistributionFactory.getDistribution(domainFamilyEnum)) + () => new IndependentMetricBuilderRegression(DistributionFactory.getDistribution(DistributionFamily.AUTO)) val gson = getMetricGson(getMetricBuilder, dataFrame, predictionCol, labelCol, offsetColOption, weightColOption, null) @@ -56,9 +66,8 @@ object H2ORegressionMetrics extends MetricCalculation { predictionCol: String, labelCol: String, weightCol: String, - offsetCol: String, - distributionFamily: String): H2ORegressionMetrics = { - calculate(dataFrame, predictionCol, labelCol, Option(weightCol), Option(offsetCol), distributionFamily) + offsetCol: String): H2ORegressionMetrics = { + calculate(dataFrame, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, 
domain: Array[String], row: Row): Array[Double] = { From 3f2dabd437116a988a54e9b7a3a4608987ec8e21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 29 Mar 2022 12:04:54 +0200 Subject: [PATCH 13/37] Fix multinomial tests --- .../metrics/MultinomialMetricsTestSuite.scala | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index ce9fd00c89..35e7ee9e30 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -169,9 +169,17 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT val model = algorithm.fit(trainingDataset) val domain = model.getDomainValues()("class") val trainingMetricObject = - H2OMultinomialMetrics.calculate(model.transform(trainingDataset), domain, labelCol = "class") + H2OMultinomialMetrics.calculate( + model.transform(trainingDataset), + domain, + labelCol = "class", + aucType = "MACRO_OVR") val validationMetricObject = - H2OMultinomialMetrics.calculate(model.transform(validationDataset), domain, labelCol = "class") + H2OMultinomialMetrics.calculate( + model.transform(validationDataset), + domain, + labelCol = "class", + aucType = "MACRO_OVR") assertMetrics( model, @@ -198,12 +206,14 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT model.transform(trainingDataset), domain, labelCol = "class", - weightColOption = Some("WEIGHT")) + weightColOption = Some("WEIGHT"), + aucType = "MACRO_OVR") val validationMetricObject = H2OMultinomialMetrics.calculate( model.transform(validationDataset), domain, labelCol = "class", - weightColOption = Some("WEIGHT")) + weightColOption = Some("WEIGHT"), + aucType = "MACRO_OVR") assertMetrics( model, @@ -237,12 +247,14 
@@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT model.transform(trainingDataset), domain, labelCol = "class", - offsetColOption = Some("ID")) + offsetColOption = Some("ID"), + aucType = "MACRO_OVR") val validationMetricObject = H2OMultinomialMetrics.calculate( model.transform(validationDataset), domain, labelCol = "class", - offsetColOption = Some("ID")) + offsetColOption = Some("ID"), + aucType = "MACRO_OVR") assertMetrics( model, From ef0670c31b4cbf69bcb8d9de5ea4ac6d784393b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 29 Mar 2022 12:09:01 +0200 Subject: [PATCH 14/37] Remove prior distribution --- .../h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py | 2 -- .../sparkling/ml/metrics/H2OMultinomialMetrics.scala | 10 +--------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py index f8ac9484fc..64340162d1 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -30,7 +30,6 @@ def calculate(dataFrame, labelCol = "label", weightCol = None, offsetCol = None, - priorDistribution = None, aucType = "AUTO"): # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() @@ -40,6 +39,5 @@ def calculate(dataFrame, labelCol, weightCol, offsetCol, - priorDistribution, aucType) return H2OMultinomialMetrics(javaMetrics) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index 420427a54c..3b1805ca1d 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ 
-48,7 +48,6 @@ object H2OMultinomialMetrics extends MetricCalculation { * @param labelCol The name of label column that contains actual values. * @param weightColOption The name of a weight column. * @param offsetColOption The name of a offset column. - * @param priorDistributionOption Prior class probabilities needed for calculation of hit ratio table * @param aucType Type of multinomial AUC/AUCPR calculation. Possible values: * - AUTO, * - NONE, @@ -65,17 +64,12 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None, - priorDistributionOption: Option[Array[Double]] = None, aucType: String = "AUTO"): H2OMultinomialMetrics = { val aucTypeEnum = MultinomialAucType.valueOf(aucType) val nclasses = domain.length - val priorDistribution = priorDistributionOption match { - case Some(x) => x - case None => null - } val getMetricBuilder = - () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, priorDistribution) + () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, null) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = @@ -92,7 +86,6 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol: String, weightCol: String, offsetCol: String, - priorDistribution: Array[Double], aucType: String): H2OMultinomialMetrics = { calculate( dataFrame, @@ -101,7 +94,6 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol, Option(weightCol), Option(offsetCol), - Option(priorDistribution), aucType) } From 6a4a15c0afedfa1e413c2561812140decd743ee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 29 Mar 2022 12:41:15 +0200 Subject: [PATCH 15/37] remove distribution option --- .../sparkling/ml/metrics/H2OBinomialMetrics.py | 6 ++---- .../ml/metrics/H2OBinomialMetrics.scala | 17 ++++------------- 2 files changed, 6 insertions(+), 17 
deletions(-) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py index 399325921d..f3212d30e6 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -29,8 +29,7 @@ def calculate(dataFrame, predictionCol = "detailed_prediction", labelCol = "label", weightCol = None, - offsetCol = None, - distributionFamily = "binomial"): + offsetCol = None): # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calclate(dataFrame, @@ -38,6 +37,5 @@ def calculate(dataFrame, predictionCol, labelCol, weightCol, - offsetCol, - distributionFamily) + offsetCol) return H2OBinomialMetrics(javaMetrics) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 5acd82814a..7cd75849e9 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -49,7 +49,6 @@ object H2OBinomialMetrics extends MetricCalculation { * @param labelCol The name of label column that contains actual values. * @param weightColOption The name of a weight column. * @param offsetColOption The name of a offset column. - * @param distributionFamily The name of distribution family. 
Possible values: bernoulli, quasibinomial * @return Calculated binomial metrics */ def calculate( @@ -58,15 +57,8 @@ object H2OBinomialMetrics extends MetricCalculation { predictionCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, - offsetColOption: Option[String] = None, - distributionFamily: String = "bernoulli"): H2OBinomialMetrics = { - if (!Set("bernoulli", "quasibinomial").contains(distributionFamily)) { - throw new IllegalArgumentException( - s"Passed value of distributionFamily is $distributionFamily. " + - "Possible values are 'bernoulli', 'quasibinomial'") - } - val domainFamilyEnum = DistributionFamily.valueOf(distributionFamily) - val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, domainFamilyEnum) + offsetColOption: Option[String] = None): H2OBinomialMetrics = { + val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, DistributionFamily.bernoulli) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = @@ -82,9 +74,8 @@ object H2OBinomialMetrics extends MetricCalculation { predictionCol: String, labelCol: String, weightCol: String, - offsetCol: String, - distributionFamily: String): Unit = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol), distributionFamily) + offsetCol: String): Unit = { + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { From 624a627f29c7c5c3a8df387e4331bb8524de9d11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 29 Mar 2022 14:19:24 +0200 Subject: [PATCH 16/37] spotless Apply --- .../h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git 
a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index 3b1805ca1d..c7169a4811 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -87,14 +87,7 @@ object H2OMultinomialMetrics extends MetricCalculation { weightCol: String, offsetCol: String, aucType: String): H2OMultinomialMetrics = { - calculate( - dataFrame, - domain, - predictionCol, - labelCol, - Option(weightCol), - Option(offsetCol), - aucType) + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol), aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { From cefff34cfa732a49a00547187365df9ed1329fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 29 Mar 2022 17:08:49 +0200 Subject: [PATCH 17/37] dataType checks --- .../ml/metrics/H2OBinomialMetrics.scala | 29 ++++++++++++++ .../ml/metrics/H2OMultinomialMetrics.scala | 30 +++++++++++++- .../ml/metrics/H2ORegressionMetrics.scala | 39 +++++++++++++++---- .../ml/metrics/MetricCalculation.scala | 26 ++++++++++--- 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 7cd75849e9..ca416cc2d2 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -58,6 +58,7 @@ object H2OBinomialMetrics extends MetricCalculation { labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None): H2OBinomialMetrics = { + validateDataFrameForMetricCalculation(dataFrame, predictionCol, 
labelCol, offsetColOption, weightColOption) val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, DistributionFamily.bernoulli) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) @@ -100,4 +101,32 @@ object H2OBinomialMetrics extends MetricCalculation { val label = row.getString(1) domain.indexOf(label).toDouble } + + override protected def validateDataFrameForMetricCalculation( + dataFrame: DataFrame, + predictionCol: String, + labelCol: String, + offsetColOption: Option[String], + weightColOption: Option[String]): Unit = { + super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType + val isPredictionTypeValid = predictionType match { + case StructType(fields) + if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + true + case ArrayType(DoubleType, _) => true + case ArrayType(FloatType, _) => true + case v if ExposeUtils.isMLVectorUDT(v) => true + case _: mllib.linalg.VectorUDT => true + case _ => false + } + if (!isPredictionTypeValid) { + throw new IllegalArgumentException(s"The type of the prediction column '$predictionCol' is not valid. " + + "The prediction column must have the same type as a detailed_prediction column coming from the transform " + + "method of H2OMOJOModel descendant or a array type or vector of doubles. First item is must be 0.0 or 1.0" + + "representing negative or positive response. 
The other items must be probabilities to predict given probability" + + "classes.") + } + } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index c7169a4811..e478bacde4 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -65,7 +65,7 @@ object H2OMultinomialMetrics extends MetricCalculation { weightColOption: Option[String] = None, offsetColOption: Option[String] = None, aucType: String = "AUTO"): H2OMultinomialMetrics = { - + validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) val aucTypeEnum = MultinomialAucType.valueOf(aucType) val nclasses = domain.length val getMetricBuilder = @@ -112,4 +112,32 @@ object H2OMultinomialMetrics extends MetricCalculation { val label = row.getString(1) domain.indexOf(label).toDouble } + + override protected def validateDataFrameForMetricCalculation( + dataFrame: DataFrame, + predictionCol: String, + labelCol: String, + offsetColOption: Option[String], + weightColOption: Option[String]): Unit = { + super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType + val isPredictionTypeValid = predictionType match { + case StructType(fields) + if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + true + case ArrayType(DoubleType, _) => true + case ArrayType(FloatType, _) => true + case v if ExposeUtils.isMLVectorUDT(v) => true + case _: mllib.linalg.VectorUDT => true + case _ => false + } + if (!isPredictionTypeValid) { + throw new IllegalArgumentException(s"The type of 
the prediction column '$predictionCol' is not valid. " + + "The prediction column must have the same type as a detailed_prediction column coming from the transform " + + "method of H2OMOJOModel descendant or a array type or vector of doubles. First item is must be 0.0, 1.0, 2.0 " + + "representing indexes of response classes. The other items must be probabilities to predict given " + + "probability classes.") + } + } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 48eeb58b48..fc476a64b2 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -21,6 +21,7 @@ import hex.DistributionFactory import hex.ModelMetricsRegression.IndependentMetricBuilderRegression import hex.genmodel.utils.DistributionFamily import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.functions.col import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types._ @@ -51,11 +52,12 @@ object H2ORegressionMetrics extends MetricCalculation { labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None): H2ORegressionMetrics = { + validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) val getMetricBuilder = () => new IndependentMetricBuilderRegression(DistributionFactory.getDistribution(DistributionFamily.AUTO)) - + val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast DoubleType) val gson = - getMetricGson(getMetricBuilder, dataFrame, predictionCol, labelCol, offsetColOption, weightColOption, null) + getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, offsetColOption, weightColOption, null) val result = new H2ORegressionMetrics() result.setMetrics(gson, "H2ORegressionMetrics.calculate") 
result @@ -80,10 +82,33 @@ object H2ORegressionMetrics extends MetricCalculation { override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = dataType match { case DoubleType => row.getDouble(1) - case FloatType => row.getFloat(1).toDouble - case LongType => row.getLong(1).toDouble - case IntegerType => row.getInt(1).toDouble - case ShortType => row.getShort(1).toDouble - case ByteType => row.getByte(1).toDouble + } + + override protected def validateDataFrameForMetricCalculation( + dataFrame: DataFrame, + predictionCol: String, + labelCol: String, + offsetColOption: Option[String], + weightColOption: Option[String]): Unit = { + super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + + val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType + val isPredictionTypeValid = predictionType match { + case StructType(fields) if fields.head.dataType == DoubleType => true + case DoubleType => true + case FloatType => true + case _ => false + } + if (!isPredictionTypeValid) { + throw new IllegalArgumentException( + s"The type of the prediction column '$predictionCol' is not valid. 
" + + "The prediction column must have the same type as a detailed_prediction column coming from the transform " + + "method of H2OMOJOModel descendant or it must be of DoubleType or FloatType.") + } + + val labelType = dataFrame.schema.fields.find(_.name == labelCol).get.dataType + if (!labelType.isInstanceOf[NumericType]) { + throw new IllegalArgumentException(s"The label column '$labelCol' must be a numeric type.") + } } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 9e870b12a6..dd15759c18 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -28,34 +28,48 @@ import org.apache.spark.sql.functions.{col, lit} trait MetricCalculation { - private[sparkling] def validateDataFrameForMetricCalculation( - flatDataFrame: DataFrame, + protected def validateDataFrameForMetricCalculation( + dataFrame: DataFrame, + predictionCol: String, labelCol: String, offsetColOption: Option[String], weightColOption: Option[String]): Unit = { - if (labelCol != null && !flatDataFrame.columns.contains(labelCol)) { + if (predictionCol != null && !dataFrame.columns.contains(predictionCol)) { + throw new IllegalArgumentException( + s"DataFrame passed as a parameter does not contain prediction column '$predictionCol'.") + } + + if (labelCol != null && !dataFrame.columns.contains(labelCol)) { throw new IllegalArgumentException(s"DataFrame passed as a parameter does not contain label column '$labelCol'.") } if (offsetColOption.isDefined) { val offsetCol = offsetColOption.get - if (!flatDataFrame.columns.contains(offsetCol)) { + if (!dataFrame.columns.contains(offsetCol)) { throw new IllegalArgumentException( s"DataFrame passed as a parameter does not contain offset column '$offsetCol'.") } + val offsetType = dataFrame.schema.fields.find(_.name == 
offsetCol).get.dataType + if (!offsetType.isInstanceOf[NumericType]) { + throw new IllegalArgumentException(s"The offset column '$offsetCol' must be a numeric type.") + } } if (weightColOption.isDefined) { val weightCol = weightColOption.get - if (!flatDataFrame.columns.contains(weightCol)) { + if (!dataFrame.columns.contains(weightCol)) { throw new IllegalArgumentException( s"DataFrame passed as a parameter does not contain weight column '$weightCol'.") } + val weightType = dataFrame.schema.fields.find(_.name == weightCol).get.dataType + if (!weightType.isInstanceOf[NumericType]) { + throw new IllegalArgumentException(s"The weight column '$weightType' must be a numeric type.") + } } } - private[sparkling] def metricsToSchema(metrics: ModelMetrics): Schema[_, _] = { + private def metricsToSchema(metrics: ModelMetrics): Schema[_, _] = { val schemas = MetricsCalculationTypeExtensions.SCHEMA_CLASSES.map(c => Class.forName(c).getConstructor().newInstance().asInstanceOf[Schema[Nothing, Nothing]]) From 75d1274e22bc66035635f62ac7c517611399190e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 30 Mar 2022 12:27:42 +0200 Subject: [PATCH 18/37] revert test changes in python --- py/tests/unit/with_runtime_sparkling/test_mojo.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/py/tests/unit/with_runtime_sparkling/test_mojo.py b/py/tests/unit/with_runtime_sparkling/test_mojo.py index a2fc18bef6..c9f0e2eaae 100644 --- a/py/tests/unit/with_runtime_sparkling/test_mojo.py +++ b/py/tests/unit/with_runtime_sparkling/test_mojo.py @@ -255,23 +255,9 @@ def compareMetricValues(metricsObject, metricsMap): assert metricsObject.getThresholdsAndMetricScores().count() > 0 assert len(metricsObject.getThresholdsAndMetricScores().columns) > 0 - def compareCalculatedMetricValues(metricsObject, metricsMap): - for metric in metricsMap: - if metric != "ScoringTime": - metricValue = metricsMap[metric] - objectValue = getattr(metricsObject, "get" + metric)() - 
assert(metricValue == objectValue) - assert metricsObject.getConfusionMatrix().count() > 0 - assert len(metricsObject.getConfusionMatrix().columns) > 0 - assert metricsObject.getMaxCriteriaAndMetricScores().count() > 0 - assert len(metricsObject.getMaxCriteriaAndMetricScores().columns) > 0 - assert metricsObject.getThresholdsAndMetricScores().count() > 0 - assert len(metricsObject.getThresholdsAndMetricScores().columns) > 0 - compareMetricValues(model.getTrainingMetricsObject(), model.getTrainingMetrics()) compareMetricValues(model.getCrossValidationMetricsObject(), model.getCrossValidationMetrics()) compareMetricValues(model.getCurrentMetricsObject(), model.getCurrentMetrics()) - compareCalculatedMetricValues(model.getMetricsObject(prostateDataset), model.getMetrics(prostateDataset)) assert model.getValidationMetricsObject() is None assert model.getValidationMetrics() == {} From e6887bc364f7ab1dfaf1c0e5894960583cd0121c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 30 Mar 2022 12:36:08 +0200 Subject: [PATCH 19/37] revert change in R --- r/src/R/ai/h2o/sparkling/H2OConf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/R/ai/h2o/sparkling/H2OConf.R b/r/src/R/ai/h2o/sparkling/H2OConf.R index 7ab9256ab8..669e981368 100644 --- a/r/src/R/ai/h2o/sparkling/H2OConf.R +++ b/r/src/R/ai/h2o/sparkling/H2OConf.R @@ -1,4 +1,4 @@ - # +# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
From 1708d9bf4073df0530de13d5f31a751ceb821824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 30 Mar 2022 15:19:55 +0200 Subject: [PATCH 20/37] Add R classes --- .../python/MetricsFactoryTemplate.scala | 6 +++++- .../generation/r/MetricsFactoryTemplate.scala | 6 +++++- .../sparkling/ml/metrics/H2OBinomialMetrics.R | 21 +++++++++++++++++++ .../ml/metrics/H2OMultinomialMetrics.R | 21 +++++++++++++++++++ .../ml/metrics/H2ORegressionMetrics.R | 21 +++++++++++++++++++ 5 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R create mode 100644 r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R create mode 100644 r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala index 524b634825..bf0d44a00b 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala @@ -49,7 +49,11 @@ object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => private def generatePatternMatchingCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { metricSubstitutionContexts .map { metricSubstitutionContext => - val metricsObjectName = metricSubstitutionContext.entityName + val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { + metricSubstitutionContext.entityName.substring(metricSubstitutionContext.entityName.length - 4) + } else { + metricSubstitutionContext.entityName + } s""" elif javaObject.getClass().getSimpleName() == "$metricsObjectName": | return $metricsObjectName(javaObject)""".stripMargin } diff --git 
a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala index bfe6d56b4f..8e6117b932 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala @@ -58,7 +58,11 @@ object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => private def generateCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { metricSubstitutionContexts .map { metricSubstitutionContext => - val metricsObjectName = metricSubstitutionContext.entityName + val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { + metricSubstitutionContext.entityName.substring(metricSubstitutionContext.entityName.length - 4) + } else { + metricSubstitutionContext.entityName + } s""" } else if (invoke(invoke(javaObject, "getClass"), "getSimpleName") == "$metricsObjectName") { | rsparkling.$metricsObjectName(javaObject)""".stripMargin } diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R new file mode 100644 index 0000000000..3f610d68fd --- /dev/null +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +source(file.path("R", "H2OBinomialMetricsBase.R")) + +#' @export rsparkling.H2OBinomialMetricsBase +rsparkling.H2OBinomialMetrics <- setRefClass("rsparkling.H2OBinomialMetrics", contains = ("rsparkling.H2OBinomialMetricsBase")) diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R new file mode 100644 index 0000000000..b869282eee --- /dev/null +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +source(file.path("R", "H2OMultinomialMetricsBase.R")) + +#' @export rsparkling.H2OMultinomialMetricsBase +rsparkling.H2OMultinomialMetrics <- setRefClass("rsparkling.H2OMultinomialMetrics", contains = ("rsparkling.H2OMultinomialMetricsBase")) diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R new file mode 100644 index 0000000000..72fe31d420 --- /dev/null +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +source(file.path("R", "H2ORegressionMetricsBase.R")) + +#' @export rsparkling.H2ORegressionMetricsBase +rsparkling.H2ORegressionMetrics <- setRefClass("rsparkling.H2ORegressionMetrics", contains = ("rsparkling.H2ORegressionMetricsBase")) From 5240eba53b6effcf5052de50a500d4b766a0da0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 30 Mar 2022 17:48:34 +0200 Subject: [PATCH 21/37] Remove offset column from multinomial metrics --- .../metrics/MultinomialMetricsTestSuite.scala | 41 ------------------- .../ml/metrics/H2OBinomialMetrics.py | 2 +- .../ml/metrics/H2OMultinomialMetrics.py | 14 +++---- .../ml/metrics/H2ORegressionMetrics.py | 12 +++--- .../ml/metrics/H2OMultinomialMetrics.scala | 9 ++-- 5 files changed, 16 insertions(+), 62 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index 35e7ee9e30..02684ee4fe 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -224,45 +224,4 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT } } } - { - val algorithmsAndTolerances: Seq[(H2OSupervisedAlgorithm[_], Double, Double)] = - Seq((new H2OXGBoost(), 0.00001, 0.00000001), (new H2OGLM(), 0.00001, 0.00000001)) - - for ((algorithm, trainingMetricsTolerance, validationMetricsTolerance) <- algorithmsAndTolerances) { - val algorithmName = algorithm.getClass.getSimpleName - - test(s"test calculation of multinomial $algorithmName metrics with offsetCol set on arbitrary dataset") { - algorithm - .setValidationDataFrame(validationDataset) - .set(algorithm.getParam("seed"), 1L) - .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") - .setColumnsToCategorical("class") - .set(algorithm.getParam("aucType"), "MACRO_OVR") - .setLabelCol("class") - 
.setOffsetCol("ID") - - val model = algorithm.fit(trainingDataset) - val domain = model.getDomainValues()("class") - val trainingMetricObject = H2OMultinomialMetrics.calculate( - model.transform(trainingDataset), - domain, - labelCol = "class", - offsetColOption = Some("ID"), - aucType = "MACRO_OVR") - val validationMetricObject = H2OMultinomialMetrics.calculate( - model.transform(validationDataset), - domain, - labelCol = "class", - offsetColOption = Some("ID"), - aucType = "MACRO_OVR") - - assertMetrics( - model, - trainingMetricObject, - validationMetricObject, - trainingMetricsTolerance, - validationMetricsTolerance) - } - } - } } diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py index f3212d30e6..d7ade2d008 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -32,7 +32,7 @@ def calculate(dataFrame, offsetCol = None): # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calclate(dataFrame, + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculate(dataFrame, domain, predictionCol, labelCol, diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py index 64340162d1..e7432ed5b3 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -29,15 +29,13 @@ def calculate(dataFrame, predictionCol = "detailed_prediction", labelCol = "label", weightCol = None, - offsetCol = None, aucType = "AUTO"): # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths 
Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calclate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol, - aucType) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculate(dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + aucType) return H2OMultinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py index 0eb2dca203..2daa054f86 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py @@ -32,10 +32,10 @@ def calculate(dataFrame, offsetCol = None): # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calclate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculate(dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol) return H2ORegressionMetrics(javaMetrics) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index e478bacde4..a04f410119 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -47,7 +47,6 @@ object H2OMultinomialMetrics extends MetricCalculation { * classes. * @param labelCol The name of label column that contains actual values. * @param weightColOption The name of a weight column. - * @param offsetColOption The name of a offset column. * @param aucType Type of multinomial AUC/AUCPR calculation. 
Possible values: * - AUTO, * - NONE, @@ -63,9 +62,8 @@ object H2OMultinomialMetrics extends MetricCalculation { predictionCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, - offsetColOption: Option[String] = None, aucType: String = "AUTO"): H2OMultinomialMetrics = { - validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, None, weightColOption) val aucTypeEnum = MultinomialAucType.valueOf(aucType) val nclasses = domain.length val getMetricBuilder = @@ -73,7 +71,7 @@ object H2OMultinomialMetrics extends MetricCalculation { val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = - getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, offsetColOption, weightColOption, domain) + getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, None, weightColOption, domain) val result = new H2OMultinomialMetrics() result.setMetrics(gson, "H2OMultinomialMetrics.calculate") result @@ -85,9 +83,8 @@ object H2OMultinomialMetrics extends MetricCalculation { predictionCol: String, labelCol: String, weightCol: String, - offsetCol: String, aucType: String): H2OMultinomialMetrics = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol), aucType) + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { From 0dfc38e84999180883f635c2e1cdd0919aacb9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 30 Mar 2022 18:28:52 +0200 Subject: [PATCH 22/37] Fix metric factory generatino --- .../api/generation/python/MetricsFactoryTemplate.scala | 2 +- .../h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala index bf0d44a00b..25f8f2c653 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala @@ -50,7 +50,7 @@ object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => metricSubstitutionContexts .map { metricSubstitutionContext => val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { - metricSubstitutionContext.entityName.substring(metricSubstitutionContext.entityName.length - 4) + metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) } else { metricSubstitutionContext.entityName } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala index 8e6117b932..d3c8476e06 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala @@ -59,7 +59,7 @@ object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => metricSubstitutionContexts .map { metricSubstitutionContext => val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { - metricSubstitutionContext.entityName.substring(metricSubstitutionContext.entityName.length - 4) + metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) } else { metricSubstitutionContext.entityName } From afd76768e89921c9c407fef1686b69b119492a5e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 4 Apr 2022 17:25:36 +0200 Subject: [PATCH 23/37] Use original builders instead --- .../scala/hex/MetricsCalculationTypeExtensions.java | 10 +++++----- .../h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala | 5 ++--- .../sparkling/ml/metrics/H2OMultinomialMetrics.scala | 4 ++-- .../sparkling/ml/metrics/H2ORegressionMetrics.scala | 4 ++-- .../h2o/sparkling/ml/metrics/MetricCalculation.scala | 10 +++++----- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java index 54a58fa9e4..4a1aa22d49 100644 --- a/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java +++ b/extensions/src/main/scala/hex/MetricsCalculationTypeExtensions.java @@ -6,11 +6,11 @@ public class MetricsCalculationTypeExtensions implements TypeMapExtension { public static final String[] MODEL_BUILDER_CLASSES = { - ModelMetrics.IndependentMetricBuilder.class.getName(), - ModelMetricsSupervised.IndependentMetricBuilderSupervised.class.getName(), - ModelMetricsBinomial.IndependentMetricBuilderBinomial.class.getName(), + ModelMetrics.MetricBuilder.class.getName(), + ModelMetricsSupervised.MetricBuilderSupervised.class.getName(), + ModelMetricsBinomial.MetricBuilderBinomial.class.getName(), AUC2.AUCBuilder.class.getName(), - ModelMetricsRegression.IndependentMetricBuilderRegression.class.getName(), + ModelMetricsRegression.MetricBuilderRegression.class.getName(), Distribution.class.getName(), GaussianDistribution.class.getName(), BernoulliDistribution.class.getName(), @@ -33,7 +33,7 @@ public class MetricsCalculationTypeExtensions implements TypeMapExtension { OlogitFunction.class.getName(), OloglogFunction.class.getName(), OprobitFunction.class.getName(), - ModelMetricsMultinomial.IndependentMetricBuilderMultinomial.class.getName() + ModelMetricsMultinomial.MetricBuilderMultinomial.class.getName() }; public 
static final String[] SCHEMA_CLASSES = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index ca416cc2d2..235c825517 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -17,8 +17,7 @@ package ai.h2o.sparkling.ml.metrics -import hex.ModelMetricsBinomial.IndependentMetricBuilderBinomial -import hex.genmodel.utils.DistributionFamily +import hex.ModelMetricsBinomial.MetricBuilderBinomial import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types._ @@ -59,7 +58,7 @@ object H2OBinomialMetrics extends MetricCalculation { weightColOption: Option[String] = None, offsetColOption: Option[String] = None): H2OBinomialMetrics = { validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) - val getMetricBuilder = () => new IndependentMetricBuilderBinomial(domain, DistributionFamily.bernoulli) + val getMetricBuilder = () => new MetricBuilderBinomial(domain) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index a04f410119..b9a31b50e7 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -17,7 +17,7 @@ package ai.h2o.sparkling.ml.metrics -import hex.ModelMetricsMultinomial.IndependentMetricBuilderMultinomial +import hex.ModelMetricsMultinomial.MetricBuilderMultinomial import hex.MultinomialAucType import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable @@ 
-67,7 +67,7 @@ object H2OMultinomialMetrics extends MetricCalculation { val aucTypeEnum = MultinomialAucType.valueOf(aucType) val nclasses = domain.length val getMetricBuilder = - () => new IndependentMetricBuilderMultinomial(nclasses, domain, aucTypeEnum, null) + () => new MetricBuilderMultinomial(nclasses, domain, aucTypeEnum) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) val gson = diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index fc476a64b2..8b55e92ede 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -18,7 +18,7 @@ package ai.h2o.sparkling.ml.metrics import hex.DistributionFactory -import hex.ModelMetricsRegression.IndependentMetricBuilderRegression +import hex.ModelMetricsRegression.MetricBuilderRegression import hex.genmodel.utils.DistributionFamily import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.col @@ -54,7 +54,7 @@ object H2ORegressionMetrics extends MetricCalculation { offsetColOption: Option[String] = None): H2ORegressionMetrics = { validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) val getMetricBuilder = - () => new IndependentMetricBuilderRegression(DistributionFactory.getDistribution(DistributionFamily.AUTO)) + () => new MetricBuilderRegression(DistributionFactory.getDistribution(DistributionFamily.AUTO)) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast DoubleType) val gson = getMetricGson(getMetricBuilder, castedLabelDF, predictionCol, labelCol, offsetColOption, weightColOption, null) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 
dd15759c18..169b054cc1 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -19,7 +19,7 @@ package ai.h2o.sparkling.ml.metrics import com.google.gson.{GsonBuilder, JsonObject} import hex._ -import hex.ModelMetrics.IndependentMetricBuilder +import hex.ModelMetrics.MetricBuilder import org.apache.spark.sql.{DataFrame, Row} import water.api.{Schema, SchemaServer} import water.api.schemas3._ @@ -91,7 +91,7 @@ trait MetricCalculation { protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double protected def getMetricGson( - createMetricBuilder: () => IndependentMetricBuilder[_], + createMetricBuilder: () => MetricBuilder[_], dataFrame: DataFrame, predictionCol: String, labelCol: String, @@ -108,7 +108,7 @@ trait MetricCalculation { val predictionType = flatDF.schema.fields(0).dataType val actualType = flatDF.schema.fields(1).dataType val filledMetricsBuilder = flatDF.rdd - .mapPartitions[IndependentMetricBuilder[_]] { rows => + .mapPartitions[MetricBuilder[_]] { rows => val metricBuilder = createMetricBuilder() while (rows.hasNext) { val row = rows.next() @@ -116,14 +116,14 @@ trait MetricCalculation { val actualValue: Double = getActualValue(actualType, domain, row) val weight = row.getDouble(2) val offset = row.getDouble(3) - metricBuilder.perRow(prediction, Array(actualValue), weight, offset) + metricBuilder.perRow(prediction, Array(actualValue.toFloat), weight, offset, null) } Iterator.single(metricBuilder) } .reduce((f, s) => { f.reduce(s); f }) filledMetricsBuilder.postGlobal() - val metrics = filledMetricsBuilder.makeModelMetrics() + val metrics = filledMetricsBuilder.makeModelMetrics(null, null, null, null) val schema = metricsToSchema(metrics) val json = schema.toJsonString new GsonBuilder().create().fromJson(json, classOf[JsonObject]) From 756e169376da161723296b0467cf09c2768bad0e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Mon, 4 Apr 2022 19:34:59 +0200 Subject: [PATCH 24/37] Add python smoke tests --- .../test_metric_calculation.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 py/tests/unit/with_runtime_sparkling/test_metric_calculation.py diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py new file mode 100644 index 0000000000..f0f71390ab --- /dev/null +++ b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +from pysparkling.ml import * + +from ai.h2o.sparkling.ml.models.H2OBinomialMetrics import H2OBinomialMetrics +from ai.h2o.sparkling.ml.models.H2OMultinomialMetrics import H2OMultinomialMetrics +from ai.h2o.sparkling.ml.models.H2ORegressionMetrics import H2ORegressionMetrics +from ai.h2o.sparkling.ml.models.H2OMOJOModel import H2OMOJOModel + + +def testRegressionMetricsCalculation(prostateDataset): + mojo = H2OMOJOModel.createFromMojo( + "file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo")) + metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "capsule") + assert metrics is not None + + +def testBinomialMetricsCalculation(prostateDataset): + mojo = H2OMOJOModel.createFromMojo( + "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")) + domain = mojo.getDomainValues()["capsule"] + metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "capsule") + assert metrics is not None + + +def testMultinomialMetricsCalculation(irisDataset): + mojo = H2OMOJOModel.createFromMojo( + "file://" + os.path.abspath("../ml/src/test/resources/multi_model_iris.mojo")) + domain = mojo.getDomainValues()["class"] + metrics = H2OMultinomialMetrics.calculate(mojo.transform(irisDataset), domain, labelCol = "class") + assert metrics is not None From d1de56298782af96939d9be8b42793a92960c3a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 5 Apr 2022 10:33:21 +0200 Subject: [PATCH 25/37] Use master version --- gradle.properties | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gradle.properties b/gradle.properties index adc9eefdee..6055f1b8f3 100644 --- a/gradle.properties +++ b/gradle.properties @@ -29,11 +29,11 @@ pythonEnvironments=2.7 3.6 3.7 3.8 # Select for which Spark version is Sparkling Water built by default spark=3.2 # Sparkling Water Version -version=3.38.0.1-199-SNAPSHOT +version=3.38.0.1-1-SNAPSHOT 
# Spark version from which is Kubernetes Supported kubernetesSupportSinceSpark=2.4 databricksTestSinceSpark=2.4 spotlessModern=true -testH2OBranch=mn/PUBDEV-8373b +testH2OBranch=master makeBooklet=false testingBaseImage="harbor.h2o.ai/opsh2oai/h2o-3-hadoop-cdh-6.3:84" From df87eb3330ba291eb402a34c923e875eeb787224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 5 Apr 2022 20:02:33 +0200 Subject: [PATCH 26/37] Update python API --- .../python/MetricsInitTemplate.scala | 8 ++++- .../src/ai/h2o/sparkling/ml/__init__.py | 1 + .../ml/metrics/H2OBinomialMetrics.py | 26 ++++++++++++---- .../ml/metrics/H2OMultinomialMetrics.py | 31 +++++++++++++++---- .../ml/metrics/H2ORegressionMetrics.py | 23 +++++++++----- py-scoring/src/pysparkling/ml/__init__.py | 4 ++- .../src/pysparkling/ml/metrics/__init__.py | 20 ++++++++++++ py/src/ai/h2o/sparkling/ml/__init__.py | 1 + py/src/pysparkling/ml/__init__.py | 3 +- py/src/pysparkling/ml/metrics/__init__.py | 21 +++++++++++++ .../unit/with_runtime_sparkling/conftest.py | 5 +++ .../test_metric_calculation.py | 9 ++---- .../ml/metrics/H2OBinomialMetrics.scala | 7 +++-- .../ml/metrics/H2OMultinomialMetrics.scala | 11 ++++--- .../ml/metrics/H2ORegressionMetrics.scala | 3 +- 15 files changed, 135 insertions(+), 38 deletions(-) create mode 100644 py-scoring/src/pysparkling/ml/metrics/__init__.py create mode 100644 py/src/pysparkling/ml/metrics/__init__.py diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala index 1bdb65d56c..859cece2e8 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala @@ -22,7 +22,13 @@ import ai.h2o.sparkling.api.generation.common.{EntitySubstitutionContext, ModelM object MetricsInitTemplate 
extends ((Seq[ModelMetricsSubstitutionContext]) => String) with PythonEntityTemplate { def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { - val metricClasses = metricSubstitutionContexts.map(_.entityName) + val metricClasses = metricSubstitutionContexts.map { metricSubstitutionContext => + if (metricSubstitutionContext.entityName.endsWith("Base")) { + metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) + } else { + metricSubstitutionContext.entityName + } + } val imports = metricClasses.map(metricClass => s"ai.h2o.sparkling.ml.metrics.$metricClass.$metricClass") val entitySubstitutionContext = EntitySubstitutionContext(null, null, null, imports) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/__init__.py b/py-scoring/src/ai/h2o/sparkling/ml/__init__.py index f0a1ffabe2..78169a4807 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/__init__.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/__init__.py @@ -20,3 +20,4 @@ from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2ODRFMOJOModel, H2OIsolationForestMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings from ai.h2o.sparkling.ml.models import H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OWord2VecMOJOModel +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py index d7ade2d008..be6a87ef17 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -30,12 +30,26 @@ def calculate(dataFrame, labelCol = "label", weightCol = None, offsetCol = None): + ''' + The method calculates binomial metrics on a provided data frame with 
predictions and actual values. + :param dataFrame: A data frame with predictions and actual values + :param domain: A list of classes representing negative and positive response. Negative class must at position 0 + and positive at 1 + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or a array type or + vector of doubles. First item is must be 0.0 or 1.0 representing negative or positive response. The other items + must be probabilities to predict given probability classes. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param offsetCol: The name of a offset column. + :return: Calculated binomial metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculateInternal(dataFrame._jdf, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol) return H2OBinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py index e7432ed5b3..7a7854b455 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -30,12 +30,31 @@ def calculate(dataFrame, labelCol = "label", weightCol = None, aucType = "AUTO"): + ''' + The method calculates multinomial metrics on a provided data frame with predictions and actual values. + :param dataFrame: A data frame with predictions and actual values. 
+ :param domain: List of response classes. + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or a array type or + vector of doubles. First item is must be 0.0, 1.0, 2.0 representing indexes of response classes. The other + items must be probabilities to predict given probability classes. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param aucType: Type of multinomial AUC/AUCPR calculation. Possible values: + - AUTO, + - NONE, + - MACRO_OVR, + - WEIGHTED_OVR, + - MACRO_OVO, + - WEIGHTED_OVO + :return: Calculated multinomial metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - aucType) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculateInternal(dataFrame._jdf, + domain, + predictionCol, + labelCol, + weightCol, + aucType) return H2OMultinomialMetrics(javaMetrics) diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py index 2daa054f86..b82dd6ded4 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py @@ -25,17 +25,26 @@ class H2ORegressionMetrics(H2ORegressionMetricsBase): @staticmethod def calculate(dataFrame, - domain, predictionCol = "detailed_prediction", labelCol = "label", weightCol = None, offsetCol = None): + ''' + The method calculates regression metrics on a provided data frame with predictions and actual values. 
+ :param dataFrame: A data frame with predictions and actual values + :param predictionCol: The name of prediction column. The prediction column must have the same type as + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + it must be of DoubleType or FloatType. + :param labelCol: The name of label column that contains actual values. + :param weightCol: The name of a weight column. + :param offsetCol: The name of a offset column. + :return: Calculated regression metrics + ''' # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths Initializer.load_sparkling_jar() - javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculate(dataFrame, - domain, - predictionCol, - labelCol, - weightCol, - offsetCol) + javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculateInternal(dataFrame._jdf, + predictionCol, + labelCol, + weightCol, + offsetCol) return H2ORegressionMetrics(javaMetrics) diff --git a/py-scoring/src/pysparkling/ml/__init__.py b/py-scoring/src/pysparkling/ml/__init__.py index 7aa23d5515..3f2cfc4c2d 100644 --- a/py-scoring/src/pysparkling/ml/__init__.py +++ b/py-scoring/src/pysparkling/ml/__init__.py @@ -16,12 +16,14 @@ # from pysparkling.ml.models import * +from pysparkling.ml.metrics import * __all__ = ["H2OMOJOModel", "H2OSupervisedMOJOModel", "H2OTreeBasedSupervisedMOJOModel", "H2OUnsupervisedMOJOModel", "H2OTreeBasedUnsupervisedMOJOModel", "H2OMOJOPipelineModel", "H2OMOJOSettings", "H2OBinaryModel", "H2OKMeansMOJOModel", "H2OGLMMOJOModel", "H2OGAMMOJOModel", "H2OGBMMOJOModel", "H2OXGBoostMOJOModel", "H2ODeepLearningMOJOModel", "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OPCAMOJOModel", - "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel"] + "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel", + "H2ORegressionMetrics", "H2OMultinomialMetrics", 
"H2OBinomialMetrics"] from pysparkling.initializer import Initializer diff --git a/py-scoring/src/pysparkling/ml/metrics/__init__.py b/py-scoring/src/pysparkling/ml/metrics/__init__.py new file mode 100644 index 0000000000..a24e87398c --- /dev/null +++ b/py-scoring/src/pysparkling/ml/metrics/__init__.py @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OMultinomialMetrics, H2OBinomialMetrics + +__all__ = ["H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"] diff --git a/py/src/ai/h2o/sparkling/ml/__init__.py b/py/src/ai/h2o/sparkling/ml/__init__.py index 89f8998524..d4748a8ba2 100644 --- a/py/src/ai/h2o/sparkling/ml/__init__.py +++ b/py/src/ai/h2o/sparkling/ml/__init__.py @@ -27,3 +27,4 @@ from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2OWord2VecMOJOModel, H2OAutoEncoderMOJOModel, H2ODRFMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel from ai.h2o.sparkling.ml.models import H2OIsolationForestMOJOModel, H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OStackedEnsembleMOJOModel from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics diff --git a/py/src/pysparkling/ml/__init__.py b/py/src/pysparkling/ml/__init__.py index 2a5c74fab7..3a1721c887 100644 --- a/py/src/pysparkling/ml/__init__.py +++ b/py/src/pysparkling/ml/__init__.py @@ -19,6 +19,7 @@ from pysparkling.ml.algos.regression import * from pysparkling.ml.features import * from pysparkling.ml.models import * +from pysparkling.ml.metrics import * __all__ = ["ColumnPruner", "H2OGBM", "H2ODeepLearning", "H2OAutoML", "H2OXGBoost", "H2OGLM", "H2OCoxPH", "H2OGAM", "H2OMOJOModel", "H2OAlgorithmMOJOModel", "H2OFeatureMOJOModel", "H2OSupervisedMOJOModel", @@ -32,7 +33,7 @@ "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OWord2Vec", "H2OWord2VecMOJOModel", "H2OAutoEncoder", "H2OAutoEncoderMOJOModel", "H2OPCA", "H2OPCAMOJOModel", "H2OGLRM", "H2OGLRMMOJOModel", "H2ORuleFit", "H2ORuleFitClassifier", "H2ORuleFitRegressor", "H2ORuleFitMOJOModel", "H2OStackedEnsemble", - "H2OStackedEnsembleMOJOModel"] + "H2OStackedEnsembleMOJOModel", "H2ORegressionMetrics", "H2OBinomialMetrics", 
"H2OMultinomialMetrics"] from pysparkling.initializer import Initializer diff --git a/py/src/pysparkling/ml/metrics/__init__.py b/py/src/pysparkling/ml/metrics/__init__.py new file mode 100644 index 0000000000..9bec18e1f3 --- /dev/null +++ b/py/src/pysparkling/ml/metrics/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics + + +__all__ = ["H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"] diff --git a/py/tests/unit/with_runtime_sparkling/conftest.py b/py/tests/unit/with_runtime_sparkling/conftest.py index 2b8c0799d2..e7350c7d13 100644 --- a/py/tests/unit/with_runtime_sparkling/conftest.py +++ b/py/tests/unit/with_runtime_sparkling/conftest.py @@ -60,6 +60,11 @@ def irisDatasetPath(): return "file://" + os.path.abspath("../examples/smalldata/iris/iris_wheader.csv") +@pytest.fixture(scope="module") +def irisDataset(spark, irisDatasetPath): + return spark.read.csv(irisDatasetPath, header=True, inferSchema=True) + + @pytest.fixture(scope="module") def airlinesDatasetPath(): return "file://" + os.path.abspath("../examples/smalldata/airlines/allyears2k_headers.csv") diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py index f0f71390ab..def81191ef 100644 --- a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py +++ b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py @@ -17,16 +17,11 @@ import os from pysparkling.ml import * -from ai.h2o.sparkling.ml.models.H2OBinomialMetrics import H2OBinomialMetrics -from ai.h2o.sparkling.ml.models.H2OMultinomialMetrics import H2OMultinomialMetrics -from ai.h2o.sparkling.ml.models.H2ORegressionMetrics import H2ORegressionMetrics -from ai.h2o.sparkling.ml.models.H2OMOJOModel import H2OMOJOModel - def testRegressionMetricsCalculation(prostateDataset): mojo = H2OMOJOModel.createFromMojo( "file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo")) - metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "capsule") + metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "CAPSULE") assert metrics is not None @@ -34,7 +29,7 @@ def 
testBinomialMetricsCalculation(prostateDataset): mojo = H2OMOJOModel.createFromMojo( "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")) domain = mojo.getDomainValues()["capsule"] - metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "capsule") + metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "CAPSULE") assert metrics is not None diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 235c825517..57ae6ddec7 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -68,14 +68,15 @@ object H2OBinomialMetrics extends MetricCalculation { result } - def calculate( + // The method serves for call from Python/R API + def calculateInternal( dataFrame: DataFrame, - domain: Array[String], + domain: java.util.ArrayList[String], predictionCol: String, labelCol: String, weightCol: String, offsetCol: String): Unit = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) + calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index b9a31b50e7..a65fe556fb 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -22,7 +22,7 @@ import hex.MultinomialAucType import org.apache.spark.{ExposeUtils, ml, mllib} import 
org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType} @MetricsDescription( @@ -38,7 +38,7 @@ object H2OMultinomialMetrics extends MetricCalculation { /** * The method calculates multinomial metrics on a provided data frame with predictions and actual values. * - * @param dataFrame A data frame with predictions and actual values + * @param dataFrame A data frame with predictions and actual values. * @param domain Array of response classes. * @param predictionCol The name of prediction column. The prediction column must have the same type as * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or @@ -77,14 +77,15 @@ object H2OMultinomialMetrics extends MetricCalculation { result } - def calculate( + // The method serves for call from Python/R API + def calculateInternal( dataFrame: DataFrame, - domain: Array[String], + domain: java.util.ArrayList[String], predictionCol: String, labelCol: String, weightCol: String, aucType: String): H2OMultinomialMetrics = { - calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType) + calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 8b55e92ede..54e7ac3209 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -63,7 +63,8 @@ object H2ORegressionMetrics extends 
MetricCalculation { result } - def calculate( + // The method serves for call from Python/R API + def calculateInternal( dataFrame: DataFrame, predictionCol: String, labelCol: String, From 87d434c4811e6fbb476b2ef1ebde01048b1e2965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Thu, 7 Apr 2022 11:55:43 +0200 Subject: [PATCH 27/37] fix formatting --- .../ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala | 8 +++++++- .../h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 57ae6ddec7..368f40ade2 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -76,7 +76,13 @@ object H2OBinomialMetrics extends MetricCalculation { labelCol: String, weightCol: String, offsetCol: String): Unit = { - calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), Option(offsetCol)) + calculate( + dataFrame, + domain.toArray[String](new Array[String](0)), + predictionCol, + labelCol, + Option(weightCol), + Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index a65fe556fb..96df9ecf0a 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -85,7 +85,13 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol: String, weightCol: String, aucType: String): H2OMultinomialMetrics = { 
- calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), aucType) + calculate( + dataFrame, + domain.toArray[String](new Array[String](0)), + predictionCol, + labelCol, + Option(weightCol), + aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { From 3002e2119d7a7b2177852ea2a3eaedce5a9aaf98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Thu, 7 Apr 2022 14:39:55 +0200 Subject: [PATCH 28/37] fix python test --- .../python/MetricsFactoryTemplate.scala | 16 +++++++++++----- .../generation/r/MetricsFactoryTemplate.scala | 17 ++++++++++++----- .../sparkling/ml/metrics/H2OBinomialMetrics.R | 19 +++++++++++++++++++ .../ml/metrics/H2OMultinomialMetrics.R | 19 +++++++++++++++++++ .../ml/metrics/H2ORegressionMetrics.R | 17 +++++++++++++++++ 5 files changed, 78 insertions(+), 10 deletions(-) diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala index 25f8f2c653..c7db3a9c0a 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsFactoryTemplate.scala @@ -22,7 +22,7 @@ import ai.h2o.sparkling.api.generation.common.{EntitySubstitutionContext, ModelM object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => String) with PythonEntityTemplate { def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { - val metricClasses = metricSubstitutionContexts.map(_.entityName) + val metricClasses = getEntityNames(metricSubstitutionContexts) val imports = Seq("py4j.java_gateway.JavaObject") ++ metricClasses.map(metricClass => s"ai.h2o.sparkling.ml.metrics.$metricClass.$metricClass") @@ -46,16 +46,22 @@ 
object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => } } - private def generatePatternMatchingCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { + private def getEntityNames(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): Seq[String] = { metricSubstitutionContexts .map { metricSubstitutionContext => - val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { + if (metricSubstitutionContext.entityName.endsWith("Base")) { metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) } else { metricSubstitutionContext.entityName } - s""" elif javaObject.getClass().getSimpleName() == "$metricsObjectName": - | return $metricsObjectName(javaObject)""".stripMargin + } + } + + private def generatePatternMatchingCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { + getEntityNames(metricSubstitutionContexts) + .map { entityName => + s""" elif javaObject.getClass().getSimpleName() == "$entityName": + | return $entityName(javaObject)""".stripMargin } .mkString("\n") } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala index d3c8476e06..29fdcd1c68 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/r/MetricsFactoryTemplate.scala @@ -22,7 +22,7 @@ import ai.h2o.sparkling.api.generation.common.ModelMetricsSubstitutionContext object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => String) { def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { - val metricClasses = metricSubstitutionContexts.map(_.entityName) + val metricClasses = getEntityNames(metricSubstitutionContexts) val 
imports = metricClasses.map(metricClass => s"""source(file.path("R", "${metricClass}.R"))""").mkString("\n") s"""# @@ -55,16 +55,23 @@ object MetricsFactoryTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => |""".stripMargin } - private def generateCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { + private def getEntityNames(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): Seq[String] = { metricSubstitutionContexts .map { metricSubstitutionContext => - val metricsObjectName = if (metricSubstitutionContext.entityName.endsWith("Base")) { + if (metricSubstitutionContext.entityName.endsWith("Base")) { metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4) } else { metricSubstitutionContext.entityName } - s""" } else if (invoke(invoke(javaObject, "getClass"), "getSimpleName") == "$metricsObjectName") { - | rsparkling.$metricsObjectName(javaObject)""".stripMargin + } + } + + private def generateCases(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = { + val names = getEntityNames(metricSubstitutionContexts) + names + .map { entityName => + s""" } else if (invoke(invoke(javaObject, "getClass"), "getSimpleName") == "$entityName") { + | rsparkling.$entityName(javaObject)""".stripMargin } .mkString("\n") } diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R index 3f610d68fd..503df00b3b 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R @@ -19,3 +19,22 @@ source(file.path("R", "H2OBinomialMetricsBase.R")) #' @export rsparkling.H2OBinomialMetricsBase rsparkling.H2OBinomialMetrics <- setRefClass("rsparkling.H2OBinomialMetrics", contains = ("rsparkling.H2OBinomialMetricsBase")) + +H2OBinomialMetrics.calculate <- function(dataFrame, + domain, + predictionCol = "detailed_prediction", + labelCol = 
"label", + weightCol = NULL, + offsetCol = NULL) { + sc <- spark_connection_find()[[1]] + javaMetrics <- invoke_static(sc, + "ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics", + "calculate", + dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + offsetCol) + rsparkling.H2OBinomialMetrics(javaMetrics) +} diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R index b869282eee..9d849f4a7f 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R @@ -19,3 +19,22 @@ source(file.path("R", "H2OMultinomialMetricsBase.R")) #' @export rsparkling.H2OMultinomialMetricsBase rsparkling.H2OMultinomialMetrics <- setRefClass("rsparkling.H2OMultinomialMetrics", contains = ("rsparkling.H2OMultinomialMetricsBase")) + +H2OMultinomialMetrics.calculate <- function(dataFrame, + domain, + predictionCol = "detailed_prediction", + labelCol = "label", + weightCol = NULL, + aucType = "AUTO") { + sc <- spark_connection_find()[[1]] + javaMetrics <- invoke_static(sc, + "ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics", + "calculate", + dataFrame, + domain, + predictionCol, + labelCol, + weightCol, + aucType) + rsparkling.H2OMultinomialMetrics(javaMetrics) +} diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R index 72fe31d420..1c6deb363f 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R @@ -19,3 +19,20 @@ source(file.path("R", "H2ORegressionMetricsBase.R")) #' @export rsparkling.H2ORegressionMetricsBase rsparkling.H2ORegressionMetrics <- setRefClass("rsparkling.H2ORegressionMetrics", contains = ("rsparkling.H2ORegressionMetricsBase")) + +H2ORegressionMetrics.calculate <- function(dataFrame, + predictionCol = "detailed_prediction", + labelCol = "label", + 
weightCol = NULL, + offsetCol = NULL) { + sc <- spark_connection_find()[[1]] + javaMetrics <- invoke_static(sc, + "ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics", + "calculate", + dataFrame, + predictionCol, + labelCol, + weightCol, + offsetCol) + rsparkling.H2ORegressionMetrics(javaMetrics) +} From 93fd203cba7aad4d7ea268998d302ef713ced8ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Fri, 8 Apr 2022 15:17:55 +0200 Subject: [PATCH 29/37] Add R tests --- .../sparkling/ml/metrics/H2OBinomialMetrics.R | 7 +- .../ml/metrics/H2OMultinomialMetrics.R | 7 +- .../ml/metrics/H2ORegressionMetrics.R | 7 +- r/src/tests/testthat/testMetricCalculation.R | 145 ++++++++++++++++++ r/src/tests/testthat/testMojo.R | 39 ----- .../ml/metrics/H2OBinomialMetrics.scala | 21 ++- .../ml/metrics/H2OMultinomialMetrics.scala | 21 ++- 7 files changed, 195 insertions(+), 52 deletions(-) create mode 100644 r/src/tests/testthat/testMetricCalculation.R diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R index 503df00b3b..6e984bfa29 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.R @@ -20,17 +20,18 @@ source(file.path("R", "H2OBinomialMetricsBase.R")) #' @export rsparkling.H2OBinomialMetricsBase rsparkling.H2OBinomialMetrics <- setRefClass("rsparkling.H2OBinomialMetrics", contains = ("rsparkling.H2OBinomialMetricsBase")) -H2OBinomialMetrics.calculate <- function(dataFrame, +H2OBinomialMetrics.calculate <- function(sparkFrame, domain, predictionCol = "detailed_prediction", labelCol = "label", weightCol = NULL, offsetCol = NULL) { sc <- spark_connection_find()[[1]] + sparkFrame <- spark_dataframe(sparkFrame) javaMetrics <- invoke_static(sc, "ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics", - "calculate", - dataFrame, + "calculateInternal", + sparkFrame, domain, predictionCol, labelCol, diff --git 
a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R index 9d849f4a7f..6362721c71 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.R @@ -20,17 +20,18 @@ source(file.path("R", "H2OMultinomialMetricsBase.R")) #' @export rsparkling.H2OMultinomialMetricsBase rsparkling.H2OMultinomialMetrics <- setRefClass("rsparkling.H2OMultinomialMetrics", contains = ("rsparkling.H2OMultinomialMetricsBase")) -H2OMultinomialMetrics.calculate <- function(dataFrame, +H2OMultinomialMetrics.calculate <- function(sparkFrame, domain, predictionCol = "detailed_prediction", labelCol = "label", weightCol = NULL, aucType = "AUTO") { sc <- spark_connection_find()[[1]] + sparkFrame <- spark_dataframe(sparkFrame) javaMetrics <- invoke_static(sc, "ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics", - "calculate", - dataFrame, + "calculateInternal", + sparkFrame, domain, predictionCol, labelCol, diff --git a/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R index 1c6deb363f..26996608c4 100644 --- a/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R +++ b/r/src/R/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.R @@ -20,16 +20,17 @@ source(file.path("R", "H2ORegressionMetricsBase.R")) #' @export rsparkling.H2ORegressionMetricsBase rsparkling.H2ORegressionMetrics <- setRefClass("rsparkling.H2ORegressionMetrics", contains = ("rsparkling.H2ORegressionMetricsBase")) -H2ORegressionMetrics.calculate <- function(dataFrame, +H2ORegressionMetrics.calculate <- function(sparkFrame, predictionCol = "detailed_prediction", labelCol = "label", weightCol = NULL, offsetCol = NULL) { sc <- spark_connection_find()[[1]] + sparkFrame <- spark_dataframe(sparkFrame) javaMetrics <- invoke_static(sc, "ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics", - "calculate", - dataFrame, + 
"calculateInternal", + sparkFrame, predictionCol, labelCol, weightCol, diff --git a/r/src/tests/testthat/testMetricCalculation.R b/r/src/tests/testthat/testMetricCalculation.R new file mode 100644 index 0000000000..8d308b06ad --- /dev/null +++ b/r/src/tests/testthat/testMetricCalculation.R @@ -0,0 +1,145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +context("Test metrics calculation") + +config <- spark_config() +config <- c(config, list( + "spark.hadoop.yarn.timeline-service.enabled" = "false", + "spark.ext.h2o.external.cluster.size" = "1", + "spark.ext.h2o.backend.cluster.mode" = Sys.getenv("spark.ext.h2o.backend.cluster.mode"), + "sparklyr.connect.enablehivesupport" = FALSE, + "sparklyr.gateway.connect.timeout" = 240, + "sparklyr.gateway.start.timeout" = 240, + "sparklyr.backend.timeout" = 240, + "sparklyr.log.console" = TRUE, + "spark.ext.h2o.external.start.mode" = "auto", + "spark.ext.h2o.external.disable.version.check" = "true", + "sparklyr.gateway.port" = 55555, + "sparklyr.connect.timeout" = 60 * 5, + "spark.master" = "local[*]" +)) + +for (i in 1:4) { + tryCatch( + { + sc <- spark_connect(master = "local[*]", config = config) + }, error = function(e) { } + ) +} + +locate <- function(fileName) { + normalizePath(file.path("../../../../../examples/", fileName)) +} + +test_that("test training metrics", { + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + metrics <- model$getTrainingMetrics() + expect_equal(as.character(metrics[["AUC"]]), "0.896878869021911") + expect_equal(length(metrics), 10) +}) + +test_that("test training metrics object", { + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + metrics <- model$getTrainingMetricsObject() + aucValue <- metrics$getAUC() + scoringTime <- metrics$getScoringTime() + + thresholdsAndScores <- metrics$getThresholdsAndMetricScores() + thresholdsAndScoresFrame <- dplyr::tally(thresholdsAndScores) + thresholdsAndScoresCount <- as.double(dplyr::collect(thresholdsAndScoresFrame)[[1]]) + + gainsLiftTable <- metrics$getGainsLiftTable() + gainsLiftTableFrame <- dplyr::tally(gainsLiftTable) + gainsLiftTableCount <- as.double(dplyr::collect(gainsLiftTableFrame)[[1]]) + + 
expect_equal(as.character(aucValue), "0.896878869021911") + expect_true(scoringTime > 0) + expect_true(thresholdsAndScoresCount > 0) + expect_true(gainsLiftTableCount > 0) +}) + +test_that("test null cross validation metrics object", { + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + cvObject <- model$getCrossValidationMetricsObject() + expect_true(is.null(cvObject)) +}) + +test_that("test current metrics", { + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + metrics <- model$getCurrentMetrics() + expect_equal(metrics, model$getTrainingMetrics()) +}) + +test_that("test calculation of regression metrics", { + path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) + dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/regre_model_prostate.mojo"))) + predictions <- model$transform(dataset) + + metrics <- H2ORegressionMetrics.calculate(predictions, labelCol = "CAPSULE") + + mae <- metrics$getMAE() + rmsle <- metrics$getRMSLE() + + expect_true(mae > 0.0) + expect_true(rmsle > 0.0) +}) + +test_that("test calculation of binomial metrics", { + path <- paste0("file://", locate("smalldata/prostate/prostate.csv")) + dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) + predictions <- model$transform(dataset) + domainValues <- model$getDomainValues() + + metrics <- H2OBinomialMetrics.calculate(predictions, domainValues[["capsule"]], labelCol = "CAPSULE") + + aucValue <- metrics$getAUC() + scoringTime <- metrics$getScoringTime() + + thresholdsAndScores <- 
metrics$getThresholdsAndMetricScores() + thresholdsAndScoresFrame <- dplyr::tally(thresholdsAndScores) + thresholdsAndScoresCount <- as.double(dplyr::collect(thresholdsAndScoresFrame)[[1]]) + + expect_true(aucValue > 0.6) + expect_true(scoringTime > 0) + expect_true(thresholdsAndScoresCount > 0) +}) + +test_that("test calculation of multinomial metrics", { + path <- paste0("file://", locate("smalldata/iris/iris_wheader.csv")) + dataset <- spark_read_csv(sc, path = path, infer_schema = TRUE, header = TRUE) + model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/multi_model_iris.mojo"))) + predictions <- model$transform(dataset) + domainValues <- model$getDomainValues() + + metrics <- H2OMultinomialMetrics.calculate(predictions, domainValues[["class"]], labelCol = "class") + + aucValue <- metrics$getAUC() + scoringTime <- metrics$getScoringTime() + + confusionMatrix <- metrics$getConfusionMatrix() + confusionMatrixFrame <- dplyr::tally(confusionMatrix) + confusionMatrixCount <- as.double(dplyr::collect(confusionMatrixFrame)[[1]]) + + expect_true(aucValue > 0.6) + expect_true(scoringTime > 0) + expect_true(confusionMatrixCount > 0) +}) + +spark_disconnect(sc) diff --git a/r/src/tests/testthat/testMojo.R b/r/src/tests/testthat/testMojo.R index 3c830c8e5e..099274bf64 100644 --- a/r/src/tests/testthat/testMojo.R +++ b/r/src/tests/testthat/testMojo.R @@ -115,45 +115,6 @@ test_that("test model category", { expect_equal(category, "Binomial") }) -test_that("test training metrics", { - model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - metrics <- model$getTrainingMetrics() - expect_equal(as.character(metrics[["AUC"]]), "0.896878869021911") - expect_equal(length(metrics), 10) -}) - -test_that("test training metrics object", { - model <- H2OMOJOModel.createFromMojo(paste0("file://", 
normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - metrics <- model$getTrainingMetricsObject() - aucValue <- metrics$getAUC() - scoringTime <- metrics$getScoringTime() - - thresholdsAndScores <- metrics$getThresholdsAndMetricScores() - thresholdsAndScoresFrame <- dplyr::tally(thresholdsAndScores) - thresholdsAndScoresCount <- as.double(dplyr::collect(thresholdsAndScoresFrame)[[1]]) - - gainsLiftTable <- metrics$getGainsLiftTable() - gainsLiftTableFrame <- dplyr::tally(gainsLiftTable) - gainsLiftTableCount <- as.double(dplyr::collect(gainsLiftTableFrame)[[1]]) - - expect_equal(as.character(aucValue), "0.896878869021911") - expect_true(scoringTime > 0) - expect_true(thresholdsAndScoresCount > 0) - expect_true(gainsLiftTableCount > 0) -}) - -test_that("test null cross validation metrics object", { - model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - cvObject <- model$getCrossValidationMetricsObject() - expect_true(is.null(cvObject)) -}) - -test_that("test current metrics", { - model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))) - metrics <- model$getCurrentMetrics() - expect_equal(metrics, model$getTrainingMetrics()) -}) - test_that("test MOJO predictions on unseen categoricals", { path <- paste0("file://", normalizePath("../../../../../ml/src/test/resources/deep_learning_airlines_categoricals.zip")) settings <- H2OMOJOSettings(convertUnknownCategoricalLevelsToNa = TRUE) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 368f40ade2..442ee5effd 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -68,14 +68,14 @@ object 
H2OBinomialMetrics extends MetricCalculation { result } - // The method serves for call from Python/R API + // The method serves for call from Python API def calculateInternal( dataFrame: DataFrame, domain: java.util.ArrayList[String], predictionCol: String, labelCol: String, weightCol: String, - offsetCol: String): Unit = { + offsetCol: String): H2OBinomialMetrics = { calculate( dataFrame, domain.toArray[String](new Array[String](0)), @@ -85,6 +85,23 @@ object H2OBinomialMetrics extends MetricCalculation { Option(offsetCol)) } + // The method serves for call from R API + def calculateInternal( + dataFrame: DataFrame, + domain: Array[String], + predictionCol: String, + labelCol: String, + weightCol: String, + offsetCol: String): H2OBinomialMetrics = { + calculate( + dataFrame, + domain, + predictionCol, + labelCol, + Option(weightCol), + Option(offsetCol)) + } + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { dataType match { case StructType(fields) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index 96df9ecf0a..a17e499ced 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, S "The class makes available all metrics that shared across all algorithms supporting multinomial classification.") class H2OMultinomialMetrics(override val uid: String) extends H2OMultinomialMetricsBase(uid) { - def this() = this(Identifiable.randomUID("H2OBinomialMetrics")) + def this() = this(Identifiable.randomUID("H2OMultinomialMetrics")) } object H2OMultinomialMetrics extends MetricCalculation { @@ -77,7 +77,7 @@ object H2OMultinomialMetrics extends MetricCalculation { result } - 
// The method serves for call from Python/R API + // The method serves for call from Python API def calculateInternal( dataFrame: DataFrame, domain: java.util.ArrayList[String], @@ -94,6 +94,23 @@ object H2OMultinomialMetrics extends MetricCalculation { aucType) } + // The method serves for call from R API + def calculateInternal( + dataFrame: DataFrame, + domain: Array[String], + predictionCol: String, + labelCol: String, + weightCol: String, + aucType: String): H2OMultinomialMetrics = { + calculate( + dataFrame, + domain, + predictionCol, + labelCol, + Option(weightCol), + aucType) + } + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { dataType match { case StructType(fields) From c939266e305e701aa7aa7e15004a597c61f71103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Fri, 8 Apr 2022 15:53:19 +0200 Subject: [PATCH 30/37] add more conditions to python tests --- .../unit/with_runtime_sparkling/test_metric_calculation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py index def81191ef..19d761eb4e 100644 --- a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py +++ b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py @@ -23,6 +23,8 @@ def testRegressionMetricsCalculation(prostateDataset): "file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo")) metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "CAPSULE") assert metrics is not None + assert metrics.getMAE() > 0.0 + assert metrics.getRMSLE() > 0.0 def testBinomialMetricsCalculation(prostateDataset): @@ -31,6 +33,8 @@ def testBinomialMetricsCalculation(prostateDataset): domain = mojo.getDomainValues()["capsule"] metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "CAPSULE") 
assert metrics is not None + assert metrics.getAUC() > 0.5 + assert metrics.getConfusionMatrix().count() > 0 def testMultinomialMetricsCalculation(irisDataset): @@ -39,3 +43,5 @@ def testMultinomialMetricsCalculation(irisDataset): domain = mojo.getDomainValues()["class"] metrics = H2OMultinomialMetrics.calculate(mojo.transform(irisDataset), domain, labelCol = "class") assert metrics is not None + assert metrics.getAUC() > 0.5 + assert metrics.getConfusionMatrix().count() > 0 From da2c335296ff77d7b2db4ce180eb8585406d9b4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Fri, 8 Apr 2022 16:04:14 +0200 Subject: [PATCH 31/37] spotless apply --- .../ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala | 8 +------- .../h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 442ee5effd..9c54fd4f06 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -93,13 +93,7 @@ object H2OBinomialMetrics extends MetricCalculation { labelCol: String, weightCol: String, offsetCol: String): H2OBinomialMetrics = { - calculate( - dataFrame, - domain, - predictionCol, - labelCol, - Option(weightCol), - Option(offsetCol)) + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index a17e499ced..e7b75cdae0 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ 
b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -102,13 +102,7 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol: String, weightCol: String, aucType: String): H2OMultinomialMetrics = { - calculate( - dataFrame, - domain, - predictionCol, - labelCol, - Option(weightCol), - aucType) + calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType) } override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { From da8417ca84c673f344b2d6bd19bee6c0ccdbebaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 12 Apr 2022 11:30:33 +0200 Subject: [PATCH 32/37] Update metric calculation to work with just probabilities --- .../ml/metrics/BinomialMetricsTestSuite.scala | 40 ++++++++++++- .../metrics/MultinomialMetricsTestSuite.scala | 45 ++++++++++++++- .../ml/metrics/H2OBinomialMetrics.py | 5 +- .../ml/metrics/H2OMultinomialMetrics.py | 6 +- .../ml/metrics/H2OBinomialMetrics.scala | 56 +++++++++++-------- .../ml/metrics/H2OMultinomialMetrics.scala | 52 ++++++++++++----- .../ml/metrics/H2ORegressionMetrics.scala | 14 +++-- .../ml/metrics/MetricCalculation.scala | 1 + 8 files changed, 168 insertions(+), 51 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index dbd1ce0916..5a36495647 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -20,8 +20,8 @@ package ai.h2o.sparkling.ml.metrics import ai.h2o.sparkling.ml.algos._ import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} -import org.apache.spark.sql.functions.rand -import org.apache.spark.sql.SparkSession +import 
org.apache.spark.sql.functions.{rand, col} +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.types._ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -196,6 +196,42 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest validationMetricsTolerance) } + test(s"test calculation of binomial $algorithmName metrics with probabilities passed to predictionCol") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON") + .setLabelCol("CAPSULE") + + val model = algorithm.fit(trainingDataset) + val domain = model.getDomainValues()("CAPSULE") + + def extractProbability(df: DataFrame): DataFrame = { + df.withColumn("probability", col(s"detailed_prediction.probabilities.${domain(1)}")) + } + + val trainingMetricObject = + H2OBinomialMetrics.calculate( + extractProbability(model.transform(trainingDataset)), + domain, + labelCol = "CAPSULE", + predictionCol = "probability") + val validationMetricObject = + H2OBinomialMetrics.calculate( + extractProbability(model.transform(validationDataset)), + domain, + labelCol = "CAPSULE", + predictionCol = "probability") + + assertMetrics( + model, + trainingMetricObject, + validationMetricObject, + trainingMetricsTolerance, + validationMetricsTolerance) + } + test(s"test calculation of binomial $algorithmName metrics with weightCol set on arbitrary dataset") { val algorithm = algorithmGetter() algorithm diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index 02684ee4fe..15a35ba40d 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -20,11 +20,13 @@ package 
ai.h2o.sparkling.ml.metrics import ai.h2o.sparkling.ml.algos._ import ai.h2o.sparkling.ml.models.{H2OGBMMOJOModel, H2OGLMMOJOModel, H2OMOJOModel} import ai.h2o.sparkling.{SharedH2OTestContext, TestUtils} +import hex.genmodel.GenModel import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.functions.{monotonically_increasing_id, rand} +import org.apache.spark.sql.functions.{monotonically_increasing_id, rand, udf} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FunSuite, Matchers} +import org.apache.spark.sql.functions.{array, col} @RunWith(classOf[JUnitRunner]) class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTestContext { @@ -189,6 +191,47 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT validationMetricsTolerance) } + test(s"test calculation of multinomial $algorithmName metrics with just probabilities") { + val algorithm = algorithmGetter() + algorithm + .setValidationDataFrame(validationDataset) + .set(algorithm.getParam("seed"), 1L) + .setFeaturesCols("sepal_len", "sepal_wid", "petal_len", "petal_wid") + .setColumnsToCategorical("class") + .set(algorithm.getParam("aucType"), "MACRO_OVR") + .setLabelCol("class") + + val model = algorithm.fit(trainingDataset) + val priorClassDistribution = model.unwrapMojoModel()._priorClassDistrib + val domain = model.getDomainValues()("class") + def extractProbabilities(df: DataFrame) = { + val columns = domain.map(label => col(s"detailed_prediction.probabilities.$label")) + df.withColumn("probabilities", array(columns: _*)) + } + + val trainingMetricObject = + H2OMultinomialMetrics.calculate( + extractProbabilities(model.transform(trainingDataset)), + domain, + labelCol = "class", + predictionCol = "probabilities", + aucType = "MACRO_OVR") + val validationMetricObject = + H2OMultinomialMetrics.calculate( + extractProbabilities(model.transform(validationDataset)), + domain, + labelCol = 
"class", + predictionCol = "probabilities", + aucType = "MACRO_OVR") + + assertMetrics( + model, + trainingMetricObject, + validationMetricObject, + trainingMetricsTolerance, + validationMetricsTolerance) + } + test(s"test calculation of multinomial $algorithmName metrics with weightCol set on arbitrary dataset") { val algorithm = algorithmGetter() algorithm diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py index be6a87ef17..52b2659549 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py @@ -36,9 +36,8 @@ def calculate(dataFrame, :param domain: A list of classes representing negative and positive response. Negative class must at position 0 and positive at 1 :param predictionCol: The name of prediction column. The prediction column must have the same type as - a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or a array type or - vector of doubles. First item is must be 0.0 or 1.0 representing negative or positive response. The other items - must be probabilities to predict given probability classes. + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant. Or the type must + be FloatType/DoubleType where values represent probabilities of the positive response. :param labelCol: The name of label column that contains actual values. :param weightCol: The name of a weight column. :param offsetCol: The name of a offset column. 
diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py index 7a7854b455..8e41d56f3f 100644 --- a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py +++ b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py @@ -35,9 +35,9 @@ def calculate(dataFrame, :param dataFrame: A data frame with predictions and actual values. :param domain: List of response classes. :param predictionCol: The name of prediction column. The prediction column must have the same type as - a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or a array type or - vector of doubles. First item is must be 0.0, 1.0, 2.0 representing indexes of response classes. The other - items must be probabilities to predict given probability classes. + a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or + a array type or vector of doubles where particular arrays represent class probabilities. + The order of probabilities must correspond to the order of labels in the passed domain. :param labelCol: The name of label column that contains actual values. :param weightCol: The name of a weight column. :param aucType: Type of multinomial AUC/AUCPR calculation. 
Possible values: diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 9c54fd4f06..2d78d7706c 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -18,7 +18,6 @@ package ai.h2o.sparkling.ml.metrics import hex.ModelMetricsBinomial.MetricBuilderBinomial -import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row} @@ -41,10 +40,9 @@ object H2OBinomialMetrics extends MetricCalculation { * @param domain Array of classes representing negative and positive response. Negative class must at position 0 and * positive at 1. * @param predictionCol The name of prediction column. The prediction column must have the same type as - * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or - * a array type or vector of doubles. First item is must be 0.0 or 1.0 representing - * negative or positive response. The other items must be probabilities to predict given probability - * classes. + * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant. + * Or the type must be FloatType/DoubleType where values represent probabilities of + * the positive response. * @param labelCol The name of label column that contains actual values. * @param weightColOption The name of a weight column. * @param offsetColOption The name of a offset column. 
@@ -57,7 +55,7 @@ object H2OBinomialMetrics extends MetricCalculation { labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None): H2OBinomialMetrics = { - validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + validateDataFrameForMetricCalculation(dataFrame, domain, predictionCol, labelCol, offsetColOption, weightColOption) val getMetricBuilder = () => new MetricBuilderBinomial(domain) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast StringType) @@ -100,20 +98,25 @@ object H2OBinomialMetrics extends MetricCalculation { dataType match { case StructType(fields) if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && - fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) && + fields(1).dataType.asInstanceOf[StructType].fields.length == 2 => val predictionStructure = row.getStruct(0) val prediction = predictionStructure.getString(0) val index = domain.indexOf(prediction).toDouble val probabilities = predictionStructure.getStruct(1) - Array(index) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) - case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray - case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray - case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values - case _: mllib.linalg.VectorUDT => row.getAs[mllib.linalg.Vector](0).toDense.values + case StructType(fields) if fields.forall(_.dataType == DoubleType) && fields.length == 2 => + val probabilities = row.getStruct(0) + Array(-1.0) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) + case DoubleType => probabilityToArray(row.getDouble(0)) + case FloatType => probabilityToArray(row.getFloat(0).toDouble) } } + private def probabilityToArray(probability: Double): Array[Double] = 
{ + Array[Double](-1 /* unused */, 1 - probability, probability) + } + override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = { val label = row.getString(1) domain.indexOf(label).toDouble @@ -121,29 +124,36 @@ object H2OBinomialMetrics extends MetricCalculation { override protected def validateDataFrameForMetricCalculation( dataFrame: DataFrame, + domain: Array[String], predictionCol: String, labelCol: String, offsetColOption: Option[String], weightColOption: Option[String]): Unit = { - super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + super.validateDataFrameForMetricCalculation( + dataFrame, + domain, + predictionCol, + labelCol, + offsetColOption, + weightColOption) val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType val isPredictionTypeValid = predictionType match { case StructType(fields) if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && - fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) && + fields(1).dataType.asInstanceOf[StructType].fields.length == 2 => true - case ArrayType(DoubleType, _) => true - case ArrayType(FloatType, _) => true - case v if ExposeUtils.isMLVectorUDT(v) => true - case _: mllib.linalg.VectorUDT => true + case StructType(fields) if fields.forall(_.dataType == DoubleType) && fields.length == 2 => true + case DoubleType => true + case FloatType => true case _ => false } if (!isPredictionTypeValid) { - throw new IllegalArgumentException(s"The type of the prediction column '$predictionCol' is not valid. " + - "The prediction column must have the same type as a detailed_prediction column coming from the transform " + - "method of H2OMOJOModel descendant or a array type or vector of doubles. 
First item is must be 0.0 or 1.0" + - "representing negative or positive response. The other items must be probabilities to predict given probability" + - "classes.") + throw new IllegalArgumentException( + s"The type of the prediction column '$predictionCol' is not valid. " + + "The prediction column must have the same type as a detailed_prediction column coming from the transform " + + "method of H2OMOJOModel descendant or a array type or vector of doubles. Or the type must be " + + "FloatType/DoubleType where values represent probabilities of positive response.") } } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala index e7b75cdae0..05b6f85584 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala @@ -19,10 +19,11 @@ package ai.h2o.sparkling.ml.metrics import hex.ModelMetricsMultinomial.MetricBuilderMultinomial import hex.MultinomialAucType +import hex.genmodel.GenModel import org.apache.spark.{ExposeUtils, ml, mllib} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType} @MetricsDescription( @@ -42,9 +43,8 @@ object H2OMultinomialMetrics extends MetricCalculation { * @param domain Array of response classes. * @param predictionCol The name of prediction column. The prediction column must have the same type as * a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or - * a array type or vector of doubles. First item is must be 0.0, 1.0, 2.0 representing - * indexes of response classes. 
The other items must be probabilities to predict given probability - * classes. + * a array type or vector of doubles where particular arrays represent class probabilities. + * The order of probabilities must correspond to the order of labels in the passed domain. * @param labelCol The name of label column that contains actual values. * @param weightColOption The name of a weight column. * @param aucType Type of multinomial AUC/AUCPR calculation. Possible values: @@ -63,7 +63,7 @@ object H2OMultinomialMetrics extends MetricCalculation { labelCol: String = "label", weightColOption: Option[String] = None, aucType: String = "AUTO"): H2OMultinomialMetrics = { - validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, None, weightColOption) + validateDataFrameForMetricCalculation(dataFrame, domain, predictionCol, labelCol, None, weightColOption) val aucTypeEnum = MultinomialAucType.valueOf(aucType) val nclasses = domain.length val getMetricBuilder = @@ -109,20 +109,33 @@ object H2OMultinomialMetrics extends MetricCalculation { dataType match { case StructType(fields) if fields(0).dataType == StringType && fields(1).dataType.isInstanceOf[StructType] && - fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) && + fields(1).dataType.asInstanceOf[StructType].fields.length == domain.length => val predictionStructure = row.getStruct(0) val prediction = predictionStructure.getString(0) val index = domain.indexOf(prediction).toDouble val probabilities = predictionStructure.getStruct(1) Array(index) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) - case ArrayType(DoubleType, _) => row.getSeq[Double](0).toArray - case ArrayType(FloatType, _) => row.getSeq[Float](0).map(_.toDouble).toArray - case v if ExposeUtils.isMLVectorUDT(v) => row.getAs[ml.linalg.Vector](0).toDense.values - case _: mllib.linalg.VectorUDT => 
row.getAs[mllib.linalg.Vector](0).toDense.values + case StructType(fields) if fields.forall(_.dataType == DoubleType) && fields.length == domain.length => + val probabilities = row.getStruct(0).toSeq.map(_.asInstanceOf[Double]).toArray + probabilitiesToPredictedValues(probabilities) + case ArrayType(DoubleType, _) => probabilitiesToPredictedValues(row.getSeq[Double](0).toArray) + case ArrayType(FloatType, _) => probabilitiesToPredictedValues(row.getSeq[Float](0).map(_.toDouble).toArray) + case v if ExposeUtils.isMLVectorUDT(v) => + probabilitiesToPredictedValues(row.getAs[ml.linalg.Vector](0).toDense.values) + case _: mllib.linalg.VectorUDT => + probabilitiesToPredictedValues(row.getAs[mllib.linalg.Vector](0).toDense.values) } } + private def probabilitiesToPredictedValues(probabilities: Array[Double]): Array[Double] = { + val result = new Array[Double](probabilities.length + 1) + Array.copy(probabilities, 0, result, 1, probabilities.length) + result(0) = GenModel.getPredictionMultinomial(result, null, result).toDouble + result + } + override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = { val label = row.getString(1) domain.indexOf(label).toDouble @@ -130,16 +143,26 @@ object H2OMultinomialMetrics extends MetricCalculation { override protected def validateDataFrameForMetricCalculation( dataFrame: DataFrame, + domain: Array[String], predictionCol: String, labelCol: String, offsetColOption: Option[String], weightColOption: Option[String]): Unit = { - super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + super.validateDataFrameForMetricCalculation( + dataFrame, + domain, + predictionCol, + labelCol, + offsetColOption, + weightColOption) val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType val isPredictionTypeValid = predictionType match { case StructType(fields) if fields(0).dataType == StringType && 
fields(1).dataType.isInstanceOf[StructType] && - fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) => + fields(1).dataType.asInstanceOf[StructType].fields.forall(_.dataType == DoubleType) && + fields(1).dataType.asInstanceOf[StructType].fields.length == domain.length => + true + case StructType(fields) if fields.forall(_.dataType == DoubleType) && fields.length == domain.length => true case ArrayType(DoubleType, _) => true case ArrayType(FloatType, _) => true @@ -150,9 +173,8 @@ object H2OMultinomialMetrics extends MetricCalculation { if (!isPredictionTypeValid) { throw new IllegalArgumentException(s"The type of the prediction column '$predictionCol' is not valid. " + "The prediction column must have the same type as a detailed_prediction column coming from the transform " + - "method of H2OMOJOModel descendant or a array type or vector of doubles. First item is must be 0.0, 1.0, 2.0 " + - "representing indexes of response classes. The other items must be probabilities to predict given " + - "probability classes.") + "method of H2OMOJOModel descendant or a array type or vector of doubles where particular arrays represent " + + "class probabilities. 
The order of probabilities must correspond to the order of labels in the passed domain.") } } } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala index 54e7ac3209..0bdd1a70ed 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala @@ -48,11 +48,11 @@ object H2ORegressionMetrics extends MetricCalculation { */ def calculate( dataFrame: DataFrame, - predictionCol: String = "prediction", + predictionCol: String = "detailed_prediction", labelCol: String = "label", weightColOption: Option[String] = None, offsetColOption: Option[String] = None): H2ORegressionMetrics = { - validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) + validateDataFrameForMetricCalculation(dataFrame, null, predictionCol, labelCol, offsetColOption, weightColOption) val getMetricBuilder = () => new MetricBuilderRegression(DistributionFactory.getDistribution(DistributionFamily.AUTO)) val castedLabelDF = dataFrame.withColumn(labelCol, col(labelCol) cast DoubleType) @@ -87,12 +87,18 @@ object H2ORegressionMetrics extends MetricCalculation { override protected def validateDataFrameForMetricCalculation( dataFrame: DataFrame, + domain: Array[String], predictionCol: String, labelCol: String, offsetColOption: Option[String], weightColOption: Option[String]): Unit = { - super.validateDataFrameForMetricCalculation(dataFrame, predictionCol, labelCol, offsetColOption, weightColOption) - + super.validateDataFrameForMetricCalculation( + dataFrame, + domain, + predictionCol, + labelCol, + offsetColOption, + weightColOption) val predictionType = dataFrame.schema.fields.find(_.name == predictionCol).get.dataType val isPredictionTypeValid = predictionType match { case StructType(fields) if fields.head.dataType == DoubleType => 
true diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 169b054cc1..6d6274978d 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -30,6 +30,7 @@ trait MetricCalculation { protected def validateDataFrameForMetricCalculation( dataFrame: DataFrame, + domain: Array[String], predictionCol: String, labelCol: String, offsetColOption: Option[String], From 45bc54edeaf9bc123c17ac36f608f010bbe97988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 12 Apr 2022 15:55:26 +0200 Subject: [PATCH 33/37] Fix Python test --- py/tests/unit/with_runtime_sparkling/test_metric_calculation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py index 19d761eb4e..8af4f13e06 100644 --- a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py +++ b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py @@ -43,5 +43,5 @@ def testMultinomialMetricsCalculation(irisDataset): domain = mojo.getDomainValues()["class"] metrics = H2OMultinomialMetrics.calculate(mojo.transform(irisDataset), domain, labelCol = "class") assert metrics is not None - assert metrics.getAUC() > 0.5 + assert metrics.getLogloss() > 0.0 assert metrics.getConfusionMatrix().count() > 0 From 196529236482cccd417103fffb9c19911a637032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 13 Apr 2022 14:31:21 +0200 Subject: [PATCH 34/37] Fix R test --- r/src/tests/testthat/testMetricCalculation.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/tests/testthat/testMetricCalculation.R b/r/src/tests/testthat/testMetricCalculation.R index 8d308b06ad..d76c44d45d 100644 --- 
a/r/src/tests/testthat/testMetricCalculation.R +++ b/r/src/tests/testthat/testMetricCalculation.R @@ -130,14 +130,14 @@ test_that("test calculation of multinomial metrics", { metrics <- H2OMultinomialMetrics.calculate(predictions, domainValues[["class"]], labelCol = "class") - aucValue <- metrics$getAUC() + logloss <- metrics$getLogloss() scoringTime <- metrics$getScoringTime() confusionMatrix <- metrics$getConfusionMatrix() confusionMatrixFrame <- dplyr::tally(confusionMatrix) confusionMatrixCount <- as.double(dplyr::collect(confusionMatrixFrame)[[1]]) - expect_true(aucValue > 0.6) + expect_true(logloss > 0.0) expect_true(scoringTime > 0) expect_true(confusionMatrixCount > 0) }) From 7e8f954a0f374c15cbf8889788741f8611ecd9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Thu, 14 Apr 2022 19:56:06 +0200 Subject: [PATCH 35/37] Address review comments from Bartosz --- .../ml/metrics/BinomialMetricsTestSuite.scala | 16 ++++++++-------- .../ml/metrics/MultinomialMetricsTestSuite.scala | 8 ++++---- .../ml/metrics/H2OBinomialMetrics.scala | 6 ++++-- .../sparkling/ml/metrics/MetricCalculation.scala | 9 ++++++++- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index 5a36495647..d2ce2ca748 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -57,14 +57,14 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val binomialObject = metricsObject.asInstanceOf[H2OBinomialMetrics] - binomialObject.getConfusionMatrix().count() > 0 - binomialObject.getConfusionMatrix().columns.length > 0 - 
binomialObject.getGainsLiftTable().count() > 0 - binomialObject.getGainsLiftTable().columns.length > 0 - binomialObject.getMaxCriteriaAndMetricScores().count() > 0 - binomialObject.getMaxCriteriaAndMetricScores().columns.length > 0 - binomialObject.getThresholdsAndMetricScores().count() > 0 - binomialObject.getThresholdsAndMetricScores().columns.length > 0 + binomialObject.getConfusionMatrix().isEmpty should be (false) + binomialObject.getConfusionMatrix().columns should not be empty + binomialObject.getGainsLiftTable().isEmpty should be (false) + binomialObject.getGainsLiftTable().columns should not be empty + binomialObject.getMaxCriteriaAndMetricScores().isEmpty should be (false) + binomialObject.getMaxCriteriaAndMetricScores().columns should not be empty + binomialObject.getThresholdsAndMetricScores().isEmpty should be (false) + binomialObject.getThresholdsAndMetricScores().columns should not be empty } private def assertMetrics( diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index 15a35ba40d..31166698ca 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -54,10 +54,10 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val multinomialObject = metricsObject.asInstanceOf[H2OMultinomialMetrics] - multinomialObject.getConfusionMatrix().count() > 0 - multinomialObject.getConfusionMatrix().columns.length > 0 - multinomialObject.getHitRatioTable().count() > 0 - multinomialObject.getHitRatioTable().columns.length > 0 + multinomialObject.getConfusionMatrix().isEmpty should be (false) + multinomialObject.getConfusionMatrix().columns should not be empty + 
multinomialObject.getHitRatioTable().isEmpty should be (false) + multinomialObject.getHitRatioTable().columns should not be empty } private def assertMetrics( diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala index 2d78d7706c..6eed9819b3 100644 --- a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala @@ -94,6 +94,8 @@ object H2OBinomialMetrics extends MetricCalculation { calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol)) } + private val unusedLabelIndex: Double = -1.0 + override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = { dataType match { case StructType(fields) @@ -107,14 +109,14 @@ object H2OBinomialMetrics extends MetricCalculation { Array(index) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) case StructType(fields) if fields.forall(_.dataType == DoubleType) && fields.length == 2 => val probabilities = row.getStruct(0) - Array(-1.0) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) + Array(unusedLabelIndex) ++ probabilities.toSeq.map(_.asInstanceOf[Double]) case DoubleType => probabilityToArray(row.getDouble(0)) case FloatType => probabilityToArray(row.getFloat(0).toDouble) } } private def probabilityToArray(probability: Double): Array[Double] = { - Array[Double](-1 /* unused */, 1 - probability, probability) + Array[Double](unusedLabelIndex, 1 - probability, probability) } override protected def getActualValue(dataType: DataType, domain: Array[String], row: Row): Double = { diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 6d6274978d..3165dfdc06 100644 --- 
a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -124,7 +124,14 @@ trait MetricCalculation { .reduce((f, s) => { f.reduce(s); f }) filledMetricsBuilder.postGlobal() - val metrics = filledMetricsBuilder.makeModelMetrics(null, null, null, null) + + // Setting parameters of makeModelMetrics to null since they are required only by H2O runtime + val model = null + val frame = null + val adaptedFrame = null + val predictions = null + val metrics = filledMetricsBuilder.makeModelMetrics(model, frame, adaptedFrame, predictions) + val schema = metricsToSchema(metrics) val json = schema.toJsonString new GsonBuilder().create().fromJson(json, classOf[JsonObject]) From 2be8b85316837829bd42286afc413be7572bbfcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Tue, 19 Apr 2022 15:05:10 +0200 Subject: [PATCH 36/37] spotless apply --- .../sparkling/ml/metrics/BinomialMetricsTestSuite.scala | 8 ++++---- .../ml/metrics/MultinomialMetricsTestSuite.scala | 4 ++-- .../ai/h2o/sparkling/ml/metrics/MetricCalculation.scala | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index d2ce2ca748..d952a9bc88 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -57,13 +57,13 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val binomialObject = metricsObject.asInstanceOf[H2OBinomialMetrics] - binomialObject.getConfusionMatrix().isEmpty should be (false) + binomialObject.getConfusionMatrix().isEmpty should be(false) 
binomialObject.getConfusionMatrix().columns should not be empty - binomialObject.getGainsLiftTable().isEmpty should be (false) + binomialObject.getGainsLiftTable().isEmpty should be(false) binomialObject.getGainsLiftTable().columns should not be empty - binomialObject.getMaxCriteriaAndMetricScores().isEmpty should be (false) + binomialObject.getMaxCriteriaAndMetricScores().isEmpty should be(false) binomialObject.getMaxCriteriaAndMetricScores().columns should not be empty - binomialObject.getThresholdsAndMetricScores().isEmpty should be (false) + binomialObject.getThresholdsAndMetricScores().isEmpty should be(false) binomialObject.getThresholdsAndMetricScores().columns should not be empty } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index 31166698ca..fa3cb1a1ba 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -54,9 +54,9 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val multinomialObject = metricsObject.asInstanceOf[H2OMultinomialMetrics] - multinomialObject.getConfusionMatrix().isEmpty should be (false) + multinomialObject.getConfusionMatrix().isEmpty should be(false) multinomialObject.getConfusionMatrix().columns should not be empty - multinomialObject.getHitRatioTable().isEmpty should be (false) + multinomialObject.getHitRatioTable().isEmpty should be(false) multinomialObject.getHitRatioTable().columns should not be empty } diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala index 3165dfdc06..d207b3a277 100644 --- 
a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala +++ b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/MetricCalculation.scala @@ -129,7 +129,7 @@ trait MetricCalculation { val model = null val frame = null val adaptedFrame = null - val predictions = null + val predictions = null val metrics = filledMetricsBuilder.makeModelMetrics(model, frame, adaptedFrame, predictions) val schema = metricsToSchema(metrics) From ea696febf45b22392f5a91da5db57615984f2628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Novotn=C3=BD?= Date: Wed, 20 Apr 2022 15:39:46 +0200 Subject: [PATCH 37/37] dataframe.isEmpty is not present in spark 2.2 --- .../sparkling/ml/metrics/BinomialMetricsTestSuite.scala | 8 ++++---- .../ml/metrics/MultinomialMetricsTestSuite.scala | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala index d952a9bc88..b34a5183f5 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/BinomialMetricsTestSuite.scala @@ -57,13 +57,13 @@ class BinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OTest metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val binomialObject = metricsObject.asInstanceOf[H2OBinomialMetrics] - binomialObject.getConfusionMatrix().isEmpty should be(false) + binomialObject.getConfusionMatrix().count() should be > (0L) binomialObject.getConfusionMatrix().columns should not be empty - binomialObject.getGainsLiftTable().isEmpty should be(false) + binomialObject.getGainsLiftTable().count() should be > (0L) binomialObject.getGainsLiftTable().columns should not be empty - binomialObject.getMaxCriteriaAndMetricScores().isEmpty should be(false) + 
binomialObject.getMaxCriteriaAndMetricScores().count() should be > (0L) binomialObject.getMaxCriteriaAndMetricScores().columns should not be empty - binomialObject.getThresholdsAndMetricScores().isEmpty should be(false) + binomialObject.getThresholdsAndMetricScores().count() should be > (0L) binomialObject.getThresholdsAndMetricScores().columns should not be empty } diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala index fa3cb1a1ba..5398f2a252 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/metrics/MultinomialMetricsTestSuite.scala @@ -54,9 +54,9 @@ class MultinomialMetricsTestSuite extends FunSuite with Matchers with SharedH2OT metricsObject.isInstanceOf[T] should be(true) MetricsAssertions.assertMetricsObjectAgainstMetricsMap(metricsObject, metrics) val multinomialObject = metricsObject.asInstanceOf[H2OMultinomialMetrics] - multinomialObject.getConfusionMatrix().isEmpty should be(false) + multinomialObject.getConfusionMatrix().count() should be > (0L) multinomialObject.getConfusionMatrix().columns should not be empty - multinomialObject.getHitRatioTable().isEmpty should be(false) + multinomialObject.getHitRatioTable().count() should be > (0L) multinomialObject.getHitRatioTable().columns should not be empty }