From 5a87d3039bc4c6ff8a4ea05b9516ecc7d3a30233 Mon Sep 17 00:00:00 2001 From: Tsung-Ju Lii Date: Wed, 10 Jan 2024 02:02:11 +0800 Subject: [PATCH] calculate all column statistics --- metaphor/hive/README.md | 4 +-- metaphor/hive/config.py | 2 ++ metaphor/hive/extractor.py | 41 ++++++++++++++++------- tests/hive/expected.json | 68 ++++++++++++++++++++++++++++---------- 4 files changed, 84 insertions(+), 31 deletions(-) diff --git a/metaphor/hive/README.md b/metaphor/hive/README.md index d2a2d491..93f68cd3 100644 --- a/metaphor/hive/README.md +++ b/metaphor/hive/README.md @@ -32,13 +32,13 @@ auth_user: password: ``` -#### `collect_stats` +#### Collecting Table and Column Statistics ```yaml collect_stats: ``` -Whether to collect table and column level statistics. Default is `False`. +`collect_stats` controls whether to collect table and column level statistics. Default is `False`. ## Testing diff --git a/metaphor/hive/config.py b/metaphor/hive/config.py index 64949635..cb9ad14b 100644 --- a/metaphor/hive/config.py +++ b/metaphor/hive/config.py @@ -15,6 +15,8 @@ class HiveRunConfig(BaseConfig): collect_stats: bool = False + # TODO: logic for controlling what column statistics to gather. + @property def connect_kwargs(self) -> Dict[str, Any]: kwargs = { diff --git a/metaphor/hive/extractor.py b/metaphor/hive/extractor.py index cfdcbc4b..9f12baf9 100644 --- a/metaphor/hive/extractor.py +++ b/metaphor/hive/extractor.py @@ -132,6 +132,20 @@ def _extract_table_stats( data_size_bytes=table_size, record_count=num_rows, ) + + numeric_types = { + "TINYINT", + "SMALLINT", + "INT", + "INTEGER", + "BIGINT", + "FLOAT", + "DOUBLE", + "DOUBLE PRECISION", + "DECIMAL", + "NUMERIC", + } + field_statistics: List[FieldStatistics] = [] for field in fields: cursor.execute( @@ -146,24 +160,13 @@ def _extract_table_stats( raw_field_statistics: Dict[str, Any] = { "fieldPath": field.field_path, } + for row in cursor: field_stats_key = stats_col_names.get(row[0]) if field_stats_key: try: raw_field_statistics[field_stats_key] = float(row[1]) except Exception: - numeric_types = { - "TINYINT", - "SMALLINT", - "INT", - "INTEGER", - "BIGINT", - "FLOAT", - "DOUBLE", - "DOUBLE PRECISION", - "DECIMAL", - "NUMERIC", - } if ( field.native_type and field.native_type.upper() in numeric_types @@ -171,6 +174,20 @@ def _extract_table_stats( logger.warning( f"Cannot find {field_stats_key} for field {field.field_path}" ) + if field.native_type and field.native_type.upper() in numeric_types: + try: + cursor.execute( + f"select std({field.field_path}), avg({field.field_path}) from {database}.{table}" + ) + std_dev, avg = next(cursor) + raw_field_statistics.update( + {"stdDev": float(std_dev), "average": float(avg)} + ) + except Exception: + logger.exception( + f"Cannot calculate std and / or avg for field {field.field_path}" + ) + field_statistics.append(FieldStatistics.from_dict(raw_field_statistics)) if field_statistics: dataset_statistics.field_statistics = field_statistics diff --git a/tests/hive/expected.json b/tests/hive/expected.json index 4bd8806c..ea66860d 100644 --- a/tests/hive/expected.json +++ b/tests/hive/expected.json @@ -22,11 +22,13 @@ "dataSizeBytes": 21.0, "fieldStatistics": [ { + "average": 5.5, "distinctValueCount": 10.0, "fieldPath": "id", "maxValue": 10.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 2.8722813232690143 } ], "recordCount": 10.0 @@ -63,11 +65,13 @@ "dataSizeBytes": 926.0, "fieldStatistics": [ { + "average": 203.0, "distinctValueCount": 5.0, "fieldPath": "deptno", "maxValue": 205.0, "minValue": 201.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 1.4142135623730951 }, { "distinctValueCount": 5.0, @@ -75,11 +79,13 @@ "nullValueCount": 0.0 }, { + "average": 503.0, "distinctValueCount": 5.0, "fieldPath": "locationid", "maxValue": 505.0, "minValue": 501.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 1.4142135623730951 } ], "recordCount": 5.0 @@ -124,18 +130,22 @@ "dataSizeBytes": 1594.0, "fieldStatistics": [ { + "average": 113.0, "distinctValueCount": 25.0, "fieldPath": "empid", "maxValue": 125.0, "minValue": 101.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 7.211102550927978 }, { + "average": 203.0, "distinctValueCount": 5.0, "fieldPath": "deptno", "maxValue": 205.0, "minValue": 201.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 1.4142135623730951 }, { "distinctValueCount": 25.0, @@ -143,11 +153,13 @@ "nullValueCount": 0.0 }, { + "average": 58000.0, "distinctValueCount": 13.0, "fieldPath": "salary", "maxValue": 65000.0, "minValue": 50000.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 3577.7087639996635 }, { "distinctValueCount": 25.0, @@ -189,11 +201,13 @@ "dataSizeBytes": 624.0, "fieldStatistics": [ { + "average": 113.61538461538461, "distinctValueCount": 13.0, "fieldPath": "empid", "maxValue": 125.0, "minValue": 102.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 7.07692307692315 }, { "distinctValueCount": 5.0, @@ -268,11 +282,13 @@ "dataSizeBytes": 139.0, "fieldStatistics": [ { + "average": 5.5, "distinctValueCount": 10.0, "fieldPath": "id", "maxValue": 10.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 2.8722813232690143 }, { "distinctValueCount": 9.0, @@ -314,25 +330,31 @@ "dataSizeBytes": 712.0, "fieldStatistics": [ { + "average": 50.5, "distinctValueCount": 100.0, "fieldPath": "id", "maxValue": 100.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 28.86607004772212 }, { + "average": 5.5, "distinctValueCount": 10.0, "fieldPath": "ship_type_id", "maxValue": 10.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 2.8722813232690143 }, { + "average": 5.5, "distinctValueCount": 10.0, "fieldPath": "crew_size", "maxValue": 10.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 2.8722813232690143 } ], "recordCount": 100.0 @@ -369,25 +391,31 @@ "dataSizeBytes": 8913.0, "fieldStatistics": [ { + "average": 500.5, "distinctValueCount": 987.0, "fieldPath": "id", "maxValue": 1000.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 288.6749902572095 }, { + "average": 50.5, "distinctValueCount": 100.0, "fieldPath": "ship_id", "maxValue": 100.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 28.86607004772212 }, { + "average": 5.5, "distinctValueCount": 10.0, "fieldPath": "admiral_id", "maxValue": 10.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 2.8722813232690143 } ], "recordCount": 1000.0 @@ -428,25 +456,31 @@ "dataSizeBytes": 1979173.0, "fieldStatistics": [ { + "average": 462.48475, "distinctValueCount": 940.0, "fieldPath": "userid", "maxValue": 943.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 266.61308705207534 }, { + "average": 425.53013, "distinctValueCount": 1720.0, "fieldPath": "movieid", "maxValue": 1682.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 330.7967023296682 }, { + "average": 3.52986, "distinctValueCount": 5.0, "fieldPath": "rating", "maxValue": 5.0, "minValue": 1.0, - "nullValueCount": 0.0 + "nullValueCount": 0.0, + "stdDev": 1.1256679707622494 }, { "distinctValueCount": 49262.0,