Skip to content

Commit

Permalink
calculate all column statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
usefulalgorithm committed Jan 9, 2024
1 parent a20008b commit 5a87d30
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 31 deletions.
4 changes: 2 additions & 2 deletions metaphor/hive/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ auth_user: <auth user for hiveserver>
password: <password for the auth user>
```

#### `collect_stats`
#### Collecting Table and Column Statistics

```yaml
collect_stats: <collect_stats>
```

Whether to collect table and column level statistics. Default is `False`.
`collect_stats` controls whether table-level and column-level statistics are collected. The default is `False`.

## Testing

Expand Down
2 changes: 2 additions & 0 deletions metaphor/hive/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ class HiveRunConfig(BaseConfig):

collect_stats: bool = False

# TODO: add configuration options that control which column statistics are gathered.

@property
def connect_kwargs(self) -> Dict[str, Any]:
kwargs = {
Expand Down
41 changes: 29 additions & 12 deletions metaphor/hive/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,20 @@ def _extract_table_stats(
data_size_bytes=table_size,
record_count=num_rows,
)

numeric_types = {
"TINYINT",
"SMALLINT",
"INT",
"INTEGER",
"BIGINT",
"FLOAT",
"DOUBLE",
"DOUBLE PRECISION",
"DECIMAL",
"NUMERIC",
}

field_statistics: List[FieldStatistics] = []
for field in fields:
cursor.execute(
Expand All @@ -146,31 +160,34 @@ def _extract_table_stats(
raw_field_statistics: Dict[str, Any] = {
"fieldPath": field.field_path,
}

for row in cursor:
field_stats_key = stats_col_names.get(row[0])
if field_stats_key:
try:
raw_field_statistics[field_stats_key] = float(row[1])
except Exception:
numeric_types = {
"TINYINT",
"SMALLINT",
"INT",
"INTEGER",
"BIGINT",
"FLOAT",
"DOUBLE",
"DOUBLE PRECISION",
"DECIMAL",
"NUMERIC",
}
if (
field.native_type
and field.native_type.upper() in numeric_types
):
logger.warning(

Check warning on line 174 in metaphor/hive/extractor.py

View check run for this annotation

Codecov / codecov/patch

metaphor/hive/extractor.py#L174

Added line #L174 was not covered by tests
f"Cannot find {field_stats_key} for field {field.field_path}"
)
if field.native_type and field.native_type.upper() in numeric_types:
try:
cursor.execute(
f"select std({field.field_path}), avg({field.field_path}) from {database}.{table}"
)
std_dev, avg = next(cursor)
raw_field_statistics.update(
{"stdDev": float(std_dev), "average": float(avg)}
)
except Exception:
logger.exception(

Check warning on line 187 in metaphor/hive/extractor.py

View check run for this annotation

Codecov / codecov/patch

metaphor/hive/extractor.py#L186-L187

Added lines #L186 - L187 were not covered by tests
f"Cannot calculate std and / or avg for field {field.field_path}"
)

field_statistics.append(FieldStatistics.from_dict(raw_field_statistics))
if field_statistics:
dataset_statistics.field_statistics = field_statistics
Expand Down
68 changes: 51 additions & 17 deletions tests/hive/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
"dataSizeBytes": 21.0,
"fieldStatistics": [
{
"average": 5.5,
"distinctValueCount": 10.0,
"fieldPath": "id",
"maxValue": 10.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 2.8722813232690143
}
],
"recordCount": 10.0
Expand Down Expand Up @@ -63,23 +65,27 @@
"dataSizeBytes": 926.0,
"fieldStatistics": [
{
"average": 203.0,
"distinctValueCount": 5.0,
"fieldPath": "deptno",
"maxValue": 205.0,
"minValue": 201.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 1.4142135623730951
},
{
"distinctValueCount": 5.0,
"fieldPath": "deptname",
"nullValueCount": 0.0
},
{
"average": 503.0,
"distinctValueCount": 5.0,
"fieldPath": "locationid",
"maxValue": 505.0,
"minValue": 501.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 1.4142135623730951
}
],
"recordCount": 5.0
Expand Down Expand Up @@ -124,30 +130,36 @@
"dataSizeBytes": 1594.0,
"fieldStatistics": [
{
"average": 113.0,
"distinctValueCount": 25.0,
"fieldPath": "empid",
"maxValue": 125.0,
"minValue": 101.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 7.211102550927978
},
{
"average": 203.0,
"distinctValueCount": 5.0,
"fieldPath": "deptno",
"maxValue": 205.0,
"minValue": 201.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 1.4142135623730951
},
{
"distinctValueCount": 25.0,
"fieldPath": "name",
"nullValueCount": 0.0
},
{
"average": 58000.0,
"distinctValueCount": 13.0,
"fieldPath": "salary",
"maxValue": 65000.0,
"minValue": 50000.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 3577.7087639996635
},
{
"distinctValueCount": 25.0,
Expand Down Expand Up @@ -189,11 +201,13 @@
"dataSizeBytes": 624.0,
"fieldStatistics": [
{
"average": 113.61538461538461,
"distinctValueCount": 13.0,
"fieldPath": "empid",
"maxValue": 125.0,
"minValue": 102.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 7.07692307692315
},
{
"distinctValueCount": 5.0,
Expand Down Expand Up @@ -268,11 +282,13 @@
"dataSizeBytes": 139.0,
"fieldStatistics": [
{
"average": 5.5,
"distinctValueCount": 10.0,
"fieldPath": "id",
"maxValue": 10.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 2.8722813232690143
},
{
"distinctValueCount": 9.0,
Expand Down Expand Up @@ -314,25 +330,31 @@
"dataSizeBytes": 712.0,
"fieldStatistics": [
{
"average": 50.5,
"distinctValueCount": 100.0,
"fieldPath": "id",
"maxValue": 100.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 28.86607004772212
},
{
"average": 5.5,
"distinctValueCount": 10.0,
"fieldPath": "ship_type_id",
"maxValue": 10.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 2.8722813232690143
},
{
"average": 5.5,
"distinctValueCount": 10.0,
"fieldPath": "crew_size",
"maxValue": 10.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 2.8722813232690143
}
],
"recordCount": 100.0
Expand Down Expand Up @@ -369,25 +391,31 @@
"dataSizeBytes": 8913.0,
"fieldStatistics": [
{
"average": 500.5,
"distinctValueCount": 987.0,
"fieldPath": "id",
"maxValue": 1000.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 288.6749902572095
},
{
"average": 50.5,
"distinctValueCount": 100.0,
"fieldPath": "ship_id",
"maxValue": 100.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 28.86607004772212
},
{
"average": 5.5,
"distinctValueCount": 10.0,
"fieldPath": "admiral_id",
"maxValue": 10.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 2.8722813232690143
}
],
"recordCount": 1000.0
Expand Down Expand Up @@ -428,25 +456,31 @@
"dataSizeBytes": 1979173.0,
"fieldStatistics": [
{
"average": 462.48475,
"distinctValueCount": 940.0,
"fieldPath": "userid",
"maxValue": 943.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 266.61308705207534
},
{
"average": 425.53013,
"distinctValueCount": 1720.0,
"fieldPath": "movieid",
"maxValue": 1682.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 330.7967023296682
},
{
"average": 3.52986,
"distinctValueCount": 5.0,
"fieldPath": "rating",
"maxValue": 5.0,
"minValue": 1.0,
"nullValueCount": 0.0
"nullValueCount": 0.0,
"stdDev": 1.1256679707622494
},
{
"distinctValueCount": 49262.0,
Expand Down

0 comments on commit 5a87d30

Please sign in to comment.