Skip to content

Commit

Permalink
Merge branch 'main' into olex/discard-more-empties
Browse files Browse the repository at this point in the history
  • Loading branch information
olejandro authored Feb 28, 2024
2 parents b8080cf + eac45c0 commit 10ab962
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 0 deletions.
1 change: 1 addition & 0 deletions xl2times/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def convert_xl_to_times(
transforms.remove_tables_with_formulas, # slow
transforms.normalize_column_aliases,
transforms.remove_comment_rows,
transforms.revalidate_input_tables,
transforms.process_regions,
transforms.remove_exreg_cols,
transforms.generate_dummy_processes,
Expand Down
12 changes: 12 additions & 0 deletions xl2times/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ class Config:
veda_attr_defaults: Dict[str, Dict[str, list]]
# Known columns for each tag
known_columns: Dict[Tag, Set[str]]
# Query columns for each tag
query_columns: Dict[Tag, Set[str]]
# Required columns for each tag
required_columns: Dict[Tag, Set[str]]
# Names of regions to include in the model; if empty, all regions are included.
filter_regions: Set[str]
times_sets: Dict[str, List[str]]
Expand All @@ -214,6 +218,7 @@ def __init__(
self.discard_if_empty,
self.query_columns,
self.known_columns,
self.required_columns,
) = Config._read_veda_tags_info(veda_tags_file)
self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(
veda_attr_defaults_file
Expand Down Expand Up @@ -371,6 +376,7 @@ def _read_veda_tags_info(
Iterable[Tag],
Dict[Tag, Set[str]],
Dict[Tag, Set[str]],
Dict[Tag, Set[str]],
]:
def to_tag(s: str) -> Tag:
# The file stores the tag name in lowercase, and without the ~
Expand All @@ -393,6 +399,7 @@ def to_tag(s: str) -> Tag:
discard_if_empty = []
query_cols = defaultdict(set)
known_cols = defaultdict(set)
required_cols = defaultdict(set)

for tag_info in veda_tags_info:
tag_name = to_tag(tag_info["tag_name"])
Expand All @@ -415,6 +422,10 @@ def to_tag(s: str) -> Tag:

if valid_field["query_field"]:
query_cols[tag_name].add(field_name)

if valid_field["remove_any_row_if_absent"]:
required_cols[tag_name].add(field_name)

known_cols[tag_name].add(field_name)

for valid_field_name in valid_field_names:
Expand Down Expand Up @@ -443,6 +454,7 @@ def to_tag(s: str) -> Tag:
discard_if_empty,
query_cols,
known_cols,
required_cols,
)

@staticmethod
Expand Down
41 changes: 41 additions & 0 deletions xl2times/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,47 @@ def discard(table):
return result


def revalidate_input_tables(
    config: datatypes.Config,
    tables: List[datatypes.EmbeddedXlTable],
    model: datatypes.TimesModel,
) -> List[datatypes.EmbeddedXlTable]:
    """
    Perform further validation of input tables by checking whether required columns
    are present and non-empty.

    A table is dropped (with a warning) when, for its tag:
      * any required column is absent from the table's dataframe, or
      * any required column contains only missing (NaN/None) values.

    Tables whose tag has no required columns are passed through untouched.

    Parameters:
        config: Provides ``required_columns``, a mapping from tag to the set of
            column names that must be present and populated.
        tables: Input tables to validate.
        model: Unused here; accepted so the function has the same signature as
            the other transforms.

    Returns:
        A new list with the tables that passed validation, in their original order.
    """
    result = []
    for table in tables:
        # Only the part of the tag before the first ":" is used for the lookup.
        tag = datatypes.Tag(table.tag.split(":")[0])
        required_cols = config.required_columns[tag]
        if not required_cols:
            # Nothing to validate for this tag.
            result.append(table)
            continue

        df = table.dataframe

        # Drop table if any column in required columns is missing
        missing_cols = required_cols - set(df.columns)
        if missing_cols:
            logger.warning(
                f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}"
                f" in file {table.filename} due to missing required columns: {missing_cols}"
            )
            continue

        # Check whether any of the required columns is empty.
        # Selecting with a list always yields a DataFrame, so a column label that
        # occurs more than once is handled too: it counts as empty only if every
        # column carrying that label is entirely missing.
        empty_required_cols = {
            c for c in required_cols if df.loc[:, [c]].isna().to_numpy().all()
        }
        if empty_required_cols:
            logger.warning(
                f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}"
                f" in file {table.filename} due to empty required columns: {empty_required_cols}"
            )
            continue

        # Append table to the list if reached this far
        result.append(table)

    return result


def normalize_tags_columns(
config: datatypes.Config,
tables: List[datatypes.EmbeddedXlTable],
Expand Down

0 comments on commit 10ab962

Please sign in to comment.