Skip to content

Commit

Permalink
Check for dupes and drop empties in new validate transform (#145)
Browse files Browse the repository at this point in the history
  • Loading branch information
siddharth-krishna authored Dec 1, 2023
1 parent c3d8eec commit 06cd28f
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 16 deletions.
2 changes: 1 addition & 1 deletion times_reader/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def convert_xl_to_times(
transform_list = [
transforms.normalize_tags_columns,
transforms.remove_fill_tables,
transforms.remove_empty_tables,
transforms.validate_input_tables,
lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
transforms.remove_tables_with_formulas, # slow
transforms.normalize_column_aliases,
Expand Down
38 changes: 23 additions & 15 deletions times_reader/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,6 @@ def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX

df = table.dataframe.drop(comment_cols, axis=1)
df.reset_index(drop=True, inplace=True)

# TODO: should we move the code below to a separate transform?
seen = set()
dupes = [x for x in df.columns if x in seen or seen.add(x)]
if len(dupes) > 0:
print(
f"WARNING: Duplicate columns in {table.range}, {table.sheetname},"
f" {table.filename}: {','.join(dupes)}"
)
return replace(table, dataframe=df)


Expand Down Expand Up @@ -138,15 +129,13 @@ def has_formulas(table):
return [table for table in tables if not has_formulas(table)]


def remove_empty_tables(
def validate_input_tables(
config: datatypes.Config,
tables: List[datatypes.EmbeddedXlTable],
) -> List[datatypes.EmbeddedXlTable]:
"""
Return a modified copy of 'tables' where empty tables have been deleted from the list.
:param tables: List of tables in EmbeddedXlTable format.
:return: List of non-empty tables in EmbeddedXlTable format.
Perform some basic validation (tag names are valid, no duplicate column labels), and
remove empty tables (for recognized tags).
"""

check_list = [
Expand Down Expand Up @@ -181,10 +170,29 @@ def remove_empty_tables(
def discard(table):
    """Decide whether ``table`` should be left out of the returned list.

    A table whose tag appears in ``check_list`` is discarded only when its
    dataframe has zero rows. A table tagged ``datatypes.Tag.unitconversion``
    is always discarded, and a message is printed. Every other table is kept.
    """
    tag = table.tag
    if tag in check_list:
        # shape[0] is the number of rows; a row count of 0 means "discard".
        row_count = table.dataframe.shape[0]
        return not row_count
    if tag == datatypes.Tag.unitconversion:
        print("Dropping ~UNITCONVERSION table")
        return True
    return False

return [table for table in tables if not discard(table)]
result = []
for table in tables:
if not datatypes.Tag.has_tag(table.tag.split(":")[0]):
print(f"WARNING: Dropping table with unrecognized tag {table.tag}")
continue
if discard(table):
continue
# Check for duplicate columns:
seen = set()
dupes = [x for x in table.dataframe.columns if x in seen or seen.add(x)]
if len(dupes) > 0:
print(
f"WARNING: Duplicate columns in {table.range}, {table.sheetname},"
f" {table.filename}: {','.join(dupes)}"
)
result.append(table)
return result


def normalize_tags_columns(
Expand Down

0 comments on commit 06cd28f

Please sign in to comment.