From 06cd28f327f1d76bcf92c3698dd4db0c843654d2 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna <siddharth-krishna@users.noreply.github.com>
Date: Fri, 1 Dec 2023 16:41:23 +0000
Subject: [PATCH] Check for dupes and drop empties in new validate transform
 (#145)

---
 times_reader/__main__.py   |  2 +-
 times_reader/transforms.py | 38 +++++++++++++++++++++++---------------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/times_reader/__main__.py b/times_reader/__main__.py
index 43dd924..9ffd7c4 100644
--- a/times_reader/__main__.py
+++ b/times_reader/__main__.py
@@ -56,7 +56,7 @@ def convert_xl_to_times(
     transform_list = [
         transforms.normalize_tags_columns,
         transforms.remove_fill_tables,
-        transforms.remove_empty_tables,
+        transforms.validate_input_tables,
         lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
         transforms.remove_tables_with_formulas,  # slow
         transforms.normalize_column_aliases,
diff --git a/times_reader/transforms.py b/times_reader/transforms.py
index 448eabf..eb6c171 100644
--- a/times_reader/transforms.py
+++ b/times_reader/transforms.py
@@ -102,15 +102,6 @@ def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX
 
     df = table.dataframe.drop(comment_cols, axis=1)
     df.reset_index(drop=True, inplace=True)
-
-    # TODO: should we move the code below to a separate transform?
-    seen = set()
-    dupes = [x for x in df.columns if x in seen or seen.add(x)]
-    if len(dupes) > 0:
-        print(
-            f"WARNING: Duplicate columns in {table.range}, {table.sheetname},"
-            f" {table.filename}: {','.join(dupes)}"
-        )
     return replace(table, dataframe=df)
 
 
@@ -138,15 +129,13 @@ def has_formulas(table):
     return [table for table in tables if not has_formulas(table)]
 
 
-def remove_empty_tables(
+def validate_input_tables(
     config: datatypes.Config,
     tables: List[datatypes.EmbeddedXlTable],
 ) -> List[datatypes.EmbeddedXlTable]:
     """
-    Return a modified copy of 'tables' where empty tables have been deleted from the list.
-
-    :param tables:      List of tables in EmbeddedXlTable format.
-    :return:            List of non-empty tables in EmbeddedXlTable format.
+    Perform some basic validation (tag names are valid, no duplicate column labels), and
+    remove empty tables (for recognized tags).
     """
 
     check_list = [
@@ -181,10 +170,29 @@ def remove_empty_tables(
     def discard(table):
         if table.tag in check_list:
             return not table.dataframe.shape[0]
+        elif table.tag == datatypes.Tag.unitconversion:
+            print("Dropping ~UNITCONVERSION table")
+            return True
         else:
             return False
 
-    return [table for table in tables if not discard(table)]
+    result = []
+    for table in tables:
+        if not datatypes.Tag.has_tag(table.tag.split(":")[0]):
+            print(f"WARNING: Dropping table with unrecognized tag {table.tag}")
+            continue
+        if discard(table):
+            continue
+        # Check for duplicate columns:
+        seen = set()
+        dupes = [x for x in table.dataframe.columns if x in seen or seen.add(x)]
+        if len(dupes) > 0:
+            print(
+                f"WARNING: Duplicate columns in {table.range}, {table.sheetname},"
+                f" {table.filename}: {','.join(dupes)}"
+            )
+        result.append(table)
+    return result
 
 
 def normalize_tags_columns(