Skip to content

Commit

Permalink
Use known columns from veda-tags.json (#157)
Browse files: browse the repository at this point in the history
  • Loading branch information
siddharth-krishna authored Dec 21, 2023
1 parent d1ac577 commit b84d4e1
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 53 deletions.
15 changes: 13 additions & 2 deletions xl2times/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ class Config:
# List of tags for which empty tables should be discarded
discard_if_empty: Iterable[Tag]
veda_attr_defaults: Dict[str, Dict[str, list]]
# Known columns for each tag
known_columns: Dict[Tag, Set[str]]

def __init__(
self,
Expand All @@ -172,6 +174,7 @@ def __init__(
self.column_aliases,
self.row_comment_chars,
self.discard_if_empty,
self.known_columns,
) = Config._read_veda_tags_info(veda_tags_file)
self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(
veda_attr_defaults_file
Expand Down Expand Up @@ -308,7 +311,12 @@ def _read_mappings(filename: str) -> List[TimesXlMap]:
@staticmethod
def _read_veda_tags_info(
veda_tags_file: str,
) -> Tuple[Dict[Tag, Dict[str, str]], Dict[Tag, Dict[str, list]], Iterable[Tag]]:
) -> Tuple[
Dict[Tag, Dict[str, str]],
Dict[Tag, Dict[str, list]],
Iterable[Tag],
Dict[Tag, Set[str]],
]:
def to_tag(s: str) -> Tag:
    """Build the Tag for a tag name as it is stored in the tags file.

    The file stores the tag name in lowercase and without the leading "~",
    so the name is upper-cased and prefixed with "~" before being passed
    to Tag.
    """
    tag_text = f"~{s.upper()}"
    return Tag(tag_text)
Expand All @@ -328,6 +336,7 @@ def to_tag(s: str) -> Tag:
valid_column_names = {}
row_comment_chars = {}
discard_if_empty = []
known_cols = defaultdict(set)

for tag_info in veda_tags_info:
tag_name = to_tag(tag_info["tag_name"])
Expand All @@ -348,6 +357,8 @@ def to_tag(s: str) -> Tag:
else:
field_name = valid_field["name"]

known_cols[tag_name].add(field_name)

for valid_field_name in valid_field_names:
valid_column_names[tag_name][valid_field_name] = field_name
row_comment_chars[tag_name][field_name] = valid_field[
Expand All @@ -363,7 +374,7 @@ def to_tag(s: str) -> Tag:
if base_tag in row_comment_chars:
row_comment_chars[tag_name] = row_comment_chars[base_tag]

return valid_column_names, row_comment_chars, discard_if_empty
return valid_column_names, row_comment_chars, discard_if_empty, known_cols

@staticmethod
def _read_veda_attr_defaults(
Expand Down
56 changes: 5 additions & 51 deletions xl2times/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,23 +318,7 @@ def process_flexible_import_table(
# datatypes.Tag column no longer used to identify data columns
# https://veda-documentation.readthedocs.io/en/latest/pages/introduction.html#veda2-0-enhanced-features
# TODO: Include other valid column headers
known_columns = [
"region",
"process",
"commodity",
"commodity-in",
"commodity-in-aux",
"commodity-out",
"commodity-out-aux",
"attribute",
"year",
"timeslice",
"limtype",
"currency",
"other_indexes",
"stage",
"sow",
]
known_columns = config.known_columns[datatypes.Tag.fi_t]
data_columns = [x for x in df.columns if x not in known_columns]

# Populate index columns
Expand Down Expand Up @@ -509,32 +493,13 @@ def process_user_constraint_table(
# Fill in UC_N blank cells with value from above
df["uc_n"] = df["uc_n"].ffill()

# TODO: Include other valid column headers
known_columns = [
"uc_n",
"region",
"pset_set",
"pset_pn",
"pset_pd",
"pset_ci",
"pset_co",
"cset_cn",
"cset_cd",
"cset_set",
"side",
"attribute",
"year",
"limtype",
"top_check",
"uc_desc", # Why is this in the index columns?
# TODO remove these?
"timeslice",
data_columns = [
x for x in df.columns if x not in config.known_columns[datatypes.Tag.uc_t]
]
data_columns = [x for x in df.columns if x not in known_columns]

# Populate columns
nrows = df.shape[0]
for colname in known_columns:
for colname in config.known_columns[datatypes.Tag.uc_t]:
if colname not in df.columns:
df[colname] = [None] * nrows
table = replace(table, dataframe=df)
Expand Down Expand Up @@ -1586,18 +1551,7 @@ def process_transform_insert(
df = table.dataframe.copy()

# Standardize column names
# TODO: Include other valid column names
# TODO should this go in datatypes.Config?
known_columns = {
"attribute",
"year",
"timeslice",
"limtype",
"currency",
"stage",
"sow",
"other_indexes",
} | query_columns
known_columns = config.known_columns[datatypes.Tag.tfm_ins] | query_columns

# Handle Regions:
if set(df.columns).isdisjoint(
Expand Down

0 comments on commit b84d4e1

Please sign in to comment.