Skip to content

Commit

Permalink
Update comments removal transforms (#140)
Browse files Browse the repository at this point in the history
This PR adds information on comment row indicators to `config`. The info
is then used to identify and remove those rows.

---------

Co-authored-by: Siddharth Krishna <[email protected]>
  • Loading branch information
olejandro and siddharth-krishna authored Dec 1, 2023
1 parent 9e3e5a3 commit f386a36
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 47 deletions.
4 changes: 3 additions & 1 deletion times_reader/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,12 @@ def convert_xl_to_times(
transforms.normalize_tags_columns,
transforms.remove_fill_tables,
transforms.remove_empty_tables,
lambda config, tables: [transforms.remove_comment_rows(t) for t in tables],
lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
transforms.remove_tables_with_formulas, # slow
transforms.normalize_column_aliases,
lambda config, tables: [
transforms.remove_comment_rows(config, t) for t in tables
],
transforms.apply_postnormalisation_fixes,
transforms.generate_dummy_processes,
transforms.process_transform_insert_variants,
Expand Down
23 changes: 23 additions & 0 deletions times_reader/config/veda-tags.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,29 @@
]
}
},
{
"tag_name": "comemi",
"tag_allowed_in": [
"BY",
"SubRES"
],
"tag_fields": {
"fields_names": [
"commname"
],
"fields_aliases": [
[
"commodity"
]
],
"row_ignore_symbol": [
[
"\\I:",
"*"
]
]
}
},
{
"tag_name": "currencies",
"tag_allowed_in": [
Expand Down
22 changes: 18 additions & 4 deletions times_reader/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ class Config:
all_attributes: Set[str]
# For each tag, this dictionary maps each column alias to the normalized name
column_aliases: Dict[Tag, Dict[str, str]]
# For each tag, this dictionary specifies comment row symbols by column name
row_comment_chars: Dict[Tag, Dict[str, list]]
veda_attr_defaults: Dict[str, Dict[str, list]]

def __init__(
Expand All @@ -161,7 +163,9 @@ def __init__(
self.dd_table_order, self.all_attributes = Config._process_times_info(
times_info_file
)
self.column_aliases = Config._read_veda_tags_info(veda_tags_file)
self.column_aliases, self.row_comment_chars = Config._read_veda_tags_info(
veda_tags_file
)
self.veda_attr_defaults = Config._read_veda_attr_defaults(
veda_attr_defaults_file
)
Expand Down Expand Up @@ -263,23 +267,33 @@ def _read_mappings(filename: str) -> List[TimesXlMap]:
return mappings

@staticmethod
def _read_veda_tags_info(
    veda_tags_file: str,
) -> Tuple[Dict[Tag, Dict[str, str]], Dict[Tag, Dict[str, list]]]:
    """
    Read the veda-tags JSON resource and build two per-tag lookup tables.

    :param veda_tags_file: Name of the JSON file inside the
        ``times_reader.config`` package resources.
    :return: A pair ``(column_aliases, row_comment_chars)`` where
        ``column_aliases`` maps each tag to ``{alias: normalized_name}`` and
        ``row_comment_chars`` maps each tag to
        ``{normalized_name: [comment symbols]}``.
    """
    with resources.open_text("times_reader.config", veda_tags_file) as f:
        veda_tags_info = json.load(f)

    column_aliases = {}
    row_comment_chars = {}

    for tag_info in veda_tags_info:
        if "tag_fields" not in tag_info:
            continue
        # The file stores the tag name in lowercase, and without the ~
        tag_name = "~" + tag_info["tag_name"].upper()

        # Process column aliases. Note: use a distinct loop variable so we
        # don't rebind the outer `aliases` list while iterating over it.
        names = tag_info["tag_fields"]["fields_names"]
        aliases = tag_info["tag_fields"]["fields_aliases"]
        assert len(names) == len(aliases)
        column_aliases[tag_name] = {
            alias: name
            for name, alias_list in zip(names, aliases)
            for alias in alias_list
        }

        # Process comment-row symbols, one list of symbols per column name.
        # NOTE(review): this assumes every entry with "tag_fields" also
        # defines "row_ignore_symbol" — confirm against veda-tags.json;
        # a missing key raises KeyError here, as in the original.
        chars = tag_info["tag_fields"]["row_ignore_symbol"]
        assert len(names) == len(chars)
        row_comment_chars[tag_name] = dict(zip(names, chars))

    return column_aliases, row_comment_chars

@staticmethod
def _read_veda_attr_defaults(
Expand Down
94 changes: 52 additions & 42 deletions times_reader/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,13 @@
}


def remove_comment_rows(
    config: datatypes.Config, table: datatypes.EmbeddedXlTable
) -> datatypes.EmbeddedXlTable:
    """
    Return a modified copy of 'table' where rows with cells starting with
    symbols indicating a comment row in any column have been deleted.
    Comment row symbols are column-name dependent and are specified in
    ``config.row_comment_chars``, keyed by the table's base tag (the part of
    ``table.tag`` before any ':' variant suffix).

    :param config: Config object holding per-tag comment-row symbols.
    :param table:  Table object in EmbeddedXlTable format.
    :return:       Table object in EmbeddedXlTable format without comment rows.
    """
    if table.dataframe.size == 0:
        return table

    # Strip any variant suffix, e.g. "~TFM_INS:2" -> "~TFM_INS".
    base_tag = table.tag.split(":")[0]
    chars_by_colname = config.row_comment_chars.get(base_tag)
    if chars_by_colname is None:
        # No comment-row symbols configured for this tag: nothing to remove.
        return table

    df = table.dataframe
    # Positional mask of rows to keep: a row is dropped as soon as any cell
    # starts with one of the symbols configured for that cell's column.
    # Using positions (not index labels) keeps this correct even when the
    # dataframe's index is not a default RangeIndex.
    keep = [True] * len(df)
    for colname in df.columns:
        symbols = chars_by_colname.get(colname)
        if not symbols:
            continue
        prefixes = tuple(symbols)
        for pos, cell in enumerate(df[colname]):
            if isinstance(cell, str) and cell.startswith(prefixes):
                keep[pos] = False

    df = df[keep].reset_index(drop=True)
    return replace(table, dataframe=df)


def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable:
"""
Return a modified copy of 'table' where columns with labels starting with '*'
have been deleted.
have been deleted. Assumes that any leading spaces in the original input table
have been removed.
:param table: Table object in EmbeddedXlTable format.
:return: Table object in EmbeddedXlTable format without comment columns.
"""
if table.dataframe.size == 0:
return table

comment_cols = list(
locate(
table.dataframe.columns,
lambda cell: isinstance(cell, str) and cell.startswith("*"),
)
)
df = table.dataframe.drop(table.dataframe.columns[comment_cols], axis=1)
comment_cols = [
colname
for colname in table.dataframe.columns
if isinstance(colname, str) and colname.startswith("*")
]

df = table.dataframe.drop(comment_cols, axis=1)
df.reset_index(drop=True, inplace=True)

# TODO: should we move the code below to a separate transform?
seen = set()
dupes = [x for x in df.columns if x in seen or seen.add(x)]
if len(dupes) > 0:
Expand Down Expand Up @@ -232,6 +236,7 @@ def apply_postnormalisation_fixes(
config: datatypes.Config, tables: List[datatypes.EmbeddedXlTable]
) -> List[datatypes.EmbeddedXlTable]:
rename_cols_dict = {
datatypes.Tag.comemi: {"commname": "commodity"},
datatypes.Tag.fi_comm: {"commname": "commodity"},
datatypes.Tag.fi_process: {"techname": "process"},
datatypes.Tag.tfm_comgrp: {"value": "allregions"},
Expand Down Expand Up @@ -460,9 +465,14 @@ def process_flexible_import_table(
.loc[veda_process_sets["process"] == process]
.unique()
)
df.loc[i & (df["process"] == process), other] = cost_mapping[
veda_process_set[0]
]
if veda_process_set.shape[0]:
df.loc[i & (df["process"] == process), other] = cost_mapping[
veda_process_set[0]
]
else:
print(
f"WARNING: COST won't be processed as IRE_PRICE for {process}, because it is not in IMP/EXP/MIN"
)

# Use CommName to store the active commodity for EXP / IMP
i = df[attribute].isin(["COST", "IRE_PRICE"])
Expand Down Expand Up @@ -1274,8 +1284,6 @@ def process_commodity_emissions(
result.append(table)
else:
df = table.dataframe.copy()
# TODO either add ~COMEMI to veda-tags.json or do this somewhere less hacky:
df.rename(columns={"commname": "commodity"}, inplace=True)
index_columns = ["region", "year", "commodity"]
data_columns = [
colname for colname in df.columns if colname not in index_columns
Expand All @@ -1287,7 +1295,9 @@ def process_commodity_emissions(

if "region" in df.columns:
df = df.astype({"region": "string"})
df["region"] = df["region"].map(lambda s: s.split(","))
df["region"] = df["region"].map(
lambda s: s.split(",") if isinstance(s, str) else s
)
df = df.explode("region", ignore_index=True)
df = df[df["region"].isin(regions)]

Expand Down

0 comments on commit f386a36

Please sign in to comment.