
Make misc fixes (#256)
Introduce various improvements:

- use `model.topology` in `generate_topology_dictionary`
- include column-specific default values
- simplify pattern processing
- do not fill missing values in migration (`~TFM_MIG`) tables
- modify defaults for processing of `FLO_MARK`, `NCAP_AFC`, and `FLO_DELIV`
- apply code review suggestions from #259

---------

Co-authored-by: Siddharth Krishna <[email protected]>
olejandro and siddharth-krishna authored Dec 26, 2024
1 parent 1d853ca commit 5370fcd
Showing 5 changed files with 88 additions and 47 deletions.
28 changes: 23 additions & 5 deletions xl2times/config/veda-attr-defaults.json
@@ -95,6 +95,13 @@
     },
     "AFC": {
         "defaults": {
+            "other_indexes": [
+                "commodity",
+                "commodity-in",
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
+            ],
             "ts-level": "ANNUAL"
         },
         "times-attribute": "NCAP_AFC"
@@ -410,7 +417,9 @@
         "defaults": {
             "commodity": [
                 "commodity-in",
-                "commodity-in-aux"
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
             ],
             "ts-level": "ANNUAL"
         },
@@ -578,7 +587,9 @@
         "defaults": {
             "commodity": [
                 "commodity-in",
-                "commodity-in-aux"
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
             ],
             "ts-level": "ANNUAL"
         }
@@ -619,10 +630,10 @@
     "FLO_MARK": {
         "defaults": {
             "commodity": [
-                "commodity-in",
                 "commodity-out",
-                "commodity-in-aux",
-                "commodity-out-aux"
+                "commodity-in",
+                "commodity-out-aux",
+                "commodity-in-aux"
             ],
             "limtype": "UP"
         }
@@ -781,6 +792,13 @@
     },
     "NCAP_AFC": {
         "defaults": {
+            "other_indexes": [
+                "commodity",
+                "commodity-in",
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
+            ],
             "ts-level": "ANNUAL"
         }
     },
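
Judging by the deliberate reordering in the `FLO_MARK` hunk, position in these default lists appears to set precedence when several columns could supply a missing `commodity`: outputs now win over inputs. A minimal sketch of how such an ordered fallback list can be applied; the helper name and toy frame are illustrative, not xl2times API:

```python
import pandas as pd

# Ordered fallback columns from the FLO_MARK defaults above.
FLO_MARK_COMMODITY_DEFAULTS = [
    "commodity-out",
    "commodity-in",
    "commodity-out-aux",
    "commodity-in-aux",
]

def fill_commodity(df: pd.DataFrame, candidates: list[str]) -> pd.DataFrame:
    """Fill missing 'commodity' cells from the first candidate column that has a value."""
    df = df.copy()
    for col in candidates:
        if col in df.columns:
            # fillna only touches cells that are still missing, so earlier
            # candidates take precedence over later ones.
            df["commodity"] = df["commodity"].fillna(df[col])
    return df

rows = pd.DataFrame(
    {
        "process": ["P1", "P2"],
        "commodity": [None, "HTH"],
        "commodity-in": ["COA", "GAS"],
        "commodity-out": ["ELC", "ELC"],
    }
)
print(fill_commodity(rows, FLO_MARK_COMMODITY_DEFAULTS))
# P1's missing commodity resolves to ELC (its output), not COA (its input).
```
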
6 changes: 4 additions & 2 deletions xl2times/config/veda-tags.json
@@ -134,7 +134,8 @@
         "query_field": false,
         "inherit_above": true,
         "remove_first_row_if_absent": false,
-        "remove_any_row_if_absent": false
+        "remove_any_row_if_absent": false,
+        "default_to": "NRG"
     },
     {
         "name": "ctslvl",
@@ -270,7 +271,8 @@
         "query_field": false,
         "inherit_above": true,
         "remove_first_row_if_absent": false,
-        "remove_any_row_if_absent": false
+        "remove_any_row_if_absent": false,
+        "default_to": "PRE"
     },
     {
         "name": "tact",
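
The new `default_to` key gives a column a fallback value when a model file leaves it blank — apparently the commodity-set and process-set columns here, judging from the neighbouring `ctslvl` and `tact` fields. A condensed sketch of how such an entry flows through the parser in datatypes.py and is then applied in transforms.py; the JSON excerpt is simplified, not the real file's full entry:

```python
import json
import pandas as pd

# Simplified excerpt of a veda-tags.json entry (field list abbreviated).
veda_tags = json.loads(
    """
    [
      {
        "tag_name": "fi_comm",
        "valid_fields": [
          {"name": "csets", "default_to": "NRG"},
          {"name": "cname"}
        ]
      }
    ]
    """
)

# Collect per-tag column defaults, mirroring _read_veda_tags_info below.
column_default_value: dict[str, dict[str, str]] = {}
for tag_info in veda_tags:
    column_default_value[tag_info["tag_name"]] = {
        field["name"]: field["default_to"]
        for field in tag_info["valid_fields"]
        if "default_to" in field
    }

# Apply them the way fill_in_missing_values does in transforms.py.
df = pd.DataFrame({"cname": ["COA", "DEMHEAT"], "csets": [None, "DEM"]})
for colname, value in column_default_value["fi_comm"].items():
    df[colname] = df[colname].fillna(value)
print(df)  # the COA row's empty csets becomes "NRG"
```
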
24 changes: 23 additions & 1 deletion xl2times/datatypes.py
@@ -294,6 +294,8 @@ class Config:
     attr_aliases: set[str]
     # For each tag, this dictionary maps each column alias to the normalized name
     column_aliases: dict[Tag, dict[str, str]]
+    # For each tag, this dictionary maps each column name to its default value
+    column_default_value: dict[Tag, dict[str, str]]
     # For each tag, this dictionary specifies comment row symbols by column name
     row_comment_chars: dict[Tag, dict[str, list]]
     # List of tags for which empty tables should be discarded
@@ -329,11 +331,13 @@ def __init__(
         self.times_sets = Config._read_times_sets(times_sets_file)
         (
             self.column_aliases,
+            self.column_default_value,
             self.row_comment_chars,
             self.discard_if_empty,
             self.query_columns,
             self.known_columns,
             self.required_columns,
+            self.forward_fill_cols,
         ) = Config._read_veda_tags_info(veda_tags_file)
         self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(
             veda_attr_defaults_file
@@ -494,12 +498,14 @@ def _read_mappings(filename: str) -> list[TimesXlMap]:
     def _read_veda_tags_info(
         veda_tags_file: str,
     ) -> tuple[
         dict[Tag, dict[str, str]],
+        dict[Tag, dict[str, str]],
         dict[Tag, dict[str, list]],
         Iterable[Tag],
         dict[Tag, set[str]],
         dict[Tag, set[str]],
         dict[Tag, set[str]],
+        dict[Tag, set[str]],
     ]:
         def to_tag(s: str) -> Tag:
             # The file stores the tag name in lowercase, and without the ~
@@ -518,18 +524,20 @@ def to_tag(s: str) -> Tag:
         )

         valid_column_names = {}
+        column_default_value = {}
         row_comment_chars = {}
         discard_if_empty = []
         query_cols = defaultdict(set)
         known_cols = defaultdict(set)
         required_cols = defaultdict(set)
+        forward_fill_cols = defaultdict(set)

         for tag_info in veda_tags_info:
             tag_name = to_tag(tag_info["tag_name"])
             if "valid_fields" in tag_info:
                 discard_if_empty.append(tag_name)
-
                 valid_column_names[tag_name] = {}
+                column_default_value[tag_name] = {}
                 row_comment_chars[tag_name] = {}
                 # Process column aliases and comment chars:
                 for valid_field in tag_info["valid_fields"]:
@@ -543,12 +551,20 @@ def to_tag(s: str) -> Tag:
                 else:
                     field_name = valid_field["name"]

+                if "default_to" in valid_field:
+                    column_default_value[tag_name][field_name] = valid_field[
+                        "default_to"
+                    ]
+
                 if valid_field["query_field"]:
                     query_cols[tag_name].add(field_name)

                 if valid_field["remove_any_row_if_absent"]:
                     required_cols[tag_name].add(field_name)

+                if valid_field["inherit_above"]:
+                    forward_fill_cols[tag_name].add(field_name)
+
                 known_cols[tag_name].add(field_name)

                 for valid_field_name in valid_field_names:
@@ -564,20 +580,26 @@ def to_tag(s: str) -> Tag:
             if base_tag in valid_column_names:
                 valid_column_names[tag_name] = valid_column_names[base_tag]
                 discard_if_empty.append(tag_name)
+            if base_tag in column_default_value:
+                column_default_value[tag_name] = column_default_value[base_tag]
             if base_tag in row_comment_chars:
                 row_comment_chars[tag_name] = row_comment_chars[base_tag]
             if base_tag in query_cols:
                 query_cols[tag_name] = query_cols[base_tag]
             if base_tag in known_cols:
                 known_cols[tag_name] = known_cols[base_tag]
+            if base_tag in forward_fill_cols:
+                forward_fill_cols[tag_name] = forward_fill_cols[base_tag]

         return (
             valid_column_names,
+            column_default_value,
             row_comment_chars,
             discard_if_empty,
             query_cols,
             known_cols,
             required_cols,
+            forward_fill_cols,
         )

     @staticmethod
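
The `base_tag` branch above is how tag variants that declare a `base_tag` instead of their own `valid_fields` pick up already-parsed metadata; the new `column_default_value` and `forward_fill_cols` maps now join that inheritance. A small sketch of the mechanism (tag names and values are hypothetical):

```python
from collections import defaultdict

# Metadata already parsed for the base tag; values are illustrative.
column_default_value = {"fi_process": {"sets": "PRE"}}
forward_fill_cols = defaultdict(set, {"fi_process": {"sets"}})

# A derived tag inherits the base tag's parsed metadata by reference,
# as in _read_veda_tags_info above.
tag_name, base_tag = "fi_process-variant", "fi_process"  # hypothetical names
if base_tag in column_default_value:
    column_default_value[tag_name] = column_default_value[base_tag]
if base_tag in forward_fill_cols:
    forward_fill_cols[tag_name] = forward_fill_cols[base_tag]

assert column_default_value["fi_process-variant"]["sets"] == "PRE"
# Note both tags share one dict object, matching the original code's
# by-reference copy; mutating one would affect the other.
```
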
55 changes: 27 additions & 28 deletions xl2times/transforms.py
@@ -884,10 +884,17 @@ def fill_in_missing_values(

     def fill_in_missing_values_table(table):
         df = table.dataframe.copy()
+        default_values = config.column_default_value.get(table.tag, {})
+
         for colname in df.columns:
             # TODO make this more declarative
-            if colname in ["sets", "csets", "process"]:
+            # Forwards fill values in columns
+            if colname in config.forward_fill_cols[table.tag]:
                 df[colname] = df[colname].ffill()
+            # Apply default values to missing cells
+            col_default_value = default_values.get(colname)
+            if col_default_value is not None:
+                df[colname] = df[colname].fillna(col_default_value)
             elif colname == "limtype" and table.tag == Tag.fi_comm and False:
                 isna = df[colname].isna()
                 ismat = df["csets"] == "MAT"
@@ -944,8 +951,8 @@ def fill_in_missing_values_table(table):
         return replace(table, dataframe=df)

     for table in tables:
-        if table.tag == Tag.tfm_upd:
-            # Missing values in update tables are wildcards and should not be filled in
+        if table.tag in [Tag.tfm_mig, Tag.tfm_upd]:
+            # Missing values in these tables are wildcards and should not be filled in
             result.append(table)
         else:
             result.append(fill_in_missing_values_table(table))
@@ -1194,7 +1201,7 @@ def capitalise_table_values(
     """Ensure that all table entries are uppercase. Strip leading and trailing whitespace."""

     def capitalise_table_entries(table: EmbeddedXlTable):
-        df = table.dataframe.copy()
+        df = table.dataframe
         # Capitalise all entries if column type string
         colnames = df.select_dtypes(include="object").columns
         seen_cols = [colname for colname in colnames if colname in df.columns]
@@ -1203,8 +1210,7 @@ def capitalise_table_entries(table: EmbeddedXlTable):
                 # Index of rows with string entries
                 i = df[seen_col].apply(lambda x: isinstance(x, str))
                 if any(i):
-                    df.loc[i, seen_col] = df[seen_col][i].str.upper()
-                    df.loc[i, seen_col] = df[seen_col][i].str.strip()
+                    df.loc[i, seen_col] = df[seen_col][i].str.upper().str.strip()
             return replace(table, dataframe=df)
         else:
             return table
@@ -2129,22 +2135,13 @@ def process_transform_availability(
     return result


-def filter_by_pattern(df: pd.DataFrame, pattern: str, combined: bool) -> pd.DataFrame:
-    """
-    Filter dataframe index by a regex pattern. Parameter combined indicates whether commas should
-    be treated as a pattern separator or belong to the pattern.
-    """
+def filter_by_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
+    """Filter dataframe index by a regex pattern."""
     # Duplicates can be created when a process has multiple commodities that match the pattern
-    df = df.filter(
-        regex=utils.create_regexp(pattern, combined), axis="index"
-    ).drop_duplicates()
-    if combined:
-        exclude = df.filter(
-            regex=utils.create_negative_regexp(pattern), axis="index"
-        ).index
-        return df.drop(exclude)
-    else:
-        return df
+    df = df.filter(regex=utils.create_regexp(pattern), axis="index").drop_duplicates()
+    exclude = df.filter(regex=utils.create_negative_regexp(pattern), axis="index").index
+
+    return df.drop(exclude)


def intersect(acc, df):
@@ -2161,7 +2158,7 @@ def get_matching_processes(
         if col in row.index and row[col] not in {None, ""}:
             proc_set = topology[key]
             pattern = row[col].upper()
-            filtered = filter_by_pattern(proc_set, pattern, col != "pset_pd")
+            filtered = filter_by_pattern(proc_set, pattern)
             matching_processes = intersect(matching_processes, filtered)

     if matching_processes is not None and any(matching_processes.duplicated()):
@@ -2176,7 +2173,7 @@ def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]):
         if col in row.index and row[col] not in {None, ""}:
             matching_commodities = intersect(
                 matching_commodities,
-                filter_by_pattern(topology[key], row[col].upper(), col != "cset_cd"),
+                filter_by_pattern(topology[key], row[col].upper()),
             )
     return matching_commodities

@@ -2201,7 +2198,9 @@ def generate_topology_dictionary(
     dictionary = dict()
     pros = model.processes
     coms = model.commodities
-    pros_and_coms = tables[Tag.fi_t]
+    pros_and_coms = model.topology[["process", "commodity", "io"]].drop_duplicates()
+    i_comm_in = pros_and_coms["io"] == "IN"
+    i_comm_out = pros_and_coms["io"] == "OUT"

     dict_info = [
         {"key": "processes_by_name", "df": pros[["process"]], "col": "process"},
@@ -2213,13 +2212,13 @@ def generate_topology_dictionary(
         {"key": "processes_by_sets", "df": pros[["process", "sets"]], "col": "sets"},
         {
             "key": "processes_by_comm_in",
-            "df": pros_and_coms[["process", "commodity-in"]],
-            "col": "commodity-in",
+            "df": pros_and_coms[["process", "commodity"]][i_comm_in],
+            "col": "commodity",
         },
         {
             "key": "processes_by_comm_out",
-            "df": pros_and_coms[["process", "commodity-out"]],
-            "col": "commodity-out",
+            "df": pros_and_coms[["process", "commodity"]][i_comm_out],
+            "col": "commodity",
         },
         {"key": "commodities_by_name", "df": coms[["commodity"]], "col": "commodity"},
         {
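
The rewritten fill loop in this file replaces the hard-coded `["sets", "csets", "process"]` list with configuration: columns flagged `inherit_above` in veda-tags.json are forward-filled, then any still-missing cells receive the column's `default_to` value. A self-contained rendition of just that path; the config dicts are stand-ins for `config.forward_fill_cols` and `config.column_default_value`, and the real function also keeps several legacy special cases:

```python
import pandas as pd

# Stand-ins for config.forward_fill_cols and config.column_default_value.
forward_fill_cols = {"fi_comm": {"csets"}}
column_default_value = {"fi_comm": {"csets": "NRG"}}  # illustrative default

def fill_missing(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    df = df.copy()
    default_values = column_default_value.get(tag, {})
    for colname in df.columns:
        # Forward-fill columns flagged inherit_above in veda-tags.json
        if colname in forward_fill_cols.get(tag, set()):
            df[colname] = df[colname].ffill()
        # Then apply the column's default to cells that are still empty
        col_default_value = default_values.get(colname)
        if col_default_value is not None:
            df[colname] = df[colname].fillna(col_default_value)
    return df

df = pd.DataFrame({"csets": [None, "DEM", None], "commodity": ["C1", "C2", "C3"]})
print(fill_missing(df, "fi_comm"))
# Row 1 has nothing above to inherit, so it falls back to "NRG";
# row 3 inherits "DEM" from the row above.
```
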
22 changes: 11 additions & 11 deletions xl2times/utils.py
@@ -247,17 +247,18 @@ def remove_positive_patterns(pattern: str) -> str:
     return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"])


+def remove_whitespace(pattern: str) -> str:
+    return ",".join([word.strip() for word in pattern.split(",")])
+
+
 @functools.lru_cache(maxsize=int(1e6))
 def create_regexp(pattern: str, combined: bool = True) -> str:
-    # Distinguish comma-separated list of patterns vs a pattern with a comma(s)
-    if combined:
-        # Remove whitespaces
-        pattern = pattern.replace(" ", "")
-        # Exclude negative patterns
-        if has_negative_patterns(pattern):
-            pattern = remove_negative_patterns(pattern)
-        # Handle comma-separated values
-        pattern = pattern.replace(",", r"$|^")
+    pattern = remove_whitespace(pattern)
+    # Exclude negative patterns
+    if has_negative_patterns(pattern):
+        pattern = remove_negative_patterns(pattern)
+    # Handle comma-separated values
+    pattern = pattern.replace(",", r"$|^")
     if len(pattern) == 0:
         return r".*"  # matches everything
     # Substitute VEDA wildcards with regex patterns
@@ -271,8 +272,7 @@ def create_regexp(pattern: str, combined: bool = True) -> str:

 @functools.lru_cache(maxsize=int(1e6))
 def create_negative_regexp(pattern: str) -> str:
-    # Remove whitespaces
-    pattern = pattern.replace(" ", "")
+    pattern = remove_whitespace(pattern)
     # Exclude positive patterns
     pattern = remove_positive_patterns(pattern)
     if len(pattern) == 0:
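
The point of the new `remove_whitespace` helper is subtle: the old `pattern.replace(" ", "")` deleted every space, so a name containing an internal space could never match, whereas the helper trims whitespace only around the comma separators. It also centralizes the cleanup so `create_regexp` and `create_negative_regexp` stay consistent. A quick illustration; the pattern value is made up:

```python
def remove_whitespace(pattern: str) -> str:
    return ",".join([word.strip() for word in pattern.split(",")])

pattern = "COAL PLANT*, -COAL"  # made-up comma-separated pattern list

old_behaviour = pattern.replace(" ", "")
new_behaviour = remove_whitespace(pattern)

print(old_behaviour)  # 'COALPLANT*,-COAL'  -- internal space destroyed
print(new_behaviour)  # 'COAL PLANT*,-COAL' -- only separator whitespace trimmed
```
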
