Skip to content

Commit

Permalink
Don't remove white spaces or split description pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
olejandro committed Mar 15, 2024
1 parent 7b2f6f7 commit 6c8846e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 18 deletions.
23 changes: 16 additions & 7 deletions xl2times/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2016,11 +2016,18 @@ def process_transform_availability(
return result


def filter_by_pattern(
    df: pd.DataFrame, pattern: str, combined: bool = True
) -> pd.DataFrame:
    """Keep the rows of ``df`` whose index label matches ``pattern``.

    Args:
        df: Frame whose index labels are matched against the pattern.
        pattern: Wildcard pattern; it is converted to a regex by
            ``utils.create_regexp``.
        combined: When true, ``pattern`` is handled as a comma-separated list
            of patterns and rows matching its negative part (as built by
            ``utils.create_negative_regexp``) are dropped from the result.
            When false, the pattern is passed through as one single pattern
            and no negative filtering is applied.

    Returns:
        The matching rows with duplicate rows removed.
    """
    # Duplicates can be created when a process has multiple commodities that match the pattern
    matched = df.filter(
        regex=utils.create_regexp(pattern, combined), axis="index"
    ).drop_duplicates()
    if not combined:
        # A single (non-list) pattern carries no negative sub-patterns to exclude.
        return matched
    # Exclusions are looked up among the already matched rows only.
    excluded = matched.filter(
        regex=utils.create_negative_regexp(pattern), axis="index"
    ).index
    return matched.drop(excluded)


def intersect(acc, df):
Expand All @@ -2029,13 +2036,15 @@ def intersect(acc, df):
return acc.merge(df)


def get_matching_processes(row: pd.Series, topology: dict[str, DataFrame]) -> pd.Series:
def get_matching_processes(
row: pd.Series, topology: dict[str, DataFrame]
) -> pd.Series | None:
matching_processes = None
for col, key in process_map.items():
if col in row.index and row[col] is not None:
proc_set = topology[key]
pattern = row[col].upper()
filtered = filter_by_pattern(proc_set, pattern)
filtered = filter_by_pattern(proc_set, pattern, col != "pset_pd")
matching_processes = intersect(matching_processes, filtered)

if matching_processes is not None and any(matching_processes.duplicated()):
Expand All @@ -2050,7 +2059,7 @@ def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]):
if col in row.index and row[col] is not None:
matching_commodities = intersect(
matching_commodities,
filter_by_pattern(topology[key], row[col].upper()),
filter_by_pattern(topology[key], row[col].upper(), col != "cset_cd"),
)
return matching_commodities

Expand Down
25 changes: 14 additions & 11 deletions xl2times/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,26 +210,29 @@ def remove_positive_patterns(pattern: str) -> str:


@functools.lru_cache(maxsize=int(1e6))
def create_regexp(pattern: str, combined: bool = True) -> str:
    """Convert a wildcard pattern into an anchored regular-expression string.

    Args:
        pattern: Pattern in which ``*`` stands for any run of characters and
            ``?`` for any single character. No other character is rewritten
            or escaped.
        combined: Distinguishes a comma-separated list of patterns (true)
            from one pattern that may itself contain commas (false). In the
            combined case spaces are stripped, negative patterns are taken
            out with ``remove_negative_patterns`` and every comma becomes a
            regex alternation. Otherwise spaces and commas are left as is.

    Returns:
        The regex as a string, anchored with ``^`` and ``$`` so that
        substrings do not match, or ``".*"`` when the pattern is empty.
    """
    if combined:
        # Remove whitespaces
        pattern = pattern.replace(" ", "")
        # Exclude negative patterns
        if has_negative_patterns(pattern):
            pattern = remove_negative_patterns(pattern)
        # Handle comma-separated values: together with the outer anchors added
        # below, "A,B" ends up as "^A$|^B$".
        pattern = pattern.replace(",", r"$|^")
    if not pattern:
        return r".*"  # matches everything
    # Substitute VEDA wildcards with regex patterns
    for wildcard, regex in (("*", ".*"), ("?", ".")):
        pattern = pattern.replace(wildcard, regex)
    # Do not match substrings
    return rf"^{pattern}$"


@functools.lru_cache(maxsize=int(1e6))
def create_negative_regexp(pattern: str) -> re.Pattern:
def create_negative_regexp(pattern: str) -> str:
# Remove whitespaces
pattern = pattern.replace(" ", "")
# Exclude positive patterns
Expand Down

0 comments on commit 6c8846e

Please sign in to comment.