-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract validated blocks and convert as needed (#2)
* SimaPro CSV files are structured into blocks, using control statements to start and end the blocks (but not always, that would be like playing on easy mode). This PR sets out the architecture for extracting these blocks, and includes the metadata block and partial implementation of input parameters and process datasets. * The biggest challenge is parsing formulas as strings, as there is no spec. We build on the existing implementation with more tests and fewer assumptions. Signed-off-by: Chris Mutel <[email protected]> Co-authored-by: João Gonçalves <[email protected]>
- Loading branch information
1 parent
4f5222e
commit 60aea55
Showing
15 changed files
with
666 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Pull the block classes in from their defining modules, then list them as
# the names exported by this package.
from .base import EmptyBlock, SimaProCSVBlock, SimaProCSVUncertainBlock
from .parameters import DatabaseInputParameters, ProjectInputParameters
from .process import Process

__all__ = (
    "DatabaseInputParameters",
    "EmptyBlock",
    "Process",
    "ProjectInputParameters",
    "SimaProCSVBlock",
    "SimaProCSVUncertainBlock",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# pylint: disable=too-many-arguments,unused-argument,too-many-return-statements | ||
import math | ||
|
||
from loguru import logger | ||
from stats_arrays import ( | ||
LognormalUncertainty, | ||
NormalUncertainty, | ||
TriangularUncertainty, | ||
UndefinedUncertainty, | ||
UniformUncertainty, | ||
) | ||
|
||
from ..utils import asnumber | ||
|
||
|
||
class SimaProCSVBlock:
    """Common parent of the classes that parse and clean one logical block of a SimaPro CSV file."""
|
||
|
||
class EmptyBlock(SimaProCSVBlock):
    """Block type used when a block has no content."""
|
||
|
||
class SimaProCSVUncertainBlock(SimaProCSVBlock):
    """Base class which includes logic for parsing lines with probability distributions."""

    def undefined_distribution(self, amount: float) -> dict:
        """Return an undefined-uncertainty record with `amount` as both ``loc`` and ``amount``."""
        return {
            "uncertainty type": UndefinedUncertainty.id,
            "loc": amount,
            "amount": amount,
        }

    def distribution(
        self, amount: str, kind: str, field1: str, field2: str, field3: str, header: dict, **kwargs
    ) -> dict:
        """Convert the uncertainty columns of one line into an uncertainty dictionary.

        Args:
            amount: Central value, as text.
            kind: One of ``"Undefined"``, ``"Lognormal"``, ``"Normal"``,
                ``"Triangle"`` or ``"Uniform"``.
            field1: Dispersion input for lognormal and normal distributions, as text.
            field2: Minimum for triangular and uniform distributions, as text.
            field3: Maximum for triangular and uniform distributions, as text.
            header: File header; ``header["decimal_separator"]`` (default ``"."``)
                is passed to ``asnumber``.
            **kwargs: Accepted and ignored.

        Returns:
            A dictionary with ``"uncertainty type"``, ``"loc"`` and ``"amount"``,
            plus ``"scale"`` or ``"minimum"``/``"maximum"`` and ``"negative"`` for
            the defined distributions. Parameters that fail the validity checks
            below are logged and replaced by an undefined distribution.

        Raises:
            ValueError: If any numeric field cannot be converted, or `kind`
                is not one of the supported labels.
        """
        decimal_separator = header.get("decimal_separator", ".")

        # Keep the untouched inputs so the error message reports exactly what
        # was read, not a mix of converted and unconverted values.
        raw_amount, raw_field1, raw_field2, raw_field3 = amount, field1, field2, field3

        try:
            amount = asnumber(value=raw_amount, decimal_separator=decimal_separator)
            field1 = asnumber(value=raw_field1, decimal_separator=decimal_separator)
            field2 = asnumber(value=raw_field2, decimal_separator=decimal_separator)
            field3 = asnumber(value=raw_field3, decimal_separator=decimal_separator)
        except ValueError as exc:
            raise ValueError(
                f"""
Can't convert uncertainty data to numbers:
    Uncertainty type: {kind}
    Amount: {raw_amount}
    Field1: {raw_field1}
    Field2: {raw_field2}
    Field3: {raw_field3}
"""
            ) from exc

        if kind == "Undefined":
            return self.undefined_distribution(amount)
        if kind == "Lognormal":
            if not amount or field1 <= 0:
                # loguru only fills the placeholders when format arguments are passed
                logger.warning(
                    "Invalid lognormal distribution: {amount}|{field1}",
                    amount=amount,
                    field1=field1,
                )
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": LognormalUncertainty.id,
                "scale": math.log(math.sqrt(field1)),
                # The sign is carried separately in "negative"
                "loc": math.log(abs(amount)),
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Normal":
            if not amount or field1 <= 0:
                logger.warning(
                    "Invalid normal distribution: {amount}|{field1}",
                    amount=amount,
                    field1=field1,
                )
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": NormalUncertainty.id,
                "scale": math.sqrt(field1),
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Triangle":
            if not field2 <= amount <= field3:
                logger.warning(
                    "Invalid triangular distribution: {amount}|{field2}|{field3}",
                    amount=amount,
                    field2=field2,
                    field3=field3,
                )
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": TriangularUncertainty.id,
                "minimum": field2,
                "maximum": field3,
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Uniform":
            if not field2 <= amount <= field3:
                logger.warning(
                    "Invalid uniform distribution: {amount}|{field2}|{field3}",
                    amount=amount,
                    field2=field2,
                    field3=field3,
                )
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": UniformUncertainty.id,
                "minimum": field2,
                "maximum": field3,
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        raise ValueError(f"Unknown uncertainty type: {kind}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from typing import List | ||
|
||
from ..utils import asboolean | ||
from .base import SimaProCSVUncertainBlock | ||
|
||
|
||
class GlobalInputParameters(SimaProCSVUncertainBlock):
    def __init__(self, block: List[list], header: dict):
        """Parse a `Database Input Parameters` block.

        Columns of each line:

        0. name
        1. value (not formula)
        2. uncertainty type
        3. uncert. param.
        4. uncert. param.
        5. uncert. param.
        6. hidden ("Yes" or "No")
        7-X. comment (can include multiple elements)

        The block header label is already stripped. Lines whose elements are
        all blank are skipped; every other line becomes one dictionary in
        ``self.parsed``."""
        self.parsed = []

        for line in block:
            if all(not elem.strip() for elem in line):
                continue

            # Columns 1-5 describe the value and its uncertainty
            record = {
                **self.distribution(*line[1:6], header=header),
                "name": line[0],
                "hidden": asboolean(line[6]),
                "comment": "\n".join(filter(None, line[7:])),
            }
            self.parsed.append(record)
|
||
|
||
class DatabaseInputParameters(GlobalInputParameters):
    """Database input parameters block; parsing is inherited unchanged from `GlobalInputParameters`."""
|
||
|
||
class ProjectInputParameters(GlobalInputParameters):
    """Project input parameters block; parsing is inherited unchanged from `GlobalInputParameters`."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
import re | ||
from typing import List | ||
|
||
from loguru import logger | ||
|
||
from ..utils import asboolean, asdate | ||
from .base import SimaProCSVUncertainBlock | ||
|
||
# Section labels whose rows include a subcategory column
HAS_SUBCATEGORY = {
    "Economic issues",
    "Emissions to air",
    "Emissions to soil",
    "Emissions to water",
    "Final waste flows",
    "Non material emissions",
    "Resources",
    "Social issues",
}
# Section labels listed as having no subcategory column
NO_SUBCATEGORY = {
    "Avoided products",
    "Electricity/heat",
    "Materials/fuels",
    "Products",
    "Waste to treatment",
}
# Every label that starts a list section: both groups above plus
# "Waste treatment", which belongs to neither group.
LIST_ELEMENTS = HAS_SUBCATEGORY | NO_SUBCATEGORY | {"Waste treatment"}
# Labels of the parameter sections
PARAMETERS = {
    "Calculated parameters",
    "Input parameters",
}
|
||
|
||
# Exclude only `e`/`E`, which appear as the exponent marker in numbers such as
# `1e5`; any other ASCII letter (including `D`) marks the text as a formula.
has_letters = re.compile(r"[a-df-zA-DF-Z]+")
has_numbers = re.compile(r"[0-9]+")
|
||
|
||
class Process(SimaProCSVUncertainBlock):
    """A life cycle inventory process, with inputs, products, and elementary exchanges.

    Attributes set by ``__init__``:
        parsed: ``{"metadata": {...}}`` filled from the leading key/value pairs.
        raw: Unresolved section rows, keyed by section label.
        index: Cursor into the block, advanced by the ``pull_*`` methods.
        unit_first: ``None`` until ``resolve_unit_amount`` sees an unambiguous
            unit/amount pair, then ``True`` or ``False``.
    """

    def __init__(self, block: List[list], header: dict):
        """Read the metadata pairs, then the raw sections, from `block`.

        Args:
            block: Lines of the process block, each a list of cell values.
            header: File header; ``header["dayfirst"]`` is read when a
                ``Date`` metadata entry is present.
        """
        self.parsed = {"metadata": {}}
        self.raw = {}
        self.index = 0
        self.unit_first = None

        self._skip_empty_lines(block)

        # Start with metadata. This is stored as:
        # Key
        # Value
        # On separate lines. Also, sometimes Value is missing.
        while self.index < len(block) and block[self.index][0] not in LIST_ELEMENTS:
            k, v = self.pull_metadata_pair(block, header)
            self.parsed["metadata"][k] = v

        # These sections need access to the global variable store
        # before they can be resolved
        while self.index < len(block):
            k, v = self.pull_raw_section(block)
            self.raw[k] = v

    def _skip_empty_lines(self, block: List[list]) -> None:
        """Advance ``self.index`` past empty lines without running off the end of `block`."""
        while self.index < len(block) and not any(block[self.index]):
            self.index += 1

    def pull_raw_section(self, block: List[list]) -> "tuple[str, list]":
        """Consume one section starting at ``self.index`` and return ``(label, rows)``.

        Row layout:

        0. name
        1. subcategory
        2. unit (or value)
        3. value or formula (or unit)
        4. uncertainty type
        5. uncert. param.
        6. uncert. param.
        7. uncert. param.
        8. comment

        However, sometimes the value is in index 2, and the unit in index 3. Because why not!
        We assume default ordering unless we find a number in index 2.

        Only sections listed in ``HAS_SUBCATEGORY`` produce rows; for any other
        label the lines are consumed and an empty list is returned. The comment
        column is not captured.
        """
        key = block[self.index][0]
        data = []

        self.index += 1

        while self.index < len(block) and any(block[self.index]):
            if key in HAS_SUBCATEGORY:
                # Unpack the row itself: pad short rows with empty strings and
                # drop anything after the eighth column.
                cells = (list(block[self.index]) + [""] * 8)[:8]
                name, subcategory, maybe_unit, maybe_value, kind, field1, field2, field3 = cells
                data.append(
                    {
                        "name": name,
                        "categories": (key, subcategory),
                        "maybe_unit": maybe_unit,
                        "maybe_value": maybe_value,
                        "kind": kind,
                        "field1": field1,
                        "field2": field2,
                        "field3": field3,
                    }
                )
            self.index += 1

        # Skip the empty line(s) ending this section
        self._skip_empty_lines(block)

        return key, data

    def resolve_unit_amount(self, a: str, b: str) -> dict:
        """Determine the unit and amount fields as accurately as possible.

        Args:
            a: First candidate cell (normally the unit).
            b: Second candidate cell (normally the amount or formula).

        Returns:
            ``{"unit": ..., "formula": ...}`` when the amount contains letters
            other than the exponent marker, otherwise
            ``{"unit": ..., "amount": float}``.

        Raises:
            ValueError: If the amount has no such letters but ``float`` cannot
                parse it.
        """
        # Normally the unit comes first
        if not has_numbers.search(a) and has_numbers.search(b):
            unit, amount = a, b
            self.unit_first = True
        elif has_numbers.search(a) and not has_numbers.search(b):
            unit, amount = b, a
            self.unit_first = False
        # The amount could be a formula with only a variable
        # We don't handle this case for now
        else:
            # The section label is not available here, so only the pair is reported
            logger.warning("Ambiguous unit/value pair: '{a}' and '{b}'", a=a, b=b)
            unit, amount = a, b
        if has_letters.search(amount):
            return {"unit": unit, "formula": amount}
        # TBD: Evaluate number
        return {"unit": unit, "amount": float(amount)}

    def pull_metadata_pair(self, block: List[list], header: dict) -> "tuple[str, str]":
        """Consume one metadata key and its value starting at ``self.index``.

        ``Literature references`` yields a list of ``{"reference", "comment"}``
        dictionaries, one per following non-empty line. ``Date`` and
        ``Infrastructure`` pass the first cell of the next line to ``asdate``
        and ``asboolean``. Any other key joins the non-empty cells of the next
        line with ``" -||- "``, or gives ``""`` when there is no value.

        Raises:
            IndexError: If a ``Date`` or ``Infrastructure`` key has no value line.
            KeyError: If a ``Date`` key is present and `header` lacks ``"dayfirst"``.
        """
        key = block[self.index][0]

        if key == "Literature references":
            self.index += 1
            value = []
            while self.index < len(block) and any(block[self.index]):
                reference = {"reference": block[self.index][0]}
                if len(block[self.index]) > 1:
                    reference["comment"] = block[self.index][1]
                value.append(reference)
                self.index += 1
        elif key == "Date":
            value = asdate(block[self.index + 1][0], dayfirst=header["dayfirst"])
            self.index += 2
        elif key == "Infrastructure":
            value = asboolean(block[self.index + 1][0])
            self.index += 2
        else:
            # The value line may be missing entirely when the key is the last line
            value_line = block[self.index + 1] if self.index + 1 < len(block) else []
            value = " -||- ".join([elem for elem in value_line if elem])
            self.index += 2

        # Skip empty lines until next pair. Should only be one line, but life can be surprising
        self._skip_empty_lines(block)

        return key, value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
class IndeterminateBlockEnd(Exception):
    """Exception type for an indeterminate block end (meaning taken from the name; no raise site is visible here)."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.