Skip to content

Commit

Permalink
Extract validated blocks and convert as needed (#2)
Browse files Browse the repository at this point in the history
* SimaPro CSV files are structured into blocks, using control statements
  to start and end the blocks (but not always, that would be like
  playing on easy mode). This PR sets out the architecture for
  extracting these blocks, and includes the metadata block and partial
  implementation of input parameters and process datasets.

* The biggest challenge is parsing formulas as strings, as there is no
  spec. We build on the existing implementation with more tests and
  fewer assumptions.

Signed-off-by: Chris Mutel <[email protected]>
Co-authored-by: João Gonçalves <[email protected]>
  • Loading branch information
cmutel and jsvgoncalves committed Apr 30, 2024
1 parent 4f5222e commit 60aea55
Show file tree
Hide file tree
Showing 15 changed files with 666 additions and 16 deletions.
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,6 @@ repos:
# --remove-unused-variables,
# ]

# https://github.com/PyCQA/flake8/issues/234
- repo: https://github.com/john-hen/Flake8-pyproject
rev: 1.2.3
hooks:
- id: Flake8-pyproject

- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
Expand Down
3 changes: 0 additions & 3 deletions bw_simapro_csv/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
"""bw_simapro_csv."""

# Public names exported by `from bw_simapro_csv import *`.
# NOTE(review): "SimaProCSV" is listed here but no import of it is visible in this
# fragment — confirm it is still bound in this module.
__all__ = (
    "__version__",
    "SimaProCSV",
    # Add functions and variables you want exposed in `bw_simapro_csv.` namespace here
)

# Package version string.
__version__ = "0.0.1"
Expand Down
13 changes: 13 additions & 0 deletions bw_simapro_csv/blocks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Public names exported by `from bw_simapro_csv.blocks import *`; each one is
# imported from a submodule of this package further down in this file.
__all__ = (
    "DatabaseInputParameters",
    "EmptyBlock",
    "Process",
    "ProjectInputParameters",
    "SimaProCSVBlock",
    "SimaProCSVUncertainBlock",
)


from .base import EmptyBlock, SimaProCSVBlock, SimaProCSVUncertainBlock
from .parameters import DatabaseInputParameters, ProjectInputParameters
from .process import Process
104 changes: 104 additions & 0 deletions bw_simapro_csv/blocks/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# pylint: disable=too-many-arguments,unused-argument,too-many-return-statements
import math

from loguru import logger
from stats_arrays import (
LognormalUncertainty,
NormalUncertainty,
TriangularUncertainty,
UndefinedUncertainty,
UniformUncertainty,
)

from ..utils import asnumber


class SimaProCSVBlock:
    """Common ancestor of every parsed block of a SimaPro CSV file.

    Defines no behaviour itself; subclasses implement the parsing and
    cleaning of one logical block each.
    """


class EmptyBlock(SimaProCSVBlock):
    """Block type used for a block that carries no content."""


class SimaProCSVUncertainBlock(SimaProCSVBlock):
    """Base class which includes logic for parsing lines with probability distributions"""

    def undefined_distribution(self, amount: float) -> dict:
        """Return an undefined-uncertainty record for `amount`.

        Both `loc` and `amount` are set to the given value. This is also the
        fallback used when a declared distribution has invalid parameters.
        """
        return {
            "uncertainty type": UndefinedUncertainty.id,
            "loc": amount,
            "amount": amount,
        }

    def distribution(
        self, amount: str, kind: str, field1: str, field2: str, field3: str, header: dict, **kwargs
    ) -> dict:
        """Convert the raw uncertainty columns of a line into a `stats_arrays`-style dict.

        Args:
            amount: The value as a string.
            kind: Uncertainty type label; one of "Undefined", "Lognormal",
                "Normal", "Triangle" or "Uniform".
            field1: Spread parameter; used by "Lognormal" and "Normal".
            field2: Minimum; used by "Triangle" and "Uniform".
            field3: Maximum; used by "Triangle" and "Uniform".
            header: File header; its optional `decimal_separator` entry
                (default ".") controls number parsing.
            **kwargs: Ignored; accepted so callers can pass extra columns.

        Returns:
            A dict with at least "uncertainty type", "loc" and "amount". If the
            distribution parameters are invalid a warning is logged and an
            undefined-uncertainty dict is returned instead.

        Raises:
            ValueError: If any value cannot be converted to a number, or if
                `kind` is not a known uncertainty type.
        """
        decimal_separator = header.get("decimal_separator", ".")

        try:
            amount = asnumber(value=amount, decimal_separator=decimal_separator)
            field1 = asnumber(value=field1, decimal_separator=decimal_separator)
            field2 = asnumber(value=field2, decimal_separator=decimal_separator)
            field3 = asnumber(value=field3, decimal_separator=decimal_separator)
        except ValueError as exc:
            raise ValueError(
                f"""
Can't convert uncertainty data to numbers:
Uncertainty type: {kind}
Amount: {amount}
Field1: {field1}
Field2: {field2}
Field3: {field3}
"""
            ) from exc

        if kind == "Undefined":
            return self.undefined_distribution(amount)
        if kind == "Lognormal":
            # `loc` is log(|amount|), so a zero amount cannot be represented
            if not amount or field1 <= 0:
                # f-string: loguru only interpolates when format arguments are passed
                logger.warning(f"Invalid lognormal distribution: {amount}|{field1}")
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": LognormalUncertainty.id,
                "scale": math.log(math.sqrt(field1)),
                "loc": math.log(abs(amount)),
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Normal":
            # NOTE(review): a zero amount is rejected here as for lognormal — confirm intended
            if not amount or field1 <= 0:
                logger.warning(f"Invalid normal distribution: {amount}|{field1}")
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": NormalUncertainty.id,
                "scale": math.sqrt(field1),
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Triangle":
            if not field2 <= amount <= field3:
                logger.warning(f"Invalid triangular distribution: {amount}|{field2}|{field3}")
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": TriangularUncertainty.id,
                "minimum": field2,
                "maximum": field3,
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        if kind == "Uniform":
            if not field2 <= amount <= field3:
                logger.warning(f"Invalid uniform distribution: {amount}|{field2}|{field3}")
                return self.undefined_distribution(amount)
            return {
                "uncertainty type": UniformUncertainty.id,
                "minimum": field2,
                "maximum": field3,
                "loc": amount,
                "negative": amount < 0,
                "amount": amount,
            }
        raise ValueError(f"Unknown uncertainty type: {kind}")
43 changes: 43 additions & 0 deletions bw_simapro_csv/blocks/parameters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import List

from ..utils import asboolean
from .base import SimaProCSVUncertainBlock


class GlobalInputParameters(SimaProCSVUncertainBlock):
    def __init__(self, block: List[list], header: dict):
        """Parse a `Database Input Parameters` block.

        Columns of each line:

        0. name
        1. value (not formula)
        2. uncertainty type
        3. uncert. param.
        4. uncert. param.
        5. uncert. param.
        6. hidden ("Yes" or "No")
        7-X. comment (can include multiple elements)

        The block header label is already stripped. Lines whose cells are all
        blank are ignored; each remaining line becomes one dict in
        `self.parsed`.
        """
        self.parsed = []

        for row in block:
            # A row made up solely of whitespace cells carries no parameter
            if all(not cell.strip() for cell in row):
                continue

            # Columns 1-5 describe the value and its uncertainty
            uncertainty = self.distribution(*row[1:6], header=header)
            remarks = [cell for cell in row[7:] if cell]
            extras = {
                "name": row[0],
                "hidden": asboolean(row[6]),
                "comment": "\n".join(remarks),
            }
            self.parsed.append(uncertainty | extras)


class DatabaseInputParameters(GlobalInputParameters):
    """Block type for database input parameters; parsing is inherited unchanged."""


class ProjectInputParameters(GlobalInputParameters):
    """Block type for project input parameters; parsing is inherited unchanged."""
170 changes: 170 additions & 0 deletions bw_simapro_csv/blocks/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import re
from typing import List

from loguru import logger

from ..utils import asboolean, asdate
from .base import SimaProCSVUncertainBlock

# Section labels whose lines carry a subcategory column
HAS_SUBCATEGORY = {
    "Economic issues",
    "Emissions to air",
    "Emissions to soil",
    "Emissions to water",
    "Final waste flows",
    "Non material emissions",
    "Resources",
    "Social issues",
}
# Section labels whose lines have no subcategory column
NO_SUBCATEGORY = {
    "Avoided products",
    "Electricity/heat",
    "Materials/fuels",
    "Products",
    "Waste to treatment",
}
# Every label that starts a list section inside a process block.
# "Waste treatment" belongs to neither of the two groups above.
LIST_ELEMENTS = HAS_SUBCATEGORY | NO_SUBCATEGORY | {"Waste treatment"}
# Section labels that start parameter definitions
PARAMETERS = {
    "Calculated parameters",
    "Input parameters",
}


# Matches any ASCII letter except `e`/`E`, which are left out because they can
# be the exponent marker of a plain number such as `1e-5` or `1E5`.
has_letters = re.compile(r"[a-df-zA-DF-Z]+")
# Matches any run of ASCII digits
has_numbers = re.compile(r"[0-9]+")


class Process(SimaProCSVUncertainBlock):
    """A life cycle inventory process, with inputs, products, and elementary exchanges"""

    def __init__(self, block: List[list], header: dict):
        """Split a process block into parsed metadata and raw exchange sections.

        Args:
            block: Lines of the process block, each a list of cell strings.
            header: File header; `header["dayfirst"]` is used when parsing dates.

        Afterwards `self.parsed["metadata"]` maps metadata keys to values and
        `self.raw` maps section labels to their unresolved lines.
        """
        self.parsed = {"metadata": {}}
        self.raw = {}
        self.index = 0  # Cursor into `block`, advanced by the `pull_*` methods
        self.unit_first = None

        self._skip_empty_lines(block)

        # Start with metadata. This is stored as:
        # Key
        # Value
        # On separate lines. Also, sometimes Value is missing.
        while self.index < len(block) and block[self.index][0] not in LIST_ELEMENTS:
            k, v = self.pull_metadata_pair(block, header)
            self.parsed["metadata"][k] = v

        # These sections need access to the global variable store
        # before they can be resolved
        while self.index < len(block):
            k, v = self.pull_raw_section(block)
            self.raw[k] = v

    def _skip_empty_lines(self, block: List[list]) -> None:
        """Advance `self.index` past consecutive empty lines, stopping at the end of `block`."""
        while self.index < len(block) and not any(block[self.index]):
            self.index += 1

    def pull_raw_section(self, block: List[list]) -> (str, list):
        """Read one list section starting at `self.index` and return `(label, lines)`.

        Each line of a section has the form:

        0. name
        1. subcategory
        2. unit (or value)
        3. value or formula (or unit)
        4. uncertainty type
        5. uncert. param.
        6. uncert. param.
        7. uncert. param.
        8. comment

        However, sometimes the value is in index 2, and the unit in index 3. Because why not!
        The ambiguous columns are therefore stored as `maybe_unit` and
        `maybe_value` and resolved later.

        Only sections listed in `HAS_SUBCATEGORY` are collected for now; lines
        of other sections are consumed but produce no data.
        """
        key = block[self.index][0]
        data = []

        self.index += 1

        while self.index < len(block) and any(block[self.index]):
            if key in HAS_SUBCATEGORY:
                # Use exactly the first eight columns; pad short lines so that
                # unpacking cannot fail, and ignore the trailing comment cells
                fields = list(block[self.index][:8])
                fields += [""] * (8 - len(fields))
                (
                    name,
                    subcategory,
                    maybe_unit,
                    maybe_value,
                    kind,
                    field1,
                    field2,
                    field3,
                ) = fields
                data.append(
                    {
                        "name": name,
                        "categories": (key, subcategory),
                        "maybe_unit": maybe_unit,
                        "maybe_value": maybe_value,
                        "kind": kind,
                        "field1": field1,
                        "field2": field2,
                        "field3": field3,
                    }
                )
            self.index += 1

        # Skip the empty line(s) ending this section, so that the next call
        # starts on a section label instead of a blank line
        self._skip_empty_lines(block)

        return key, data

    def resolve_unit_amount(self, a: str, b: str) -> dict:
        """Determine the unit and amount fields as accurately as possible.

        Returns a dict with "unit" plus either "formula" (when the amount
        contains letters) or "amount" (a float). `self.unit_first` records the
        column order when it can be determined unambiguously.
        """
        # Normally the unit comes first
        if not has_numbers.search(a) and has_numbers.search(b):
            unit, amount = a, b
            self.unit_first = True
        elif has_numbers.search(a) and not has_numbers.search(b):
            unit, amount = b, a
            self.unit_first = False
        # The amount could be a formula with only a variable
        # We don't handle this case for now
        else:
            logger.warning(f"Ambiguous unit/value pair: '{a}' and '{b}'")
            unit, amount = a, b
        if has_letters.search(amount):
            return {"unit": unit, "formula": amount}
        # TBD: Evaluate number
        return {"unit": unit, "amount": float(amount)}

    def pull_metadata_pair(self, block: List[list], header: dict) -> (str, str):
        """Read the metadata key at `self.index` together with its value.

        The value type depends on the key: a list of reference dicts for
        "Literature references", the result of `asdate` for "Date", the result
        of `asboolean` for "Infrastructure", and otherwise the non-empty cells
        of the value line joined with " -||- " (or "" when the value is missing).
        On return `self.index` points at the next non-empty line, or at the
        end of `block`.
        """
        key = block[self.index][0]

        if key == "Literature references":
            self.index += 1
            value = []
            while self.index < len(block) and any(block[self.index]):
                reference = {"reference": block[self.index][0]}
                if len(block[self.index]) > 1:
                    reference["comment"] = block[self.index][1]
                value.append(reference)
                self.index += 1
        elif key == "Date":
            value = asdate(block[self.index + 1][0], dayfirst=header["dayfirst"])
            self.index += 2
        elif key == "Infrastructure":
            value = asboolean(block[self.index + 1][0])
            self.index += 2
        else:
            # The value line may be missing entirely when the key is the last line
            value_line = block[self.index + 1] if self.index + 1 < len(block) else []
            value = " -||- ".join([elem for elem in value_line if elem]) if value_line else ""
            self.index += 2

        # Skip empty lines until next pair. Should only be one line, but life can be surprising
        self._skip_empty_lines(block)

        return key, value
2 changes: 2 additions & 0 deletions bw_simapro_csv/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class IndeterminateBlockEnd(Exception):
pass
4 changes: 2 additions & 2 deletions bw_simapro_csv/header.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import warnings
from datetime import datetime
from enum import Enum
from typing import List, Optional

from dateutil import parser
from loguru import logger
from pydantic import BaseModel

from .utils import asboolean, nobraces, noquotes
Expand Down Expand Up @@ -108,7 +108,7 @@ def parse_header(data: List[str]) -> SimaProCSVHeader:
elif line.startswith("Library '"):
parsed["libraries"].append(noquotes(line[len("Library") :].strip()))
else:
warnings.warn(f"Can't understand header line (skipping):\n\t{line}")
logger.warning(f"Can't understand header line (skipping):\n\t{line}")

dayfirst = not (
date
Expand Down
Loading

0 comments on commit 60aea55

Please sign in to comment.