Extract validated blocks and convert as needed (#2)

* SimaPro CSV files are structured into blocks, using control statements to start and end the blocks (but not always, that would be like playing on easy mode). This PR sets out the architecture for extracting these blocks, and includes the metadata block and partial implementation of input parameters and process datasets. * The biggest challenge is parsing formulas as strings, as there is no spec. We build on the existing implementation with more tests and fewer assumptions. Signed-off-by: Chris Mutel <[email protected]> Co-authored-by: João Gonçalves <[email protected]>
brightway-lca · Apr 30, 2024 · 60aea55 · 60aea55
1 parent 4f5222e
commit 60aea55
Show file tree

Hide file tree

Showing 15 changed files with 666 additions and 16 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -30,12 +30,6 @@ repos:
 #       --remove-unused-variables,
 #     ]
 
-# https://github.com/PyCQA/flake8/issues/234
-- repo: https://github.com/john-hen/Flake8-pyproject
-  rev: 1.2.3
-  hooks:
-    - id: Flake8-pyproject
-
 - repo: https://github.com/pycqa/isort
   rev: 5.13.2
   hooks:

diff --git a/bw_simapro_csv/__init__.py b/bw_simapro_csv/__init__.py
@@ -1,9 +1,6 @@
-"""bw_simapro_csv."""
-
 __all__ = (
     "__version__",
     "SimaProCSV",
-    # Add functions and variables you want exposed in `bw_simapro_csv.` namespace here
 )
 
 __version__ = "0.0.1"

diff --git a/bw_simapro_csv/blocks/__init__.py b/bw_simapro_csv/blocks/__init__.py
@@ -0,0 +1,13 @@
+# Names exported by the `blocks` subpackage; each one is imported from a submodule
+# further down in this module.
+__all__ = (
+    "DatabaseInputParameters",
+    "EmptyBlock",
+    "Process",
+    "ProjectInputParameters",
+    "SimaProCSVBlock",
+    "SimaProCSVUncertainBlock",
+)
+
+
+from .base import EmptyBlock, SimaProCSVBlock, SimaProCSVUncertainBlock
+from .parameters import DatabaseInputParameters, ProjectInputParameters
+from .process import Process
diff --git a/bw_simapro_csv/blocks/base.py b/bw_simapro_csv/blocks/base.py
@@ -0,0 +1,104 @@
+# pylint: disable=too-many-arguments,unused-argument,too-many-return-statements
+import math
+
+from loguru import logger
+from stats_arrays import (
+    LognormalUncertainty,
+    NormalUncertainty,
+    TriangularUncertainty,
+    UndefinedUncertainty,
+    UniformUncertainty,
+)
+
+from ..utils import asnumber
+
+
+class SimaProCSVBlock:
+    """Base class for parsing and cleaning logical blocks in a SimaPro CSV file.
+
+    Defines no attributes or methods of its own; it only serves as the common ancestor
+    of the concrete block classes.
+    """
+
+
+class EmptyBlock(SimaProCSVBlock):
+    """An empty block without content.
+
+    Adds nothing to `SimaProCSVBlock`. NOTE(review): presumably a placeholder for blocks
+    which have a label but no data lines - confirm against the block extraction code.
+    """
+
+
+class SimaProCSVUncertainBlock(SimaProCSVBlock):
+    """Base class which includes logic for parsing lines with probability distributions"""
+
+    def undefined_distribution(self, amount: float) -> dict:
+        """Return an uncertainty dictionary without a distribution, using `amount` as `loc`."""
+        return {
+            "uncertainty type": UndefinedUncertainty.id,
+            "loc": amount,
+            "amount": amount,
+        }
+
+    def distribution(
+        self, amount: str, kind: str, field1: str, field2: str, field3: str, header: dict, **kwargs
+    ) -> dict:
+        """Convert the uncertainty fields of a line to a `stats_arrays`-style dictionary.
+
+        Parameters
+        ----------
+        amount, field1, field2, field3
+            Raw strings; converted to numbers with `asnumber`, using the decimal separator
+            from `header["decimal_separator"]` (default ".").
+        kind
+            One of "Undefined", "Lognormal", "Normal", "Triangle", or "Uniform".
+        header
+            Parsed file header; only "decimal_separator" is read here.
+        **kwargs
+            Accepted and ignored.
+
+        Returns
+        -------
+        A dictionary with at least "uncertainty type", "loc", and "amount". Distributions
+        whose parameters fail the validity checks below are logged as warnings and returned
+        as undefined uncertainty instead of raising.
+
+        Raises
+        ------
+        ValueError
+            If a field can't be converted to a number, or if `kind` is not recognized.
+        """
+        decimal_separator = header.get("decimal_separator", ".")
+
+        try:
+            amount = asnumber(value=amount, decimal_separator=decimal_separator)
+            field1 = asnumber(value=field1, decimal_separator=decimal_separator)
+            field2 = asnumber(value=field2, decimal_separator=decimal_separator)
+            field3 = asnumber(value=field3, decimal_separator=decimal_separator)
+        except ValueError as exc:
+            # Fields converted before the failing one are shown as numbers, the rest as
+            # the original strings.
+            raise ValueError(
+                f"""
+Can't convert uncertainty data to numbers:
+    Uncertainty type: {kind}
+    Amount: {amount}
+    Field1: {field1}
+    Field2: {field2}
+    Field3: {field3}
+    """
+            ) from exc
+
+        if kind == "Undefined":
+            return self.undefined_distribution(amount)
+        if kind == "Lognormal":
+            # `log(abs(amount))` needs a non-zero amount, `sqrt` a positive `field1`.
+            # NOTE(review): 0 < field1 <= 1 passes this check but yields a scale <= 0 -
+            # confirm whether such values should be rejected as well.
+            if not amount or field1 <= 0:
+                logger.warning(f"Invalid lognormal distribution: {amount}|{field1}")
+                return self.undefined_distribution(amount)
+            return {
+                "uncertainty type": LognormalUncertainty.id,
+                "scale": math.log(math.sqrt(field1)),
+                "loc": math.log(abs(amount)),
+                "negative": amount < 0,
+                "amount": amount,
+            }
+        if kind == "Normal":
+            # NOTE(review): a zero amount is rejected here the same way as for lognormal -
+            # confirm this is intended for the normal distribution.
+            if not amount or field1 <= 0:
+                logger.warning(f"Invalid normal distribution: {amount}|{field1}")
+                return self.undefined_distribution(amount)
+            return {
+                "uncertainty type": NormalUncertainty.id,
+                "scale": math.sqrt(field1),
+                "loc": amount,
+                "negative": amount < 0,
+                "amount": amount,
+            }
+        if kind == "Triangle":
+            # `field2` is the minimum and `field3` the maximum; the amount must lie between.
+            if not field2 <= amount <= field3:
+                logger.warning(f"Invalid triangular distribution: {amount}|{field2}|{field3}")
+                return self.undefined_distribution(amount)
+            return {
+                "uncertainty type": TriangularUncertainty.id,
+                "minimum": field2,
+                "maximum": field3,
+                "loc": amount,
+                "negative": amount < 0,
+                "amount": amount,
+            }
+        if kind == "Uniform":
+            if not field2 <= amount <= field3:
+                logger.warning(f"Invalid uniform distribution: {amount}|{field2}|{field3}")
+                return self.undefined_distribution(amount)
+            return {
+                "uncertainty type": UniformUncertainty.id,
+                "minimum": field2,
+                "maximum": field3,
+                "loc": amount,
+                "negative": amount < 0,
+                "amount": amount,
+            }
+        raise ValueError(f"Unknown uncertainty type: {kind}")
diff --git a/bw_simapro_csv/blocks/parameters.py b/bw_simapro_csv/blocks/parameters.py
@@ -0,0 +1,43 @@
+from typing import List
+
+from ..utils import asboolean
+from .base import SimaProCSVUncertainBlock
+
+
+class GlobalInputParameters(SimaProCSVUncertainBlock):
+    def __init__(self, block: List[list], header: dict):
+        """Build `self.parsed` from the lines of an input parameters block.
+
+        Layout of each line (the block header label is already stripped):
+
+        0. name
+        1. value (not formula)
+        2. uncertainty type
+        3-5. uncertainty parameters
+        6. hidden ("Yes" or "No")
+        7-X. comment (can include multiple elements)
+
+        Lines whose elements are all empty or whitespace are ignored. Each remaining line
+        becomes one dictionary: the result of `distribution` for elements 1-5, extended
+        with "name", "hidden", and "comment" (non-empty comment elements joined by
+        newlines)."""
+        self.parsed = []
+
+        for row in block:
+            has_content = any(cell.strip() for cell in row)
+            if not has_content:
+                continue
+
+            uncertainty = self.distribution(*row[1:6], header=header)
+            extras = {
+                "name": row[0],
+                "hidden": asboolean(row[6]),
+                "comment": "\n".join(filter(None, row[7:])),
+            }
+            self.parsed.append(uncertainty | extras)
+
+
+class DatabaseInputParameters(GlobalInputParameters):
+    """Input parameters block at the database level.
+
+    Parsing is inherited unchanged from `GlobalInputParameters`."""
+
+    pass
+
+
+class ProjectInputParameters(GlobalInputParameters):
+    """Input parameters block at the project level.
+
+    Parsing is inherited unchanged from `GlobalInputParameters`."""
+
+    pass
diff --git a/bw_simapro_csv/blocks/process.py b/bw_simapro_csv/blocks/process.py
@@ -0,0 +1,170 @@
+import re
+from typing import List
+
+from loguru import logger
+
+from ..utils import asboolean, asdate
+from .base import SimaProCSVUncertainBlock
+
+# Section labels which end the metadata part of a process block: `Process.__init__` reads
+# metadata pairs until the first element of a line is one of these labels.
+LIST_ELEMENTS = {
+    "Avoided products",
+    "Economic issues",
+    "Electricity/heat",
+    "Emissions to air",
+    "Emissions to soil",
+    "Emissions to water",
+    "Final waste flows",
+    "Materials/fuels",
+    "Non material emissions",
+    "Products",
+    "Resources",
+    "Social issues",
+    "Waste to treatment",
+    "Waste treatment",
+}
+# Sections whose lines are parsed by `Process.pull_raw_section`, which reads a subcategory
+# from the second field of each line.
+HAS_SUBCATEGORY = {
+    "Economic issues",
+    "Emissions to air",
+    "Emissions to soil",
+    "Emissions to water",
+    "Final waste flows",
+    "Non material emissions",
+    "Resources",
+    "Social issues",
+}
+# NOTE(review): not referenced in the code of this module yet. "Waste treatment" from
+# LIST_ELEMENTS is in neither HAS_SUBCATEGORY nor NO_SUBCATEGORY - confirm this is intended.
+NO_SUBCATEGORY = {
+    "Avoided products",
+    "Electricity/heat",
+    "Materials/fuels",
+    "Products",
+    "Waste to treatment",
+}
+# Labels of parameter sections. NOTE(review): not referenced in the code of this module yet.
+PARAMETERS = {
+    "Calculated parameters",
+    "Input parameters",
+}
+
+
+# Matches any ASCII letter except `e`/`E`, which are excluded because they can be the
+# exponent marker of a number such as `1e-5`. Used to tell formulas from plain numbers.
+has_letters = re.compile(r"[a-df-zA-DF-Z]+")
+# Matches any run of ASCII digits.
+has_numbers = re.compile(r"[0-9]+")
+
+
+class Process(SimaProCSVUncertainBlock):
+    """A life cycle inventory process, with inputs, products, and elementary exchanges"""
+
+    def __init__(self, block: List[list], header: dict):
+        """Split a process block into parsed metadata and raw exchange sections.
+
+        Parameters
+        ----------
+        block
+            The lines of the process block, each line a list of strings.
+        header
+            Parsed file header; `header["dayfirst"]` is read when a "Date" entry is parsed.
+
+        After construction, `self.parsed["metadata"]` maps metadata labels to values and
+        `self.raw` maps section labels to lists of unresolved line dictionaries.
+        """
+        self.parsed = {"metadata": {}}
+        self.raw = {}
+        # Position of the next unread line in `block`; advanced by the `pull_*` methods.
+        self.index = 0
+        # Set by `resolve_unit_amount` once the unit/amount column order is known.
+        self.unit_first = None
+
+        self._skip_empty_lines(block)
+
+        # Start with metadata. This is stored as:
+        # Key
+        # Value
+        # On separate lines. Also, sometimes Value is missing.
+        while self.index < len(block) and block[self.index][0] not in LIST_ELEMENTS:
+            k, v = self.pull_metadata_pair(block, header)
+            self.parsed["metadata"][k] = v
+
+        # These sections need access to the global variable store
+        # before they can be resolved
+        while self.index < len(block):
+            k, v = self.pull_raw_section(block)
+            self.raw[k] = v
+
+    def _skip_empty_lines(self, block: List[list]) -> None:
+        """Advance `self.index` past empty lines, stopping at the end of `block`."""
+        while self.index < len(block) and not any(block[self.index]):
+            self.index += 1
+
+    def pull_raw_section(self, block: List[list]) -> (str, list):
+        """Read one section, starting at its label line, and return `(label, lines)`.
+
+        Each line of a section has the form:
+
+        0. name
+        1. subcategory
+        2. unit (or value)
+        3. value or formula (or unit)
+        4. uncertainty type
+        5. uncert. param.
+        6. uncert. param.
+        7. uncert. param.
+        8. comment
+
+        However, sometimes the value is in index 2, and the unit in index 3. Because why not!
+        The two fields are therefore stored as `maybe_unit` and `maybe_value`, and resolved
+        later.
+
+        Only sections in `HAS_SUBCATEGORY` are parsed so far; the lines of other sections
+        are consumed but produce no data. `self.index` ends on the first non-empty line
+        after the section, or at the end of the block.
+
+        Raises `ValueError` if a parsed line has fewer than eight fields.
+        """
+        key = block[self.index][0]
+        data = []
+
+        self.index += 1
+
+        while self.index < len(block) and any(block[self.index]):
+            if key in HAS_SUBCATEGORY:
+                line = block[self.index]
+                if len(line) < 8:
+                    raise ValueError(
+                        f"Expected at least 8 fields in section '{key}', got {len(line)}: {line}"
+                    )
+                # The comment (index 8 and later) is not stored yet
+                name, subcategory, maybe_unit, maybe_value, kind, field1, field2, field3 = line[:8]
+                data.append(
+                    {
+                        "name": name,
+                        "categories": (key, subcategory),
+                        "maybe_unit": maybe_unit,
+                        "maybe_value": maybe_value,
+                        "kind": kind,
+                        "field1": field1,
+                        "field2": field2,
+                        "field3": field3,
+                    }
+                )
+            self.index += 1
+
+        # Skip empty line(s) ending this section
+        self._skip_empty_lines(block)
+
+        return key, data
+
+    def resolve_unit_amount(self, a: str, b: str) -> dict:
+        """Determine the unit and amount fields as accurately as possible.
+
+        The field containing digits is taken as the amount, the other as the unit, and
+        `self.unit_first` records which order was found. If both or neither contain digits
+        the pair is ambiguous: a warning is logged, `a` is used as the unit, and
+        `self.unit_first` is left unchanged.
+
+        Returns `{"unit": ..., "formula": ...}` if the amount contains letters other than
+        `e`/`E`, otherwise `{"unit": ..., "amount": float(amount)}`.
+        """
+        # Normally the unit comes first
+        if not has_numbers.search(a) and has_numbers.search(b):
+            unit, amount = a, b
+            self.unit_first = True
+        elif has_numbers.search(a) and not has_numbers.search(b):
+            unit, amount = b, a
+            self.unit_first = False
+        # The amount could be a formula with only a variable
+        # We don't handle this case for now
+        else:
+            logger.warning(f"Ambiguous unit/value pair: '{a}' and '{b}'")
+            unit, amount = a, b
+        if has_letters.search(amount):
+            return {"unit": unit, "formula": amount}
+        # TBD: Evaluate number
+        # NOTE(review): `float` ignores the decimal separator given in the file header
+        return {"unit": unit, "amount": float(amount)}
+
+    def pull_metadata_pair(self, block: List[list], header: dict) -> (str, str):
+        """Read the metadata entry starting at `self.index` and return `(label, value)`.
+
+        The label is the first element of the current line. The value depends on the label:
+
+        - "Literature references": a list of dictionaries with "reference" and, if a
+          second element is present, "comment", one per following non-empty line.
+        - "Date": the next line converted with `asdate`, using `header["dayfirst"]`.
+        - "Infrastructure": the next line converted with `asboolean`.
+        - Anything else: the non-empty elements of the next line joined with " -||- ",
+          or an empty string if there is no next line or it is empty.
+
+        `self.index` ends on the first non-empty line after the entry, or at the end of
+        the block.
+        """
+        key = block[self.index][0]
+
+        if key == "Literature references":
+            self.index += 1
+            value = []
+            while self.index < len(block) and any(block[self.index]):
+                reference = {"reference": block[self.index][0]}
+                if len(block[self.index]) > 1:
+                    reference["comment"] = block[self.index][1]
+                value.append(reference)
+                self.index += 1
+        elif key == "Date":
+            value = asdate(block[self.index + 1][0], dayfirst=header["dayfirst"])
+            self.index += 2
+        elif key == "Infrastructure":
+            value = asboolean(block[self.index + 1][0])
+            self.index += 2
+        else:
+            # The label can be the last line of the block, without any value line
+            value_line = block[self.index + 1] if self.index + 1 < len(block) else []
+            value = " -||- ".join([elem for elem in value_line if elem])
+            self.index += 2
+
+        # Skip empty lines until next pair. Should only be one line, but life can be surprising
+        self._skip_empty_lines(block)
+
+        return key, value
diff --git a/bw_simapro_csv/errors.py b/bw_simapro_csv/errors.py
@@ -0,0 +1,2 @@
+class IndeterminateBlockEnd(Exception):
+    """Error for a block whose end can't be determined.
+
+    NOTE(review): description is based on the class name only; the code raising this
+    exception is not shown here - confirm."""
+
+    pass
diff --git a/bw_simapro_csv/header.py b/bw_simapro_csv/header.py
@@ -1,9 +1,9 @@
-import warnings
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
 
 from dateutil import parser
+from loguru import logger
 from pydantic import BaseModel
 
 from .utils import asboolean, nobraces, noquotes
@@ -108,7 +108,7 @@ def parse_header(data: List[str]) -> SimaProCSVHeader:
         elif line.startswith("Library '"):
             parsed["libraries"].append(noquotes(line[len("Library") :].strip()))
         else:
-            warnings.warn(f"Can't understand header line (skipping):\n\t{line}")
+            logger.warning(f"Can't understand header line (skipping):\n\t{line}")
 
     dayfirst = not (
         date