def calculate_check_digit(cas: str) -> int:
    """Return the CAS check digit for the digit string ``cas``.

    ``cas`` holds the digits of a CAS number without hyphens and without the
    trailing check digit. Digits are weighted 1, 2, 3, ... starting from the
    rightmost one, and the check digit is the weighted sum modulo 10. Only
    the nine rightmost digits contribute to the sum.

    Raises ``ValueError`` if ``cas`` contains a character ``int`` can't parse.
    """
    weighted = zip(range(1, 10), reversed(cas))
    return sum(weight * int(digit) for weight, digit in weighted) % 10


def _is_ascii_digits(text: str) -> bool:
    """Tell whether ``text`` is non-empty and consists only of the digits 0-9."""
    return bool(text) and all("0" <= char <= "9" for char in text)


def validate_cas_string(cas: Optional[str]) -> Optional[str]:
    """Validate and normalize a CAS number, returning ``None`` if unusable.

    Accepted inputs are ``first-second-check`` (optionally zero-padded, e.g.
    ``007782-42-5``), the same digits without hyphens (``007782425``), and a
    hyphenated number whose check digit is missing (``1228284-64-``), in
    which case the check digit is computed and appended.

    Returns the number formatted as ``first-second-check`` without leading
    zeros. Returns ``None`` for missing values (``None``, NaN, blank strings)
    and, after logging a warning, for malformed numbers or numbers whose
    check digit doesn't match.
    """
    if not isinstance(cas, str):
        # Missing values are expected in real data; drop them silently.
        if cas is None or (isinstance(cas, Number) and np.isnan(cas)):
            return None
        logger.warning(
            "Given CAS can't be validated, expected a string: {!r}".format(cas)
        )
        return None

    cas = cas.strip()
    if not cas:
        return None

    if "-" in cas:
        parts = cas.split("-")
        if len(parts) != 3:
            logger.warning(
                "Given CAS can't be validated, wrong number of hyphens are present: {}".format(
                    cas
                )
            )
            return None
        first, second, given = parts
    else:
        # Without hyphens the layout is positional: the last digit is the
        # check digit and the two digits before it form the second group.
        first, second, given = cas[:-3], cas[-3:-1], cas[-1]

    # Check the shape before calling `int` so that malformed input produces
    # a warning instead of an exception. The first group of a CAS number has
    # two to seven significant digits; zero padding is tolerated.
    significant = first.lstrip("0")
    well_formed = (
        _is_ascii_digits(first)
        and 2 <= len(significant) <= 7
        and len(second) == 2
        and _is_ascii_digits(second)
        and (given == "" or (len(given) == 1 and _is_ascii_digits(given)))
    )
    if not well_formed:
        logger.warning(
            "Given CAS can't be validated, malformed number: {}".format(cas)
        )
        return None

    expected = str(calculate_check_digit(first + second))
    if not given:
        # e.g. 1228284-64-
        logger.warning(
            "Adding missing CAS check digit, {} -> {}".format(cas, cas + expected)
        )
    elif given != expected:
        logger.warning(
            "Removing invalid CAS number {}; last digit should be {}".format(
                cas, expected
            )
        )
        return None
    return "-".join((significant, second, expected))
+298f6b5c-46f5-11ec-81d3-0242ac130003 + +Category type +material + +Process identifier +DefaultX25250700002 + +Type +Unit process + +Process name +Test process + +Status +Draft + +Time period +2005-2009 + +Geography +Mixed data + +Technology +Worst case + +Representativeness +Theoretical calculation + +Multiple output allocation +Physical causality + +Substitution allocation +Actual substitution + +Cut off rules +Less than 1% (physical criteria) + +Capital goods +First order (only primary flows) + +Boundary with nature +Agricultural production is part of production system + +Infrastructure +No + +Date +24.02.2014 + +Record +data entry by: [System] + +Generator +generated by: [System] + +Literature references +Ecoinvent 3;is copyright protected: false + +Collection method +text for collection method + +Data treatment +text for data treatment + +Verification +text for verification + +Comment +text for comment + +Allocation rules +text for allocation rules + +System description +U.S. LCI Database;system description comment + +Products +my product;kg;0,5;100;not defined;Agricultural; + +Avoided products +Wool, at field/US;kg;1;Undefined;0;0;0; + +Resources +Acids;;kg;1;Undefined;0;0;0; + +Materials/fuels +Soy oil, refined, at plant/kg/RNA;kg;0;Undefined;0;0;0; + +Electricity/heat +Electricity, biomass, at power plant/US;kWh;0,1;Undefined;0;0;0; + +Emissions to air +(+-)-Citronellol;low. 
pop.;kg;1;Lognormal;2;0;0;(1,2,3,4,5)with comment + +Emissions to water +(1r,4r)-(+)-Camphor;lake;kg;1;Normal;3;0;0;;80db70fc-46d4-11ec-81d3-0242ac130003 + +Emissions to soil +1'-Acetoxysafrole;forestry;kg;1;Triangle;0;1;5; + +Final waste flows +Asbestos;;kg;1;Uniform;0;1;2; + +Non material emissions +Noise from bus km;;km;1;Undefined;0;0;0; + +Social issues +venting of argon, crude, liquid;;kg;1;Undefined;0;0;0; + +Economic issues +Sample economic issue;;kg;1;Undefined;0;0;0; + +Waste to treatment +Dummy, Disposal, msw, to sanitary landfill/kg/GLO;kg;1;Undefined;0;0;0; + +Input parameters +input_param;23,4;Uniform;0;13;33;No;this is the parameter commentis this presented in multiple lines?we should even be able tu use ��� + +Calculated parameters +calc_param;input_param ^ 2;comment for calc_param + + +End + + +System description + +Name +system name + +Category +Others + +Description +text for description + +Sub-systems +text for sub-systems + +Cut-off rules +text for cut-off rules + +Energy model +text for energy model + +Transport model +text for transport model + +Waste model +text for waste model + +Other assumptions +text for other assumptions + +Other information +text for other information + +Allocation rules +text for allocation rules + +End + +Quantities +Mass;Yes +Energy;Yes +Length;Yes + +End + + +Units +kg;Mass;1;kg +g;Mass;0,001;kg +kWh;Energy;3,6;MJ +MJ;Energy;1;MJ +ton;Mass;1000;kg +µg;Mass;0,000000001;kg +mg;Mass;0,000001;kg +GJ;Energy;1000;MJ +J;Energy;0,000001;MJ +kJ;Energy;0,001;MJ +Mtn;Mass;1000000000;kg +PJ;Energy;1000000000;MJ +TJ;Energy;1000000;MJ +kton;Mass;1000000;kg +ng;Mass;1,0E-12;kg +pg;Mass;1,0E-15;kg +MWh;Energy;3600;MJ +lb;Mass;0,4535924;kg +Btu;Energy;0,001055696;MJ +oz;Mass;0,02834952;kg +tn.sh;Mass;907,1848;kg +tn.lg;Mass;1016,047;kg +km;Length;1000;m +m;Length;1;m +cm;Length;0,01;m +dm;Length;0,1;m +mm;Length;0,001;m +µm;Length;0,000001;m +ft;Length;0,3048;m +inch;Length;0,0254;m +yard;Length;0,9144;m +mile;Length;1609,35;m 
def test_calculate_check_digit():
    """Check digits are computed from the digits preceding the check digit."""
    known_digits = {"773218": 5, "778240": 3}
    for digits, expected in known_digits.items():
        assert calculate_check_digit(digits) == expected


def test_validate_cas_string():
    """Valid inputs are normalized; missing or invalid inputs give ``None``."""
    normalized = [
        ("7782425", "7782-42-5"),
        ("007782425", "7782-42-5"),
        (" 7782-42-5\n", "7782-42-5"),
        ("007782-42-5", "7782-42-5"),
        # Missing check digit is filled in
        ("1228284-64-", "1228284-64-7"),
    ]
    for raw, expected in normalized:
        assert validate_cas_string(raw) == expected

    # Blank, missing, wrongly hyphenated and wrong-check-digit values
    for raw in ("", None, float("NaN"), "7782-425", "7782424"):
        assert validate_cas_string(raw) is None


def test_cas_in_file(fixtures_dir):
    """CAS numbers in the soil emissions block of the fixture are cleaned."""
    parsed_file = SimaProCSV(fixtures_dir / "cas_missing_check_number.csv")
    soil_blocks = []
    for block in parsed_file.blocks:
        if isinstance(block, GenericBiosphere) and block.category == "Emissions to soil":
            soil_blocks.append(block)

    # The fixture lists 178928-70-5 and the truncated 1228284-64- in this block
    cas_numbers = [row["cas_number"] for row in soil_blocks[0].parsed]
    assert cas_numbers == [None, "1228284-64-7"]