Skip to content

Commit

Permalink
Make CAS validation more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
cmutel committed Dec 11, 2024
1 parent f0b07db commit 3171387
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 24 deletions.
4 changes: 2 additions & 2 deletions bw_simapro_csv/blocks/generic_biosphere.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Any, List

from ..cas import validate_cas
from ..cas import validate_cas_string
from ..utils import add_amount_or_formula, skip_empty
from .base import SimaProCSVBlock

Expand Down Expand Up @@ -44,7 +44,7 @@ def __init__(self, block: List[tuple], header: dict, category: str):
{
"name": line[0],
"unit": line[1],
"cas_number": validate_cas(line[2]),
"cas_number": validate_cas_string(line[2]),
"comment": line[3],
"line_no": line_no,
}
Expand Down
4 changes: 2 additions & 2 deletions bw_simapro_csv/blocks/impact_category.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List

from ..cas import validate_cas
from ..cas import validate_cas_string
from ..utils import asnumber, jump_to_nonempty, skip_empty
from .base import SimaProCSVBlock

Expand Down Expand Up @@ -43,7 +43,7 @@ def __init__(self, block: List[tuple], header: dict):
{
"context": (line[0], line[1]),
"name": line[2],
"cas_number": validate_cas(line[3]),
"cas_number": validate_cas_string(line[3]),
"factor": asnumber(line[4]),
"unit": line[5],
"line_no": line_no,
Expand Down
60 changes: 42 additions & 18 deletions bw_simapro_csv/cas.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,54 @@
import string
from numbers import Number
from typing import Any, Optional
from typing import Optional

import numpy as np
from loguru import logger


def validate_cas(s: Any) -> Optional[str]:
ERROR = "CAS Check Digit error: CAS '{}' has check digit of {}, but it should be {}"
def calculate_check_digit(cas: str) -> int:
    """Return the check digit for the leading digits of a CAS number.

    Args:
        cas: The digits of a CAS number *without* the trailing check digit
            and without hyphens, e.g. ``"778242"`` for ``7782-42-5``.

    Returns:
        The weighted digit sum modulo 10. An empty string yields ``0``.

    Raises:
        ValueError: If ``cas`` contains a character ``int()`` cannot parse
            (for example a hyphen).
    """
    # The rightmost digit gets weight 1, the next one weight 2, and so on.
    # ``zip`` stops after nine weights, so digits further left are ignored.
    weighted = (
        weight * int(digit) for weight, digit in zip(range(1, 10), reversed(cas))
    )
    return sum(weighted) % 10

if isinstance(s, str):
s = s.strip()
if not s:

def validate_cas_string(cas: Optional[str]) -> Optional[str]:
    """Validate a CAS number and return it in ``first-second-check`` form.

    Accepted inputs are the compact form (``"7782425"``), the hyphenated form
    (``"7782-42-5"``) and the hyphenated form with a missing check digit
    (``"1228284-64-"``), in which case the digit is computed and appended.
    Surrounding whitespace and leading zeros are removed.

    Args:
        cas: Raw CAS value. ``None``, NaN and blank strings mean "no CAS".

    Returns:
        The normalized CAS string, or ``None`` when no CAS was given or the
        value could not be validated (a warning is logged in the latter case).
    """

    def is_digits(text: str) -> bool:
        # ``isascii`` keeps out characters such as superscripts, which
        # ``isdigit`` accepts but ``int`` cannot parse.
        return text.isascii() and text.isdigit()

    if cas is None:
        return None
    if not isinstance(cas, str):
        if isinstance(cas, Number) and np.isnan(cas):
            return None
        # Anything else cannot be split into groups; reject instead of crashing.
        logger.warning(
            "Given CAS can't be validated, expected a string: {!r}".format(cas)
        )
        return None

    cas = cas.strip()
    if not cas:
        return None

    groups = cas.split("-")
    if len(groups) == 1:
        # Compact form: the last digit is the check digit and the two digits
        # before it form the second group.
        first, second, given = cas[:-3], cas[-3:-1], cas[-1]
    elif len(groups) == 3:
        # ``given`` is empty for inputs such as "1228284-64-".
        first, second, given = groups
    else:
        logger.warning(
            "Given CAS can't be validated, wrong number of hyphens are present: {}".format(
                cas
            )
        )
        return None

    # Leading zeros carry no weight in the checksum and are not part of the
    # normalized form.
    first = first.lstrip("0")
    well_formed = (
        is_digits(first)
        and len(second) == 2
        and is_digits(second)
        and (not given or (len(given) == 1 and is_digits(given)))
    )
    if not well_formed:
        logger.warning(
            "Given CAS can't be validated, unexpected format: {}".format(cas)
        )
        return None

    expected = str(calculate_check_digit(first + second))
    if not given:
        normalized = "-".join([first, second, expected])
        logger.warning(
            "Adding missing CAS check digit, {} -> {}".format(cas, normalized)
        )
        return normalized
    if given != expected:
        # Report the digit that would have been correct, not the one supplied.
        logger.warning(
            "Removing invalid CAS number {}; last digit should be {}".format(
                cas, expected
            )
        )
        return None
    return "-".join([first, second, given])
4 changes: 2 additions & 2 deletions tests/fixtures/cas_missing_check_number.csv
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,8 @@ End


Emissions to soil
Prothioconazole,kg,178928-70-5,Formula: C14H15Cl2N3OS
Pydiflumetofen,kg,1228284-64-,Formula: C16H16Cl3F2N3O2
Prothioconazole;kg;178928-70-5;Formula: C14H15Cl2N3OS
Pydiflumetofen;kg;1228284-64-;Formula: C16H16Cl3F2N3O2


End
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_cas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from bw_simapro_csv import SimaProCSV
from bw_simapro_csv.blocks import GenericBiosphere
from bw_simapro_csv.cas import calculate_check_digit, validate_cas_string


def test_calculate_check_digit():
    """The check digit is derived from the digits that precede it."""
    expected_by_digits = {"773218": 5, "778240": 3}
    for digits, expected in expected_by_digits.items():
        assert calculate_check_digit(digits) == expected


def test_validate_cas_string():
    """Valid inputs are normalized; missing or invalid inputs give ``None``."""
    normalized_cases = [
        ("7782425", "7782-42-5"),
        ("007782425", "7782-42-5"),
        (" 7782-42-5\n", "7782-42-5"),
        ("007782-42-5", "7782-42-5"),
        # A missing check digit is computed and appended.
        ("1228284-64-", "1228284-64-7"),
    ]
    for raw, expected in normalized_cases:
        assert validate_cas_string(raw) == expected

    rejected_cases = ["", None, float("NaN"), "7782-425", "7782424"]
    for raw in rejected_cases:
        assert validate_cas_string(raw) is None


def test_cas_in_file(fixtures_dir):
    """CAS numbers read from the fixture file are validated during parsing."""
    parsed_file = SimaProCSV(fixtures_dir / "cas_missing_check_number.csv")
    soil_blocks = []
    for block in parsed_file.blocks:
        if isinstance(block, GenericBiosphere) and block.category == "Emissions to soil":
            soil_blocks.append(block)

    cas_numbers = [row["cas_number"] for row in soil_blocks[0].parsed]
    assert cas_numbers == [None, "1228284-64-7"]

0 comments on commit 3171387

Please sign in to comment.