diff --git a/quantulum3/_lang/en_US/tests/quantities.json b/quantulum3/_lang/en_US/tests/quantities.json index 5a88275..8912fad 100644 --- a/quantulum3/_lang/en_US/tests/quantities.json +++ b/quantulum3/_lang/en_US/tests/quantities.json @@ -1544,5 +1544,437 @@ "surface": "10" } ] + }, + { + "req": "a total of -34 kg/cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34 kg/cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg/cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34kg/cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg/ cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34kg/ cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg/ cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34 kg/ cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg / cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34 kg / cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg /cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34 kg /cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -39 kg/cm2", + "res": [ + { + "value": -39, + "unit": "kilogram per square centimetre", + "surface": "-39 kg/cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -39kg / cm2", + "res": [ + { + "value": -39, + "unit": "kilogram per square centimetre", + "surface": "-39kg / cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg per cm2", + "res": [ + { + "value": -34, + "unit": "kilogram per square centimetre", + "surface": "-34kg per cm2", + "entity": "unknown", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": -2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kgcm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34 kgcm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg*cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34kg*cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg* cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34kg* cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34 kg cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg * cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34 kg * cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34 kg *cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34 kg *cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -39 kg*cm2", + "res": [ + { + "value": -39, + "unit": "kilogram square centimetre", + "surface": "-39 kg*cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -39kg * cm2", + "res": [ + { + "value": -39, + "unit": "kilogram square centimetre", + "surface": "-39kg * cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] + }, + { + "req": "a total of -34kg times cm2", + "res": [ + { + "value": -34, + "unit": "kilogram square centimetre", + "surface": "-34kg times cm2", + "entity": "moment of inertia", + "uncertainty": null, + "dimensions": [ + { + "base": "kilogram", + "power": 1, + "surface": "kg" + }, + { + "base": "centimetre", + "power": 2, + "surface": "cm" + } + ] + } + ] } ] diff --git a/quantulum3/parser.py b/quantulum3/parser.py index 0fbe2d9..fc3d192 100644 --- a/quantulum3/parser.py +++ b/quantulum3/parser.py @@ -277,6 +277,51 @@ def parse_unit(item, unit, slash, lang="en_US"): """ return _get_parser(lang).parse_unit(item, unit, slash) +def compact_matches(item): + res = {} + starts = {} + groups = item.groupdict() + unit_index = 1 + op_index = 1 + if "prefix" in groups and groups["prefix"]: + res["prefix"] = groups["prefix"] + if "value" in groups and groups["value"]: + res["value"] = groups["value"] + op_acum = "" + for i in [1,2,3,4]: + op = groups.get(f"operator{i}", None) + un = groups.get(f"unit{i}", None) + if op and un: #op.strip(): + res[f"operator{op_index}"] = op_acum + op + starts[f"operator{op_index}"] = item.start(f"operator{i}") + op_index += 1 + op_acum = "" + if op and not un: + op_acum += op + if un: + res[f"unit{unit_index}"] = un + starts[f"unit{unit_index}"] = item.start(f"unit{i}") + unit_index += 1 + if op_index < unit_index: op_index = unit_index + print("ORIG", groups) + print("COMP", res) + return FakeItem(res, item, starts) + +class FakeItem: + def __init__(self, d, item, starts): + print("created with", d) + self.d = d + self.item = item + self.starts = starts + + def group(self, g): + return self.d.get(g, None) + + def end(self): return self.item.end() + def start(self, g=None): + if g is None: return self.item.start() + return self.starts[g] + ############################################################################### def get_unit(item, text, lang="en_US"): @@ -288,9 +333,12 @@ def get_unit(item, text, lang="en_US"): group_operators = ["operator1", "operator2", "operator3", "operator4"] # How much of the end is removed because of an "incorrect" regex match unit_shortening = 0 + print(item.groupdict()) item_units = [item.group(i) for i in group_units if item.group(i)] + print("unit:", item_units) + item = compact_matches(item) if len(item_units) == 0: unit = load.units(lang).names["dimensionless"] else: @@ -300,6 +348,7 @@ def get_unit(item, text, lang="en_US"): unit = item.group(group_units[index]) operator_index = None if index < 1 else group_operators[index - 1] operator = None if index < 1 else item.group(operator_index) + print(index, "O", operator, "U", unit) # disallow spaces as operators in units expressed in their symbols # Enforce consistency among multiplication and division operators @@ -510,6 +559,7 @@ def parse(text, lang="en_US", verbose=False) -> List[cls.Quantity]: groups = dict([i for i in item.groupdict().items() if i[1] and i[1].strip()]) _LOGGER.debug("Quantity found: %s", groups) + print("XX",groups) try: uncert, values = get_values(item, lang) diff --git a/quantulum3/regex.py b/quantulum3/regex.py index 42b02fd..8507122 100644 --- a/quantulum3/regex.py +++ b/quantulum3/regex.py @@ -54,14 +54,14 @@ def exponents_regex(lang="en_US"): def ranges(lang="en_US"): ranges_ = {"-"} ranges_.update(_get_regex(lang).RANGES) - return ranges_ + return list(sorted(ranges_)) @cached def uncertainties(lang="en_US"): uncertainties_ = {r"\+/-", r"±"} uncertainties_.update(_get_regex(lang).UNCERTAINTIES) - return uncertainties_ + return list(sorted(uncertainties_)) ############################################################################### @@ -153,7 +153,7 @@ def unicode_fractions_regex(): def multiplication_operators(lang="en_US"): mul = {"*", " ", "·", "x"} mul.update(_get_regex(lang).MULTIPLICATION_OPERATORS) - return mul + return sorted(mul) @cached @@ -172,7 +172,7 @@ def division_operators(lang="en_US"): def grouping_operators(lang="en_US"): grouping_ops = {" "} grouping_ops.update(_get_regex(lang).GROUPING_OPERATORS) - return grouping_ops + return list(sorted(grouping_ops)) def grouping_operators_regex(lang="en_US"): @@ -323,19 +323,23 @@ def units_regex(lang="en_US"): """ - op_keys = sorted(list(operators(lang)), key=len, reverse=True) + op_keys = list(operators(lang)) unit_keys = sorted( list(load.units(lang).surfaces.keys()) + list(load.units(lang).symbols.keys()), - key=len, + key=lambda x: (len(x), x), reverse=True, ) symbol_keys = sorted( - list(load.units(lang).prefix_symbols.keys()), key=len, reverse=True + list(load.units(lang).prefix_symbols.keys()), + key=lambda x: (len(x), x), + reverse=True, ) exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex()) - all_ops = "|".join([r"{}".format(re.escape(i)) for i in op_keys]) + ops = [r"{op}".format(op=re.escape(op)) for op in op_keys] + all_ops = "|".join(sorted(ops, key=lambda x: (len(x), x), reverse=True)) + all_units = "|".join([r"{}".format(re.escape(i)) for i in unit_keys]) all_symbols = "|".join([r"{}".format(re.escape(i)) for i in symbol_keys]) @@ -343,10 +347,10 @@ def units_regex(lang="en_US"): (?(?:%s)(?![a-zA-Z]))? # Currencies, mainly (?P%s)-? # Number - (?:(?P%s(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (1) - (?:(?P%s(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (2) - (?:(?P%s(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (3) - (?:(?P%s(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (4) + (?:(?P(?:\s*)%s(?:\s*)(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (1) + (?:(?P(?:\s*)%s(?:\s*)(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (2) + (?:(?P(?:\s*)%s(?:\s*)(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (3) + (?:(?P(?:\s*)%s(?:\s*)(?=(%s)%s))?(?P(?:%s)%s)?) # Operator + Unit (4) (?!\w) # "end" of word """ % tuple( [all_symbols, range_pattern(lang)] diff --git a/quantulum3/tests/test_no_classifier.py b/quantulum3/tests/test_no_classifier.py index 66381f2..3d7fc41 100644 --- a/quantulum3/tests/test_no_classifier.py +++ b/quantulum3/tests/test_no_classifier.py @@ -42,6 +42,7 @@ def test_parse_no_classifier(self, lang="en_US"): ), ) for index, quant in enumerate(quants): + print(quant.unit) self.assertEqual(test["res"][index].surface, quant.surface) self.assertEqual(test["res"][index], quant) self.assertEqual(