From c0b2d187f2515b183007f1afc0e7789449a17c93 Mon Sep 17 00:00:00 2001 From: 1313ou <1313ou@gmail.com> Date: Mon, 7 Oct 2024 21:02:45 +0200 Subject: [PATCH 1/2] Changed generation of escaped XML IDs to allow non-ASCII, changed escaping of colon to avoid possible collision with cl (centilitre) --- scripts/wordnet.py | 26 ++++++++++++++++++++------ scripts/wordnet_yaml.py | 8 ++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/scripts/wordnet.py b/scripts/wordnet.py index 19109c58..5c03fcc1 100644 --- a/scripts/wordnet.py +++ b/scripts/wordnet.py @@ -763,11 +763,24 @@ def extract_comments(wordnet_file, lexicon): c = None +# Regular expressions for valid NameStartChar and NameChar +# based on the XML 1.0 specification. +name_start_char_re = re.compile( + r'^[A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF\u0370-\u037D\u037F-\u1FFF' + r'\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF' + r'\uF900-\uFDCF\uFDF0-\uFFFD]$') + +name_char_re = re.compile( + r'^[A-Z_a-z0-9\x2D\x2E\xB7\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + r'\u0300-\u036F\u203F-\u2040\u0370-\u037D\u037F-\u1FFF' + r'\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF' + r'\uF900-\uFDCF\uFDF0-\uFFFD]$') + + def escape_lemma(lemma): """Format the lemma so it is valid XML id""" def elc(c): - if (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z') or ( - c >= '0' and c <= '9') or c == '.': + if ('A' <= c <= 'Z') or ('a' <= c <= 'z') or ('0' <= c <= '9') or c == '.': return c elif c == ' ': return '_' @@ -779,16 +792,17 @@ def elc(c): return '-ap-' elif c == '/': return '-sl-' - elif c == '-': - return '-' + elif c == ':': + return '-cn-' elif c == ',': return '-cm-' elif c == '!': return '-ex-' elif c == '+': return '-pl-' - else: - return '-%04x-' % ord(c) + elif name_char_re.match(c) or name_char_re.match(c): + return c + raise ValueError(f'Illegal character {c}') return "".join(elc(c) for c in lemma) diff --git a/scripts/wordnet_yaml.py b/scripts/wordnet_yaml.py index 83af9caf..31128c86 100644 --- 
a/scripts/wordnet_yaml.py +++ b/scripts/wordnet_yaml.py @@ -14,20 +14,20 @@ def map_sense_key(sk): if "%" in sk: e = sk.split("%") - return ("oewn-" + e[0].replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cl-").replace("+","-pl-") + + return ("oewn-" + e[0].replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cn-").replace("+","-pl-") + "__" + e[1].replace("_","-sp-").replace(":",".")) else: - return "oewn-" + sk.replace("%", "__").replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cl-").replace("+","-pl-") + return "oewn-" + sk.replace("%", "__").replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cn-").replace("+","-pl-") def unmap_sense_key(sk): if "__" in sk: e = sk.split("__") l = e[0][KEY_PREFIX_LEN:] r = "__".join(e[1:]) - return (l.replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + + return (l.replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cn-",":").replace("-pl-","+") + "%" + r.replace(".", ":").replace("-sp-","_")) else: - return sk[KEY_PREFIX_LEN:].replace("__", "%").replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + return sk[KEY_PREFIX_LEN:].replace("__", "%").replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cn-",":").replace("-pl-","+") def make_pos(y, pos): From e9a60a6286c42d4b4c2e562084b3c18f6bad89dd Mon Sep 17 00:00:00 2001 From: 1313ou <1313ou@gmail.com> Date: Tue, 8 Oct 2024 11:36:37 +0200 Subject: [PATCH 2/2] Optimize and validate --- scripts/validate.py | 2 +- scripts/wordnet.py | 39 ++++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/scripts/validate.py b/scripts/validate.py index 
fde287b5..3ae7203f 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -178,7 +178,7 @@ def check_lex_files(wn, fix): return errors -valid_id = re.compile("^oewn-[A-Za-z0-9_\\-.]*$") +valid_id = re.compile(fr"^oewn-{xml_id_char}*$") valid_sense_id = re.compile( "^oewn-[A-Za-z0-9_\\-.]+-([nvars])-([0-9]{8})-[0-9]{2}$") diff --git a/scripts/wordnet.py b/scripts/wordnet.py index 5c03fcc1..237b0ecd 100644 --- a/scripts/wordnet.py +++ b/scripts/wordnet.py @@ -763,19 +763,32 @@ def extract_comments(wordnet_file, lexicon): c = None -# Regular expressions for valid NameStartChar and NameChar +# Regular expressions for valid NameChar # based on the XML 1.0 specification. -name_start_char_re = re.compile( - r'^[A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF\u0370-\u037D\u037F-\u1FFF' - r'\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF' - r'\uF900-\uFDCF\uFDF0-\uFFFD]$') - -name_char_re = re.compile( - r'^[A-Z_a-z0-9\x2D\x2E\xB7\xC0-\xD6\xD8-\xF6\xF8-\u02FF' - r'\u0300-\u036F\u203F-\u2040\u0370-\u037D\u037F-\u1FFF' - r'\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF' - r'\uF900-\uFDCF\uFDF0-\uFFFD]$') - +# We don't check for 1st character extra restrictions +# because it's always prefixed with 'oewn-' +xml_id_az = r'A-Za-z' +xml_id_num = r'0-9' +xml_id_extend = ( + r'\xC0-\xD6' # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ + r'\xD8-\xF6' # ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö + r'\xF8-\u02FF' + r'\u0370-\u037D' + r'\u037F-\u1FFF' + r'\u200C-\u200D' + r'\u2070-\u218F' + r'\u2C00-\u2FEF' + r'\u3001-\uD7FF' + r'\uF900-\uFDCF' + r'\uFDF0-\uFFFD' +) +xml_id_not_first = ( + r'\u0300-\u036F' + r'\u203F-\u2040' +) +# name_start_char = fr'[_{xml_id_az}{xml_id_extend}]' # not used if oewn- prefix +xml_id_char = fr'[_\-\.·{xml_id_az}{xml_id_num}{xml_id_extend}{xml_id_not_first}]' +xml_id_char_re = re.compile(xml_id_char) def escape_lemma(lemma): """Format the lemma so it is valid XML id""" @@ -800,7 +813,7 @@ def elc(c): return '-ex-' elif c == '+': return '-pl-' - elif name_char_re.match(c) or 
name_char_re.match(c): + elif xml_id_char_re.match(c): return c raise ValueError(f'Illegal character {c}')