From a1907ec1845d6493f57a254e8259deea1796130d Mon Sep 17 00:00:00 2001 From: "John P. McCrae" Date: Mon, 30 Sep 2024 13:20:38 +0100 Subject: [PATCH 1/2] Clean up of scripts This PR deletes most of the (unused) scripts in favour of two short (`from_yaml.py` and `validate.py`) We also introduce a `TOOLS.md` to the read me Closes #1030 --- .github/workflows/main.yml | 5 +- TOOLS.md | 9 + scripts/add-senses-nos.py | 61 -- scripts/assign-sense-key.py | 16 - scripts/change-definition.py | 93 --- scripts/change-entry.py | 105 --- scripts/change-example.py | 74 -- scripts/change-relation.py | 276 -------- scripts/change-synset.py | 95 --- scripts/change_manager.py | 812 ---------------------- scripts/check_sense_ids.py | 24 - scripts/from-yaml.py | 11 - scripts/{wordnet_yaml.py => from_yaml.py} | 199 +++--- scripts/merge-synset.py | 75 -- scripts/merge.py | 121 ---- scripts/remove_examples_dquots.py | 16 - scripts/split-synset.py | 70 -- scripts/to-yaml.py | 12 - scripts/validate.py | 5 +- scripts/wordnet.py | 17 +- 20 files changed, 103 insertions(+), 1993 deletions(-) create mode 100644 TOOLS.md delete mode 100644 scripts/add-senses-nos.py delete mode 100644 scripts/assign-sense-key.py delete mode 100644 scripts/change-definition.py delete mode 100644 scripts/change-entry.py delete mode 100644 scripts/change-example.py delete mode 100644 scripts/change-relation.py delete mode 100644 scripts/change-synset.py delete mode 100644 scripts/change_manager.py delete mode 100644 scripts/check_sense_ids.py delete mode 100644 scripts/from-yaml.py rename scripts/{wordnet_yaml.py => from_yaml.py} (66%) delete mode 100644 scripts/merge-synset.py delete mode 100644 scripts/merge.py delete mode 100644 scripts/remove_examples_dquots.py delete mode 100644 scripts/split-synset.py delete mode 100644 scripts/to-yaml.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1d2c62b1..5639c8e8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,11 +19,8 @@ jobs: - uses: actions/checkout@v2 - name: Generate XML - run: python scripts/from-yaml.py + run: python scripts/from_yaml.py - - name: Merge the source files - run: python scripts/merge.py - - name: Install xmllint run: sudo apt-get update && sudo apt-get install libxml2-utils diff --git a/TOOLS.md b/TOOLS.md new file mode 100644 index 00000000..c46475ba --- /dev/null +++ b/TOOLS.md @@ -0,0 +1,9 @@ +# Tools for Working with Open English WordNet + +## English WordNet Editor + +This tool is a command-line editor for English WordNet. It allows you to add, delete, and modify synsets, words, and relations in English WordNet. It is written in Python and uses the `nltk` library to interact with WordNet. The tool is available on GitHub at https://github.com/jmccrae/ewe + +## WN Python Library + +This is a Python library for working with WordNet. It provides a simple interface for querying WordNet and accessing synsets, words, and relations. The library is available on GitHub at https://github.com/goodmami/wn diff --git a/scripts/add-senses-nos.py b/scripts/add-senses-nos.py deleted file mode 100644 index c34629c5..00000000 --- a/scripts/add-senses-nos.py +++ /dev/null @@ -1,61 +0,0 @@ -# This is the script used to add sense numberings to the senses -from glob import glob -import re - -indexes = {} - - -def load_indexes(index): - for line in open(index).readlines(): - if not line.startswith(" "): - elems = line.split() - syn_cnt = int(elems[2]) - if elems[1] == "a": - indexes["%s-s" % (elems[0])] = elems[-syn_cnt:] - indexes["%s-%s" % (elems[0], elems[1])] = elems[-syn_cnt:] - print("Loaded index %s" % index) - - -# Manual cases -# graeco-roman_wrestling-n -# pasture_land-n -# graeco-roman_architecture-n -# terracotta-n - -def main(): - r = re.compile(".*") - for wn31_part in glob("src/xml/wn31-*.xml"): - with open("%s.new" % wn31_part, "w") as out: - for line in open(wn31_part).readlines(): - m = r.match(line) - if m: - lemma = m.group(1).replace( - "-ap-", "'").replace("-sl-", "/").replace("-lb-", "(").replace("-rb-", ")") - if lemma.endswith("(a)") or lemma.endswith("(p)"): - lemma = lemma[:-3] - if lemma.endswith("(ip)"): - lemma = lemma[:-4] - pos = m.group(2) - synset = m.group(3) - lemma_pos = "%s-%s" % (lemma.lower(), pos) - if lemma_pos in indexes: - out.write( - " \n" % - (m.group(1), - m.group(2), - m.group(3), - m.group(4), - indexes[lemma_pos].index(synset), - m.group(5))) - else: - out.write(line) - else: - out.write(line) - - -if __name__ == "__main__": - load_indexes("index.noun") - load_indexes("index.adj") - load_indexes("index.verb") - load_indexes("index.adv") - main() diff --git a/scripts/assign-sense-key.py b/scripts/assign-sense-key.py deleted file mode 100644 index ac06684f..00000000 --- a/scripts/assign-sense-key.py +++ /dev/null @@ -1,16 +0,0 @@ -from wordnet import * -import change_manager -from glob import glob -import re -from sys import exit -import sense_keys - - -if __name__ == "__main__": - wn = change_manager.load_wordnet() - for e in wn.entries: - for s in e.senses: - if not s.sense_key: - s.sense_key = sense_keys.get_sense_key(wn, e, s, - wn.synset_by_id(s.synset).lex_name) - change_manager.save(wn) diff --git a/scripts/change-definition.py b/scripts/change-definition.py deleted file mode 100644 index 1e95b437..00000000 --- a/scripts/change-definition.py +++ /dev/null @@ -1,93 +0,0 @@ -import sys -import wordnet -import argparse -import os -import pickle -from autocorrect import Speller - - -def update_def(wn, synset, defn, add): - spell = Speller(lang='en') - if any([spell(w) != w for w in defn.split()]): - if input( - "There may be spelling errors in this definition. Proceed [y/N] : ") != "y": - sys.exit(-1) - print("Previous definitions:") - for d in synset.definitions: - print("> " + d.text) - wn_synset = wordnet.parse_wordnet("src/xml/wn-%s.xml" % synset.lex_name) - ss = wn_synset.synset_by_id(synset.id) - if add: - ss.definitions = ss.definitions + [wordnet.Definition(defn)] - else: - ss.definitions = [wordnet.Definition(defn)] - with open("src/xml/wn-%s.xml" % synset.lex_name, "w") as out: - wn_synset.to_xml(out, True) - - -def update_ili_def(wn, synset, defn): - wn_synset = wordnet.parse_wordnet("src/xml/wn-%s.xml" % synset.lex_name) - ss = wn_synset.synset_by_id(synset.id) - ss.ili_definition = wordnet.Definition(defn) - with open("src/xml/wn-%s.xml" % synset.lex_name, "w") as out: - wn_synset.to_xml(out, True) - - -def main(): - parser = argparse.ArgumentParser( - description="Change a definition within the wordnet") - parser.add_argument( - 'id', - metavar='ID', - type=str, - nargs="?", - help="The ID of the synset (sense) for the relationship") - parser.add_argument( - '--add', - action='store_true', - help="Add the new definition and retain the previous definition (otherwise this definition replaces previous definitions)") - parser.add_argument('--defn', type=str, - help="The new definition") - parser.add_argument('--ili', action='store_true', - help="Set the ILI definition") - - args = parser.parse_args() - - # Slightly speeds up the loading of WordNet - if not os.path.exists("wn.pickle") or os.path.getmtime( - "wn.pickle") < os.path.getmtime("wn.xml"): - print("Loading wordnet") - wn = wordnet.parse_wordnet("wn.xml") - pickle.dump(wn, open("wn.pickle", "wb")) - else: - wn = pickle.load(open("wn.pickle", "rb")) - - if not args.id: - id = "oewn-" + input("Enter synset ID : oewn-") - else: - id = args.id - - synset = wn.synset_by_id(id) - - if not synset: - print("Could not find the synset %s" % args.id) - sys.exit(-1) - - if args.ili: - if not args.defn: - args.defn = synset.definitions[0].text - - update_ili_def(wn, synset, args.defn) - else: - if not args.defn: - print("Definition : " + synset.definitions[0].text) - defn = input("New Definition : ") - else: - defn = args.defn - - update_def(wn, synset, defn, args.add) - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/change-entry.py b/scripts/change-entry.py deleted file mode 100644 index ecfe541f..00000000 --- a/scripts/change-entry.py +++ /dev/null @@ -1,105 +0,0 @@ -import sys -import wordnet -import argparse -import re -import change_manager - - -def main(): - parser = argparse.ArgumentParser( - description="Add or remove an entry from a synset") - parser.add_argument('synset', metavar='SYNSET_ID', type=str, nargs="?", - help="The ID of the synset to change") - parser.add_argument('lemma', metavar='LEMMA', type=str, nargs="?", - help="The lemma to change") - parser.add_argument('--add', action='store_true', - help="Add this entry to a synset") - parser.add_argument('--delete', action='store_true', - help="Remove this entry from a synset") - parser.add_argument('--move', action='store_true', - help="Change this entry to another synset") - parser.add_argument('--target', type=str, - help="The target for a change") - parser.add_argument( - '-n', - metavar='N', - type=int, - default=- - 1, - help="The position of this synset within the list of senses for the entry") - parser.add_argument('-i', metavar='IDX', type=int, default=-1, - help="The position of this lemma in the synset") - - args = parser.parse_args() - - if args.add: - action = "A" - elif args.delete: - action = "D" - elif args.move: - action = "M" - else: - action = input("[A]dd/[D]elete/[M]ove? ").upper() - if action != "A" and action != "D" and action != "M": - print("Bad action") - sys.exit(-1) - - wn = change_manager.load_wordnet() - - if not args.synset: - synset_id = "oewn-" + input("Enter synset ID : oewn-") - else: - synset_id = args.synset - - synset = wn.synset_by_id(synset_id) - - entries = wn.members_by_id(synset_id) - if entries: - print("Entries: " + ", ".join(entries)) - else: - print("No entries") - - if not args.lemma: - if action == "A": - lemma = input("New entry: ") - elif action == "D": - lemma = input("Entry to remove: ") - elif action == "M": - lemma = input("Entry to move: ") - else: - lemma = args.lemma - - if not synset: - print("Could not find synset") - sys.exit(-1) - - if action == "M" and not args.target: - args.target = "oewn-" + input("Target synset: oewn-") - - if action == "A": - change_manager.add_entry(wn, synset, lemma, args.i, args.n) - elif action == "D": - change_manager.delete_entry( - wn, synset, "oewn-%s-%s" % - (wordnet.escape_lemma(lemma), synset.part_of_speech.value)) - elif action == "M": - target_synset = wn.synset_by_id(args.target) - - if not target_synset: - print("Could not find synset") - sys.exit(-1) - - if synset.lex_name == target_synset.lex_name: - change_manager.change_entry(wn, synset, target_synset, lemma) - else: - print( - "Moving across lexicographer files so implementing change as delete then add") - change_manager.delete_entry( - wn, synset, "oewn-%s-%s" % - (wordnet.escape_lemma(lemma), synset.part_of_speech.value)) - change_manager.add_entry(wn, target_synset, lemma, args.i, args.n) - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/change-example.py b/scripts/change-example.py deleted file mode 100644 index 3a318d72..00000000 --- a/scripts/change-example.py +++ /dev/null @@ -1,74 +0,0 @@ -import sys -import wordnet -import argparse -import os -import pickle - - -def add_ex(wn, synset, example): - wn_synset = wordnet.parse_wordnet("src/xml/wn-%s.xml" % synset.lex_name) - ss = wn_synset.synset_by_id(synset.id) - ss.examples = ss.examples + [wordnet.Example(example)] - with open("src/xml/wn-%s.xml" % synset.lex_name, "w") as out: - wn_synset.to_xml(out, True) - - -def delete_ex(wn, synset, example): - wn_synset = wordnet.parse_wordnet("src/xml/wn-%s.xml" % synset.lex_name) - ss = wn_synset.synset_by_id(synset.id) - n_exs = len(ss.examples) - ss.examples = [ex for ex in ss.examples if ex.text != example] - if len(ss.examples) == n_exs: - print("No change") - else: - with open("src/xml/wn-%s.xml" % synset.lex_name, "w") as out: - wn_synset.to_xml(out, True) - - -def main(): - parser = argparse.ArgumentParser( - description="Add (or delete) an example of a synset") - parser.add_argument( - 'id', - metavar='ID', - type=str, - help="The ID of the synset (sense) for the relationship") - parser.add_argument('--delete', action='store_true', - help="Delete this definition instead of adding it") - parser.add_argument('--example', type=str, - help="The new example") - - args = parser.parse_args() - - # Slightly speeds up the loading of WordNet - if not os.path.exists("wn.pickle") or os.path.getmtime( - "wn.pickle") < os.path.getmtime("wn.xml"): - print("Loading wordnet") - wn = wordnet.parse_wordnet("wn.xml") - pickle.dump(wn, open("wn.pickle", "wb")) - else: - wn = pickle.load(open("wn.pickle", "rb")) - - synset = wn.synset_by_id(args.id) - - if not synset: - print("Could not find the synset %s" % args.id) - sys.exit(-1) - - if not args.example: - print("Please specify an example") - sys.exit(-1) - - if not args.example.startswith("\""): - print("Examples must start and end with a quotation") - sys.exit(-1) - - if args.delete: - delete_ex(wn, synset, args.example) - else: - add_ex(wn, synset, args.example) - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/change-relation.py b/scripts/change-relation.py deleted file mode 100644 index 744e1571..00000000 --- a/scripts/change-relation.py +++ /dev/null @@ -1,276 +0,0 @@ -import sys -import wordnet -import argparse -import os -import pickle -import re -import change_manager - - -def with_ewn(x): - if x: - return "oewn-" + x - else: - return None - - -def main(): - parser = argparse.ArgumentParser( - description="Change a relationship within the wordnet") - parser.add_argument( - 'source_id', - metavar='SOURCE_ID', - type=str, - nargs="?", - help="The ID of the source synset (sense) for the relationship") - parser.add_argument( - 'target_id', - metavar='TARGET_ID', - type=str, - nargs="?", - help="The ID of the target synset (sense) for the relationship") - parser.add_argument('--new-source', type=str, - help="The ID of the new source synset") - parser.add_argument('--new-target', type=str, - help="The ID of the new target synset") - parser.add_argument('--new-relation', type=str, - help="The type of the new relationship") - parser.add_argument('--add', action='store_true', - help="Add this relation as a new relation") - parser.add_argument('--delete', action='store_true', - help="Remove this relation (do not replace or change)") - parser.add_argument('--reverse', action='store_true', - help="Reverse this relation (swap source and target)") - - args = parser.parse_args() - - # Slightly speeds up the loading of WordNet - wn = change_manager.load_wordnet() - - if not args.source_id: - args.source_id = "oewn-" + input("Enter source synset ID: oewn-") - - if change_manager.sense_id_re.match(args.source_id): - (source_id, source_entry_id) = change_manager.decompose_sense_id(args.source_id) - else: - source_id = args.source_id - source_entry_id = None - - source_synset = wn.synset_by_id(source_id) - - if not source_synset: - print("Could not find the source synset %s" % source_id) - sys.exit(-1) - - if not args.target_id: - args.target_id = "oewn-" + input("Enter target synset ID: oewn-") - - if change_manager.sense_id_re.match(args.target_id): - (target_id, target_entry_id) = change_manager.decompose_sense_id(args.target_id) - else: - target_id = args.target_id - target_entry_id = None - - target_synset = wn.synset_by_id(target_id) - - if not target_synset: - print("Could not find the target synset %s" % target_id) - sys.exit(-1) - - if not args.new_source and not args.new_target and not args.new_relation and not args.delete: - mode = input( - "[A]dd new relation/[D]elete existing relation/[R]everse relation/[C]hange relation: ").lower() - if mode == "a": - args.add = True - if not args.new_relation: - args.new_relation = input("Enter new relation: ") - elif mode == "c": - mode = input("Change [S]ubject/[T]arget/[R]elation: ").lower() - if mode == "s": - args.new_source = with_ewn( - input("Enter new source (or blank for no change): oewn-")) - elif mode == "t": - args.new_target = with_ewn( - input("Enter new target (or blank for no change): oewn-")) - elif mode == "r": - args.new_relation = input( - "Enter new relation (or blank for no change): oewn-") - else: - print("Bad choice") - sys.exit(-1) - elif mode == "d": - args.delete = True - elif mode == "r": - args.reverse = True - else: - print("Bad mode") - sys.exit(-1) - - - if args.new_source: - if args.new_target or args.new_relation: - print("Please perform a single change at a time") - sys.exit(-1) - if args.add or args.delete: - print("Specifying new source when adding or deleting does not make sense") - sys.exit(-1) - - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.new_source): - print("New source sense %d does not exist" % args.new_source) - sys.exit(-1) - change_manager.update_source_sense( - wn, args.source_id, args.target_id, args.new_source) - else: - new_source = wn.synset_by_id(args.new_source) - - if not new_source: - print( - "Could not find the new source synset %s" % - args.new_source) - sys.exit(-1) - - change_manager.update_source( - wn, source_synset, target_synset, new_source) - - elif args.new_target: - if args.new_source or args.new_relation: - print("Please perform a single change at a time") - sys.exit(-1) - if args.add or args.delete: - print("Specifying new source when adding or deleting does not make sense") - sys.exit(-1) - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.new_target): - print("New target sense %d does not exist" % args.new_target) - sys.exit(-1) - change_manager.update_target_sense( - wn, args.source_id, args.target_id, args.new_target) - else: - new_target = wn.synset_by_id(args.new_target) - - if not new_target: - print( - "Could not find the new target synset %s" % - args.new_target) - sys.exit(-1) - - change_manager.update_target( - wn, source_synset, target_synset, new_target) - - elif args.new_relation: - if args.new_source or args.new_target: - print("Please perform a single change at a time") - sys.exit(-1) - - if source_entry_id: - if args.new_relation not in wordnet.SenseRelType._value2member_map_: - print("Not a valid relation type %s" % args.new_relation) - sys.exit(-1) - else: - if args.new_relation not in wordnet.SynsetRelType._value2member_map_: - print("Not a valid relation type %s" % args.new_relation) - sys.exit(-1) - - if args.add: - if args.delete: - print("Cannot both add and delete a relation") - sys.exit(-1) - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - if args.source_id == args.target_id: - print("Won't link sense %d to itself" % args.source_id) - sys.exit(-1) - change_manager.add_sense_relation( - wn, args.source_id, args.target_id, wordnet.SenseRelType( - args.new_relation)) - else: - if source_synset == target_synset: - print("Won't link synset %s to itself" % source_synset) - sys.exit(-1) - change_manager.add_relation( - wn, - source_synset, - target_synset, - wordnet.SynsetRelType( - args.new_relation)) - elif args.delete: - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - change_manager.delete_sense_relation( - wn, args.source_id, args.target_id) - else: - change_manager.delete_relation( - wn, source_synset, target_synset) - else: - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - change_manager.update_sense_relation( - wn, args.source_id, args.target_id, wordnet.SenseRelType( - args.new_relation)) - else: - change_manager.update_relation( - wn, source_synset, target_synset, wordnet.SynsetRelType( - args.new_relation)) - elif args.delete: - if args.add: - print("Cannot both add and delete a relation") - sys.exit(-1) - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - change_manager.delete_sense_relation( - wn, args.source_id, args.target_id) - else: - change_manager.delete_relation(wn, source_synset, target_synset) - elif args.reverse: - if source_entry_id or target_entry_id: - if not change_manager.sense_exists(wn, args.source_id): - print("Source sense %d does not exist" % args.source_id) - sys.exit(-1) - if not change_manager.sense_exists(wn, args.target_id): - print("Target sense %d does not exist" % args.target_id) - sys.exit(-1) - change_manager.reverse_sense_rel( - wn, args.source_id, args.target_id) - else: - change_manager.reverse_rel(wn, source_synset, target_synset) - - else: - print("No change specified") - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/change-synset.py b/scripts/change-synset.py deleted file mode 100644 index 5b0211d6..00000000 --- a/scripts/change-synset.py +++ /dev/null @@ -1,95 +0,0 @@ -import sys -import wordnet -import argparse -import re -import change_manager -import csv - - -def main(): - parser = argparse.ArgumentParser(description="Add or remove a synset") - parser.add_argument('synset', metavar='SYNSET_ID', type=str, nargs="?", - help="The ID of the synset to change") - parser.add_argument('--add', action='store_true', - help="Add this synset") - parser.add_argument('--delete', action='store_true', - help="Remove this synset") - parser.add_argument( - '--reason', - type=str, - help="The reason for a deletion or merge (required for deletion)") - parser.add_argument('--definition', type=str, - help="The definition of the new synset") - parser.add_argument('--lexfile', type=str, - help="The lexicographer file to write the synset to") - parser.add_argument('--pos', type=str, - help="The part of speech (n|v|a|r|s)") - parser.add_argument( - '--supersededby', - type=str, - help="The ID of the superseding synset (required for deletion)") - - args = parser.parse_args() - - wn = change_manager.load_wordnet() - - if not args.delete and not args.add: - mode = input("(A)dd synset/(d)elete synset: ").lower() - if mode == "a": - args.add = True - elif mode == "d": - args.delete = True - else: - print("Bad mode: " + mode) - sys.exit(-1) - - if args.delete: - if not args.synset: - args.synset = "oewn-" + input("Enter synset ID: oewn-") - synset = wn.synset_by_id(args.synset) - - if not synset: - print("Could not find synset") - sys.exit(-1) - - if not args.reason: - args.reason = input("Reason for deletion with (#IssueNo): ") - - if not args.supersededby: - args.supersededby = "oewn-" + \ - input("Enter superseding synset ID: oewn-") - - supersede_synset = wn.synset_by_id(args.supersededby) - - if not supersede_synset: - print("Could not find synset") - sys.exit(-1) - - if args.add: - if not args.definition: - args.definition = input("Definition: ") - if not args.lexfile: - args.lexfile = input("Lexicographer file: ") - if not args.pos: - args.pos = input( - "Part of speech (n)oun/(v)erb/(a)djective/adve(r)b/(s)atellite: ").lower() - - if args.add: - new_id = change_manager.add_synset( - wn, args.definition, args.lexfile, args.pos) - print( - "New synset created with ID %s. Please use change-entry and change-relation scripts to add entries and relations" % - new_id) - elif args.delete: - if not args.reason: - print("Please give a reason for deletion") - sys.exit(-1) - change_manager.delete_synset(wn, synset, supersede_synset, args.reason) - else: - print("No action chosen") - sys.exit(-1) - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/change_manager.py b/scripts/change_manager.py deleted file mode 100644 index e6a68394..00000000 --- a/scripts/change_manager.py +++ /dev/null @@ -1,812 +0,0 @@ -"""Utility functions for changing the wordnet""" -from wordnet import * -import pickle -import os -from glob import glob -import fileinput -import hashlib -from merge import wn_merge -import wordnet_yaml -from collections import defaultdict -from sense_keys import get_sense_key -from pathlib import Path - -sense_id_re = re.compile(r"oewn-(.*)-(.)-(\d{8})-\d{2}") - - -class ChangeList: - def __init__(self): - self.lexfiles = set() - self.entry_files = set() - - def change_entry(self, wn, entry): - for sense in entry.senses: - synset = wn.synset_by_id(sense.synset) - self.lexfiles.add(synset.lex_name) - entry_key = entry.lemma.written_form[0].lower() - if entry_key < 'a' or entry_key > 'z': - entry_key = '0' - self.entry_files.add(entry_key) - - def change_synset(self, synset): - self.lexfiles.add(synset.lex_name) - - -def load_wordnet(): - """Load the wordnet from disk""" - mode = None - # Use whichever version is latest - mtime_xml = max(os.path.getmtime(f) for f in glob("src/xml/*.xml")) - mtime_yaml = max(os.path.getmtime(f) for f in glob("src/yaml/*.yaml")) - if os.path.exists("wn.xml"): - mtime_wn_xml = os.path.getmtime("wn.xml") - else: - mtime_wn_xml = 0 - if os.path.exists("wn.pickle"): - mtime_pickle = os.path.getmtime("wn.pickle") - else: - mtime_pickle = 0 - if mtime_yaml > mtime_xml and mtime_yaml > mtime_wn_xml and mtime_yaml > mtime_pickle: - print("Reading from YAML") - wn = wordnet_yaml.load() - pickle.dump(wn, open("wn.pickle", "wb")) - elif mtime_xml > mtime_wn_xml and mtime_xml > mtime_pickle: - print("Merging and reading XML") - wn_merge() - wn = parse_wordnet("wn.xml") - pickle.dump(wn, open("wn.pickle", "wb")) - elif mtime_wn_xml > mtime_pickle: - print("Reading XML") - wn = parse_wordnet("wn.xml") - pickle.dump(wn, open("wn.pickle", "wb")) - else: - wn = pickle.load(open("wn.pickle", "rb")) - return wn - - -def save(wn, change_list=None): - """Save the wordnet to disk (all formats)""" - wordnet_yaml.save(wn, change_list) - save_all_xml(wn, change_list) - with codecs.open("wn.xml", "w", "utf-8") as outp: - wn.to_xml(outp, True) - pickle.dump(wn, open("wn.pickle", "wb")) - - -def save_all_xml(wn, change_list=None): - by_lex_name = {} - for synset in wn.synsets: - if synset.lex_name not in by_lex_name: - by_lex_name[synset.lex_name] = Lexicon( - "oewn", "Open English WordNet", "en", - "john@mccr.ae", "https://wordnet.princeton.edu/license-and-commercial-use", - "2019", "https://github.com/globalwordnet/english-wordnet") - by_lex_name[synset.lex_name].frames = wn.frames - by_lex_name[synset.lex_name].add_synset(synset) - - for entry in wn.entries: - sense_no = dict([(e.id, i) for i, e in enumerate(entry.senses)]) - for lex_name in by_lex_name.keys(): - senses = [ - sense for sense in entry.senses if wn.synset_by_id( - sense.synset).lex_name == lex_name] - if senses: - e = LexicalEntry(entry.id) - e.set_lemma(entry.lemma) - for f in entry.forms: - e.add_form(f) - for s in senses: - s.n = sense_no[s.id] - e.add_sense(s) - - #def find_sense_for_sb(sb_sense): - # for sense2 in senses: - # if sense2.id == sb_sense: - # return sense2.id - # return None - #e.syntactic_behaviours = [SyntacticBehaviour( - # sb.subcategorization_frame, - # [find_sense_for_sb(sense) for sense in sb.senses]) - # for sb in entry.syntactic_behaviours] - #e.syntactic_behaviours = [SyntacticBehaviour( - # sb.subcategorization_frame, [s for s in sb.senses if s]) - # for sb in e.syntactic_behaviours if any(sb.senses)] - e.pronunciation = entry.pronunciation - by_lex_name[lex_name].add_entry(e) - - for lex_name, wn in by_lex_name.items(): - if os.path.exists( - "src/xml/wn-%s.xml" % - lex_name) and ( - not change_list or lex_name in change_list.lexfiles): - wn_lex = parse_wordnet("src/xml/wn-%s.xml" % lex_name) - wn.comments = wn_lex.comments - entry_order = defaultdict( - lambda: 10000000, [ - (e, i) for i, e in enumerate( - entry.id for entry in wn_lex.entries)]) - wn.entries = sorted(wn.entries, key=lambda e: entry_order[e.id]) - for entry in wn.entries: - if wn_lex.entry_by_id(entry.id): - sense_order = defaultdict( - lambda: 10000, [ - (e, i) for i, e in enumerate( - sense.id for sense in wn_lex.entry_by_id( - entry.id).senses)]) - entry.senses = sorted( - entry.senses, key=lambda s: sense_order[s.id]) - # This is a bit of a hack as some of the n values are not - # continguous - for sense in entry.senses: - if wn_lex.sense_by_id(sense.id): - sense.n = wn_lex.sense_by_id(sense.id).n - sense_rel_order = defaultdict( - lambda: 10000, [ - ((sr.target, sr.rel_type), i) for i, sr in enumerate( - wn_lex.sense_by_id( - sense.id).sense_relations)]) - sense.sense_relations = sorted( - sense.sense_relations, key=lambda sr: sense_rel_order[(sr.target, sr.rel_type)]) - else: - print("sense not found:" + sense.id) - #sb_order = defaultdict( - # lambda: 10000, [ - # (e, i) for i, e in enumerate( - # sb.subcategorization_frame for sb in wn_lex.entry_by_id( - # entry.id).syntactic_behaviours)]) - #entry.syntactic_behaviours = sorted( - # entry.syntactic_behaviours, key=lambda sb: sb_order[sb.subcategorization_frame]) - #for sb in entry.syntactic_behaviours: - # sb2s = [sb2 for sb2 in wn_lex.entry_by_id(entry.id).syntactic_behaviours - # if sb2.subcategorization_frame == sb.subcategorization_frame] - # if sb2s: - # sbe_order = defaultdict( - # lambda: 10000, [ - # (e, i) for i, e in enumerate( - # sb2s[0].senses)]) - # sb.senses = sorted( - # sb.senses, key=lambda s: sbe_order[s]) - else: - print("not found:" + entry.id) - synset_order = defaultdict( - lambda: 1000000, [ - (e, i) for i, e in enumerate( - synset.id for synset in wn_lex.synsets)]) - wn.synsets = sorted(wn.synsets, key=lambda s: synset_order[s.id]) - for synset in wn.synsets: - if wn_lex.synset_by_id(synset.id): - synset_rel_order = defaultdict( - lambda: 10000, [ - ((sr.target, sr.rel_type), i) for i, sr in enumerate( - wn_lex.synset_by_id( - synset.id).synset_relations)]) - synset.synset_relations = sorted( - synset.synset_relations, key=lambda sr: synset_rel_order[(sr.target, sr.rel_type)]) - if not change_list or lex_name in change_list.lexfiles: - Path("src/xml").mkdir(parents=True, exist_ok=True) - with codecs.open("src/xml/wn-%s.xml" % lex_name, "w", "utf-8") as outp: - wn.to_xml(outp, True) - - -def delete_rel(source, target, change_list=None): - """Delete all relationships between two synsets""" - print("Delete %s =*=> %s" % (source.id, target.id)) - ss = source - source.synset_relations = [ - r for r in ss.synset_relations if r.target != target.id] - if change_list: - change_list.change_synset(source) - - -def decompose_sense_id(sense_id): - m = sense_id_re.match(sense_id) - if m: - lemma = m.group(1) - pos = m.group(2) - ssid = m.group(3) - return ("oewn-%s-%s" % (ssid, pos), "oewn-%s-%s" % (lemma, pos)) - else: - raise Exception("Not a sense ID") - - -def delete_sense_rel(wn, source, target, change_list=None): - """Delete all relationships between two senses""" - print("Delete %s =*=> %s" % (source, target)) - (source_synset, source_entry) = decompose_sense_id(source) - lex_name = wn.synset_by_id(source_synset).lex_name - entry = wn.entry_by_id(source_entry) - if change_list: - change_list.change_entry(wn, entry) - sense = [sense for sense in entry.senses if sense.id == source][0] - sense.sense_relations = [ - r for r in sense.sense_relations if r.target != target] - - -def insert_rel(source, rel_type, target, change_list=None): - """Insert a single relation between two synsets""" - print("Insert %s =%s=> %s" % (source.id, rel_type, target.id)) - ss = source - if [r for r in ss.synset_relations if r.target == - target.id and r.rel_type == rel_type]: - print("Already exists") - return - ss.synset_relations.append(SynsetRelation(target.id, rel_type)) - if change_list: - change_list.change_synset(target) - - -def empty_if_none(x): - """Returns an empty list if passed None otherwise the argument""" - if x: - return x - else: - return [] - -KEY_PREFIX_LEN = 5 - -def synset_key(synset_id): - return synset_id[KEY_PREFIX_LEN:-2] - - -def change_entry(wn, synset, target_synset, lemma, change_list=None): - """Change an entry, only works if both synsets are in the same file""" - print("Adding %s to synset %s" % (lemma, synset.id)) - n_entries = len(empty_if_none(wn.members_by_id(target_synset.id))) - entry_global = [entry for entry in empty_if_none(wn.entry_by_lemma(lemma)) - if wn.entry_by_id(entry).lemma.part_of_speech == synset.part_of_speech or - wn.entry_by_id(entry).lemma.part_of_speech == PartOfSpeech.ADJECTIVE and synset.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE or - wn.entry_by_id(entry).lemma.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE and synset.part_of_speech == PartOfSpeech.ADJECTIVE] - - if len(entry_global) == 1: - entry_global = wn.entry_by_id(entry_global[0]) - n_senses = len(entry_global.senses) - else: - entry_global = None - n_senses = 0 - - idx = n_entries + 1 - n = n_senses - - wn_synset = wn - entries = [entry for entry in empty_if_none(wn_synset.entry_by_lemma( - lemma)) if wn.entry_by_id(entry).lemma.part_of_speech == synset.part_of_speech] - - for entry in entries: - for sense in wn_synset.entry_by_id(entry).senses: - if sense.synset == synset.id: - print("Moving %s to %s" % (sense.id, target_synset.id)) - sense.synset = target_synset.id - wn.change_sense_id( - sense, - "oewn-%s-%s-%s-%02d" % - (escape_lemma(lemma), - target_synset.part_of_speech.value, - synset_key( - target_synset.id), - idx), - change_list) - if change_list: - change_list.change_entry(wn, entry) - - -def add_entry(wn, synset, lemma, idx=0, n=-1, change_list=None): - """Add a new lemma to a synset""" - print("Adding %s to synset %s" % (lemma, synset.id)) - n_entries = len(empty_if_none(wn.members_by_id(synset.id))) - entry_global = [entry for entry in empty_if_none(wn.entry_by_lemma(lemma)) - if wn.entry_by_id(entry).lemma.part_of_speech == synset.part_of_speech or - wn.entry_by_id(entry).lemma.part_of_speech == PartOfSpeech.ADJECTIVE and synset.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE or - wn.entry_by_id(entry).lemma.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE and synset.part_of_speech == PartOfSpeech.ADJECTIVE] - - if len(entry_global) == 1: - entry_global = wn.entry_by_id(entry_global[0]) - n_senses = len(entry_global.senses) - else: - entry_global = None - n_senses = 0 - - if idx <= 0: - idx = n_entries + 1 - elif idx > n_entries + 1: - raise Exception("IDX value specified is higher than number of entries") - elif idx == n_entries + 1: - pass - else: - for sense_id in sense_ids_for_synset(wn, synset): - this_idx = int(sense_id[-2:]) - if this_idx >= idx: - change_sense_idx(wn, sense_id, this_idx + 1) - - if n < 0: - n = n_senses - elif n > n_senses: - raise Exception("n value exceeds number of senses for lemma") - elif n == n_senses: - pass - else: - sense_n = 0 - for sense in entry_global.senses: - if sense_n >= n: - change_sense_n(wn, entry_global, sense.id, sense_n + 1) - sense_n += 1 - - wn_synset = wn - entries = [entry for entry in empty_if_none(wn_synset.entry_by_lemma( - lemma)) if wn.entry_by_id(entry).lemma.part_of_speech == synset.part_of_speech] - - if entries: - if len(entries) != 1: - raise Exception("More than one entry for part of speech") - print("Found an entry!") - wn_entry = wn.entry_by_id(entries[0]) - entry = wn_synset.entry_by_id(entries[0]) - sense = Sense( - id="oewn-%s-%s-%s-%02d" % - (escape_lemma(lemma), - synset.part_of_speech.value, - synset_key( - synset.id), - idx), - synset=synset.id, - n=n, - sense_key=None) - - wn_entry.senses.append(sense) - entry.senses.append(sense) - sense.sense_key = get_sense_key(wn, entry, sense, synset.lex_name) - if sense.synset not in wn.members: - wn.members[sense.synset] = [] - wn.members[sense.synset].append(wn_entry.lemma.written_form) - else: - n = 0 - print("Creating new entry") - entry = LexicalEntry( - "oewn-%s-%s" % (escape_lemma(lemma), synset.part_of_speech.value)) - entry.set_lemma(Lemma(lemma, synset.part_of_speech)) - sense = Sense( - id="oewn-%s-%s-%s-%02d" % - (escape_lemma(lemma), - synset.part_of_speech.value, - synset_key( - synset.id), - idx), - synset=synset.id, - n=n, - sense_key=None) - entry.add_sense(sense) - sense.sense_key = get_sense_key(wn, entry, sense, synset.lex_name) - wn.add_entry(entry) - if change_list: - change_list.change_entry(wn, entry) - return entry - - -def delete_entry(wn, synset, entry_id, change_list=None): - """Delete a lemma from a synset""" - print("Deleting %s from synset %s" % (entry_id, synset.id)) - n_entries = len(wn.members_by_id(synset.id)) - entry_global = wn.entry_by_id(entry_id) - - if entry_global: - idxs = [int(sense.id[-2:]) - for sense in entry_global.senses if sense.synset == synset.id] - if not idxs: - print("Entry not in synset") - return - idx = idxs[0] - n_senses = len(entry_global.senses) - else: - print("No entry for this lemma") - return - - if n_senses == 0: - entry = wn_synset.entry_by_id(entry_global.id) - if entry: - wn.del_entry(entry) - return - - if n_senses != 1: - n = [ind for ind, sense in enumerate( - entry_global.senses) if sense.synset == synset.id][0] - sense_n = 0 - for sense in entry_global.senses: - if sense_n >= n: - change_sense_n(wn, entry_global, sense.id, sense_n - 1) - sense_n += 1 - - for sense_id in sense_ids_for_synset(wn, synset): - this_idx = int(sense_id[-2:]) - if this_idx > idx: - change_sense_idx(wn, sense_id, this_idx - 1) - - for sense in entry_global.senses: - if sense.synset == synset.id: - for rel in sense.sense_relations: - delete_sense_rel(wn, rel.target, sense.id, change_list) - delete_sense_rel(wn, sense.id, rel.target, change_list) - - if n_senses == 1: # then delete the whole entry - wn_synset = wn - entry = wn_synset.entry_by_id(entry_global.id) - if change_list: - change_list.change_entry(wn, entry) - wn_synset.del_entry(entry) - wn.del_entry(entry) - else: - wn_synset = wn - entry = wn_synset.entry_by_id(entry_global.id) - if change_list: - change_list.change_entry(wn, entry) - sense = [s for s in entry.senses if s.synset == synset.id] - if sense: - sense = sense[0] - wn_synset.del_sense(entry, sense) - wn.del_sense(entry, sense) - else: - print("this may be a bug") - - -def delete_synset( - wn, - synset, - supersede, - reason, - delent=True, - change_list=None): - """Delete a synset""" - print("Deleting synset %s" % synset.id) - - if delent: - entries = empty_if_none(wn.members_by_id(synset.id)) - - for entry in entries: - delete_entry( - wn, synset, "oewn-%s-%s" % - (escape_lemma(entry), synset.part_of_speech.value), change_list) - - for rel in synset.synset_relations: - delete_rel(wn.synset_by_id(rel.target), synset, change_list) - - wn_synset = wn - wn_synset.synsets = [ss for ss in wn_synset.synsets - if synset.id != ss.id] - if supersede: - if not isinstance(supersede, list): - supersede = [supersede] - else: - supersede = [] - with open("src/deprecations.csv", "a") as out: - out.write("\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n" % - (synset.id, synset.ili, - ",".join(s.id for s in supersede), - ",".join(s.ili for s in supersede), - reason.replace("\n", "").replace("\"", "\"\""))) - if change_list: - change_list.change_synset(synset) - - -def change_sense_n(wn, entry, sense_id, new_n, change_list=None): - """Change the position of a sense within an entry (changes only this sense)""" - print("Changing n of sense %s of %s to %s" % - (sense_id, entry.lemma.written_form, new_n)) - if new_n <= 0: - return - - senses = [sense for sense in entry.senses if sense.id == sense_id] - if len(senses) != 1: - raise Exception("Could not find sense") - sense = senses[0] - synset = wn.synset_by_id(sense.synset) - lexname = synset.lex_name - - wn_synset = wn - entry = wn_synset.entry_by_id(entry.id) - sense = [sense for sense in entry.senses if sense.id == sense_id][0] - sense.n = new_n - if change_list: - change_list.change_entry(wn, entry) - - -def change_sense_idx(wn, sense_id, new_idx, change_list=None): - """Change the position of a lemma within a synset""" - print("Changing idx of sense %s to %s" % (sense_id, new_idx)) - new_sense_id = "%s-%02d" % (sense_id[:-3], new_idx) - for entry in wn.entries: - for sense in entry.senses: - if sense.id == sense_id: - wn.change_sense_id(sense, new_sense_id) - for sr in sense.sense_relations: - if sr.target == sense_id: - sr.target = new_sense_id - for sb in entry.syntactic_behaviours: - sb.senses = [ - new_sense_id if s == sense_id else s - for s in sb.senses] - if change_list: - change_list.change_entry(wn, entry) - - -def sense_ids_for_synset(wn, synset): - return [sense.id for lemma in wn.members_by_id(synset.id) - for entry in wn.entry_by_lemma(lemma) - for sense in wn.entry_by_id(entry).senses - if sense.synset == synset.id] - - -def new_id(wn, pos, definition): - s = hashlib.sha256() - s.update(definition.encode()) - nid = "oewn-8%07d-%s" % ((int(s.hexdigest(), 16) % 10000000), pos) - if wn.synset_by_id(nid): - print( - "Could not find ID for new synset. Either a duplicate definition or a hash collision for " + - nid + - ". Note it is possible to force a synset ID by giving it as an argument") - sys.exit(-1) - return nid - - -def add_synset(wn, definition, lexfile, pos, ssid=None, change_list=None): - if not ssid: - ssid = new_id(wn, pos, definition) - ss = Synset(ssid, "in", - PartOfSpeech(pos), lexfile) - ss.definitions = [Definition(definition)] - ss.ili_definition = Definition(definition) - wn.add_synset(ss) - if change_list: - change_list.change_synset(ss) - return ssid - - -def merge_synset(wn, synsets, reason, lexfile, ssid=None, change_list=None): - """Create a new synset merging all the facts from other synsets""" - pos = synsets[0].part_of_speech.value - if not ssid: - ssid = new_id(wn, pos, synsets[0].definitions[0].text) - ss = Synset(ssid, "in", - PartOfSpeech(pos), lexfile) - ss.definitions = [d for s in synsets for d in s.definitions] - ss.examples = [x for s in synsets for x in s.examples] - members = {} - wn.add_synset(ss) - - for s in synsets: - # Add all relations - for r in s.synset_relations: - if not any(r == r2 for r2 in ss.synset_relations): - add_relation( - wn, ss, wn.synset_by_id( - r.target), r.rel_type, change_list) - # Add members - for m in wn.members_by_id(s.id): - if m not in members: - members[m] = add_entry(wn, ss, m, change_list) - add_entry(wn, ss, m, change_list) - e = [e for e in [wn.entry_by_id(e2) for e2 in wn.entry_by_lemma(m)] - if e.lemma.part_of_speech.value == pos][0] - for f in e.forms: - if not any(f2 == f for f in members[m].forms): - members[m].add_form(f) - # syn behaviours - probably fix manually for the moment - if change_list: - change_list.change_synset(ss) - return ss - - -def find_type(source, target): - """Get the first relation type between the synsets""" - x = [r for r in source.synset_relations if r.target == target.id] - if len(x) != 1: - raise Exception( - "Synsets not linked or linked by more than one property") - return x[0].rel_type - - -def update_source(wn, old_source, target, new_source, change_list=None): - """Change the source of a link""" - rel_type = find_type(old_source, target) - delete_rel(old_source, target, change_list) - insert_rel(new_source, rel_type, target, change_list) - if rel_type in wordnet.inverse_synset_rels: - inv_rel_type = wordnet.inverse_synset_rels[rel_type] - delete_rel(target, old_source, change_list) - insert_rel(target, inv_rel_type, new_source, change_list) - - -def update_target(wn, source, old_target, new_target, change_list=None): - """Change the target of a link""" - rel_type = find_type(source, old_target) - delete_rel(source, old_target, change_list) - insert_rel(source, rel_type, new_target, change_list) - if rel_type in wordnet.inverse_synset_rels: - inv_rel_type = wordnet.inverse_synset_rels[rel_type] - delete_rel(old_target, source, change_list) - insert_rel(new_target, inv_rel_type, source, change_list) - - -def update_relation(wn, source, target, new_rel, change_list=None): - """Change the type of a link""" - delete_rel(source, target, change_list) - insert_rel(source, new_rel, target, change_list) - if new_rel in inverse_synset_rels: - inv_rel_type = inverse_synset_rels[new_rel] - delete_rel(target, source, change_list) - insert_rel(target, inv_rel_type, source, change_list) - - -def add_relation(wn, source, target, new_rel, change_list=None): - """Change the type of a link""" - insert_rel(source, new_rel, target, change_list) - if new_rel in inverse_synset_rels: - inv_rel_type = inverse_synset_rels[new_rel] - insert_rel(target, inv_rel_type, source, change_list) - - -def delete_relation(wn, source, target, change_list=None): - """Change the type of a link""" - delete_rel(source, target, change_list) - delete_rel(target, source, change_list) - - -def reverse_rel(wn, source, target, change_list=None): - """Reverse the direction of relations""" - rel_type = find_type(source, target) - delete_rel(source, target, change_list) - if rel_type in inverse_synset_rels: - delete_rel(target, source, change_list) - insert_rel(target, rel_type, source, change_list) - if rel_type in inverse_synset_rels: - inv_rel_type = inverse_synset_rels[rel_type] - insert_rel(source, inv_rel_type, target, change_list) - - -def delete_sense_rel(wn, source, target, change_list=None): - """Delete all relationships between two senses""" - print("Delete %s =*=> %s" % (source, target)) - (source_synset, source_entry) = decompose_sense_id(source) - lex_name = wn.synset_by_id(source_synset).lex_name - wn_source = wn - entry = wn_source.entry_by_id(source_entry) - if entry: - sense = [sense for sense in entry.senses if sense.id == source][0] - if not any(r for r in sense.sense_relations if r.target == target): - print("No sense relations deleted") - else: - sense.sense_relations = [ - r for r in sense.sense_relations if r.target != target] - if change_list: - change_list.change_entry(wn, entry) - else: - print("No entry for " + source_entry) - - -def insert_sense_rel(wn, source, rel_type, target, change_list=None): - """Insert a single relation between two senses""" - print("Insert %s =%s=> %s" % (source, rel_type, target)) - (source_synset, source_entry) = decompose_sense_id(source) - lex_name = wn.synset_by_id(source_synset).lex_name - wn_source = wn - entry = wn_source.entry_by_id(source_entry) - sense = [sense for sense in entry.senses if sense.id == source][0] - sense.sense_relations.append(SenseRelation(target, rel_type)) - if change_list: - change_list.change_entry(wn, entry) - - -def find_sense_type(wn, source, target): - """Get the first relation type between the senses""" - (source_synset, source_entry) = decompose_sense_id(source) - entry = wn.entry_by_id(source_entry) - sense = [sense for sense in entry.senses if sense.id == source][0] - x = set([r for r in sense.sense_relations if r.target == target]) - if len(x) == 0: - raise Exception( - "Synsets not linked or linked by more than one property") - return next(iter(x)).rel_type - - -def update_source_sense(wn, old_source, target, new_source, change_list=None): - """Change the source of a link""" - rel_type = find_sense_type(wn, old_source, target) - delete_sense_rel(wn, old_source, target, change_list) - insert_sense_rel(wn, new_source, rel_type, target, change_list) - if rel_type in inverse_sense_rels: - inv_rel_type = inverse_sense_rels[rel_type] - delete_sense_rel(wn, target, old_source, change_list) - insert_sense_rel(wn, target, inv_rel_type, new_source, change_list) - - -def update_target_sense(wn, source, old_target, new_target, change_list=None): - """Change the target of a link""" - rel_type = find_sense_type(wn, source, old_target) - delete_sense_rel(wn, source, old_target, change_list) - insert_sense_rel(wn, source, rel_type, new_target, change_list) - if rel_type in inverse_sense_rels: - inv_rel_type = inverse_sense_rels[rel_type] - delete_sense_rel(wn, old_target, source, change_list) - insert_sense_rel(wn, new_target, inv_rel_type, source, change_list) - - -def update_sense_relation(wn, source, target, new_rel, change_list=None): - """Change the type of a link""" - delete_sense_rel(wn, source, target, change_list) - insert_sense_rel(wn, source, new_rel, target, change_list) - if new_rel in inverse_sense_rels: - inv_rel_type = inverse_sense_rels[new_rel] - delete_sense_rel(wn, target, source, change_list) - insert_sense_rel(wn, target, inv_rel_type, source, change_list) - - -def add_sense_relation(wn, source, target, new_rel, change_list=None): - """Change the type of a link""" - insert_sense_rel(wn, source, new_rel, target, change_list) - if new_rel in inverse_sense_rels: - inv_rel_type = inverse_sense_rels[new_rel] - insert_sense_rel(wn, target, inv_rel_type, source, change_list) - - -def delete_sense_relation(wn, source, target, change_list=None): - """Change the type of a link""" - delete_sense_rel(wn, source, target, change_list) - delete_sense_rel(wn, target, source, change_list) - - -def reverse_sense_rel(wn, source, target, change_list=None): - """Reverse the direction of a sense relation""" - rel_type = find_sense_type(wn, source, target) - delete_sense_rel(wn, source, target, change_list) - if rel_type in inverse_sense_rels: - delete_sense_rel(wn, target, source, change_list) - insert_sense_rel(wn, target, rel_type, source, change_list) - if rel_type in inverse_sense_rels: - inv_rel_type = inverse_sense_rels[rel_type] - insert_sense_rel(wn, source, inv_rel_type, target, change_list) - - -def sense_exists(wn, sense_id): - if sense_id_re.match(sense_id): - (_, entry_id) = decompose_sense_id(sense_id) - entry = wn.entry_by_id(entry_id) - if entry: - senses = [sense for sense in entry.senses if sense.id == sense_id] - return len(senses) == 1 - return False - - -def update_def(wn, synset, defn, add, change_list=None): - wn_synset = wn - ss = wn_synset.synset_by_id(synset.id) - if add: - ss.definitions = ss.definitions + [Definition(defn)] - else: - ss.definitions = [Definition(defn)] - if change_list: - change_list.change_synset(synset) - - -def update_ili_def(wn, synset, defn, change_list=None): - wn_synset = wn - ss = wn_synset.synset_by_id(synset.id) - ss.ili_definition = Definition(defn) - if change_list: - change_list.change_synset(synset) - - -def add_ex(wn, synset, example, change_list=None): - wn_synset = wn - ss = wn_synset.synset_by_id(synset.id) - ss.examples = ss.examples + [Example(example)] - if change_list: - change_list.change_synset(synset) - - -def delete_ex(wn, synset, example, change_list=None): - wn_synset = wn - ss = wn_synset.synset_by_id(synset.id) - n_exs = len(ss.examples) - ss.examples = [ex for ex in ss.examples if ex.text != example] - if len(ss.examples) == n_exs: - print("No change") - if change_list: - change_list.change_synset(synset) diff --git a/scripts/check_sense_ids.py b/scripts/check_sense_ids.py deleted file mode 100644 index 56f42290..00000000 --- a/scripts/check_sense_ids.py +++ /dev/null @@ -1,24 +0,0 @@ -# This script fixes sense IDs to be strictly increasing -# It won't be needed after 2021 release -from wordnet import * - -wn = parse_wordnet("wn.xml") - -for synset in wn.synsets: - members = wn.members_by_id(synset.id) - senses = [ - [sense for entry in wn.entry_by_lemma(member) - for sense in wn.entry_by_id(entry).senses - if sense.synset == synset.id][0] - for member in members] - senses = sorted(senses, key=lambda s: s.id[-2:]) - actual = sorted([s.id[-2:] for s in senses]) - if actual[0] == '00': - goal = ["%02d" % i for i in range(len(senses))] - else: - goal = ["%02d" % (i + 1) for i in range(len(senses))] - if goal != actual: - for (sense, g) in zip(senses, goal): - if sense.id[-2:] != g: - print("sed -i 's/%s/%s/' src/xml/*.xml" % - (sense.id, sense.id[:-2] + g)) diff --git a/scripts/from-yaml.py b/scripts/from-yaml.py deleted file mode 100644 index ad9b7e4a..00000000 --- a/scripts/from-yaml.py +++ /dev/null @@ -1,11 +0,0 @@ -import wordnet_yaml -import change_manager - - -def main(): - wn = wordnet_yaml.load() - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/wordnet_yaml.py b/scripts/from_yaml.py similarity index 66% rename from scripts/wordnet_yaml.py rename to scripts/from_yaml.py index 83af9caf..7af46e51 100644 --- a/scripts/wordnet_yaml.py +++ b/scripts/from_yaml.py @@ -1,36 +1,57 @@ -"""WordNet YAML interface""" +"""Converts the internal YAML data into a GWA standard XML file and + writes it to `wn.xml`""" +import sys import yaml from glob import glob -from wordnet import * from yaml import CLoader import codecs -import os -from collections import defaultdict +from wordnet import (Lexicon, Lemma, PartOfSpeech, LexicalEntry, Sense, + SenseRelation, Definition, Example, Pronunciation, + Synset, SynsetRelation, SynsetRelType, Form, + SenseRelType, OtherSenseRelType, SyntacticBehaviour, + escape_lemma, inverse_sense_rels, + inverse_synset_rels) entry_orders = {} KEY_PREFIX_LEN = 5 # = len("oewn-") def map_sense_key(sk): + """ + Maps a sense key into an OEWN from + """ if "%" in sk: e = sk.split("%") - return ("oewn-" + e[0].replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cl-").replace("+","-pl-") + + return ("oewn-" + e[0].replace("'","-ap-").replace("/","-sl-") + .replace("!","-ex-").replace(",","-cm-") + .replace(":","-cl-").replace("+","-pl-") + "__" + e[1].replace("_","-sp-").replace(":",".")) else: - return "oewn-" + sk.replace("%", "__").replace("'","-ap-").replace("/","-sl-").replace("!","-ex-").replace(",","-cm-").replace(":","-cl-").replace("+","-pl-") + return ("oewn-" + sk.replace("%", "__").replace("'","-ap-") + .replace("/","-sl-").replace("!","-ex-").replace(",","-cm-") + .replace(":","-cl-").replace("+","-pl-")) def unmap_sense_key(sk): + """ + Maps an OEWN sense key to a WN sense key + """ if "__" in sk: e = sk.split("__") - l = e[0][KEY_PREFIX_LEN:] + oewn_key = e[0][KEY_PREFIX_LEN:] r = "__".join(e[1:]) - return (l.replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + + return (oewn_key.replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!") + .replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + "%" + r.replace(".", ":").replace("-sp-","_")) else: - return sk[KEY_PREFIX_LEN:].replace("__", "%").replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + return (sk[KEY_PREFIX_LEN:].replace("__", "%").replace("-ap-", "'") + .replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",") + .replace("-cl-",":").replace("-pl-","+")) def make_pos(y, pos): + """ + Convert a part of speech value to a single character + """ if "adjposition" in y: return y["adjposition"] + "-" + pos elif len(pos) > 1: @@ -40,11 +61,17 @@ def make_pos(y, pos): def make_sense_id(y, lemma, pos): + """ + Create a sense ID from a YAML entry + """ return "oewn-%s-%s-%s" % ( escape_lemma(lemma), make_pos(y, pos), y["synset"][:-2]) def sense_from_yaml(y, lemma, pos, n): + """ + Create a Sense object from the YAML data + """ s = Sense(map_sense_key(y["id"]), "oewn-" + y["synset"], None, n, y.get("adjposition")) @@ -66,15 +93,19 @@ def sense_from_yaml(y, lemma, pos, n): return s def pronunciation_from_yaml(props): - return [Pronunciation(p["value"], p.get("variety")) for p in props.get("pronunciation",[])] - - -def pronunciation_from_yaml(props): - return [Pronunciation(p["value"], p.get("variety")) for p in props.get("pronunciation",[])] + """ + Create a Pronunciation object from the YAML data + """ + return [Pronunciation(p["value"], p.get("variety")) + for p in props.get("pronunciation",[])] def synset_from_yaml(wn, props, id, lex_name): + """ + Create a Synset from the YAML data + """ if "partOfSpeech" not in props: - print(props) + print("No part of speech for %s" % id) + raise ValueError ss = Synset("oewn-" + id, props.get("ili", "in"), PartOfSpeech(props["partOfSpeech"]), @@ -99,6 +130,9 @@ def synset_from_yaml(wn, props, id, lex_name): return ss def entry_for_synset(wn, ss, lemma): + """ + Find the entry for a synset member + """ for e in wn.entry_by_lemma(lemma): for s in wn.entry_by_id(e).senses: if s.synset == ss.id: @@ -108,6 +142,9 @@ def entry_for_synset(wn, ss, lemma): def fix_sense_rels(wn, sense): + """ + Add inverse sense relations as needed + """ for rel in sense.sense_relations: target_id = rel.target if (rel.rel_type in inverse_sense_rels @@ -122,6 +159,9 @@ def fix_sense_rels(wn, sense): def fix_synset_rels(wn, synset): + """ + Add inverse synset relations as needed + """ for rel in synset.synset_relations: if (rel.rel_type in inverse_synset_rels and inverse_synset_rels[rel.rel_type] != rel.rel_type): @@ -136,11 +176,14 @@ def fix_synset_rels(wn, synset): inverse_synset_rels[rel.rel_type])) -def load(): - wn = Lexicon("oewn", "Engish WordNet", "en", +def load(year="2022"): + """ + Load wordnet from YAML files + """ + wn = Lexicon("oewn", "Open Engish WordNet", "en", "english-wordnet@googlegroups.com", "https://creativecommons.org/licenses/by/4.0", - "2022", + year, "https://github.com/globalwordnet/english-wordnet") with open("src/yaml/frames.yaml", encoding="utf-8") as inp: frames = yaml.load(inp, Loader=CLoader) @@ -172,16 +215,6 @@ def load(): wn.add_synset(synset_from_yaml(wn, props, id, lex_name)) entry_orders[id] = props["members"] - # This is a big hack because of some inconsistencies in the XML that should - # be gone soon - synset_ids_starting_from_zero = set() - for f in glob("src/xml/*.xml"): - wn_lex = parse_wordnet(f) - for entry in wn_lex.entries: - for sense in entry.senses: - if sense.id.endswith("00"): - synset_ids_starting_from_zero.add(sense.synset) - for entry in wn.entries: for sense in entry.senses: fix_sense_rels(wn, sense) @@ -198,18 +231,6 @@ def load(): "2019", "https://github.com/globalwordnet/english-wordnet") by_lex_name[synset.lex_name].add_synset(synset) - for lex_name, wn2 in by_lex_name.items(): - if os.path.exists("src/xml/wn-%s.xml" % lex_name): - wn_lex = parse_wordnet("src/xml/wn-%s.xml" % lex_name) - senseids = { - sense.id[:-2]: sense.id for entry in wn_lex.entries for sense in entry.senses} - for entry in wn2.entries: - if wn_lex.entry_by_id(entry.id): - # Fix the last ID, because it is not actually so - # predicatable in the XML - for sense in entry.senses: - sense.id = senseids.get(sense.id[:-2], sense.id) - return wn @@ -331,8 +352,9 @@ def example_to_yaml(wn, x): SynsetRelType.CO_INSTRUMENT_RESULT]) -def lemma2senseorder(wn, l, synset_id): - for e2 in wn.entry_by_lemma(l): +def lemma2senseorder(wn, lemma, synset_id): + """Find sense order of lemmas""" + for e2 in wn.entry_by_lemma(lemma): for sense in wn.entry_by_id(e2).senses: if sense.synset == synset_id: return sense.id[-2:] @@ -342,85 +364,20 @@ def lemma2senseorder(wn, l, synset_id): def entries_ordered(wn, synset_id): """Get the lemmas for entries ordered correctly""" e = wn.members_by_id(synset_id) - e.sort(key=lambda l: lemma2senseorder(wn, l, synset_id)) + e.sort(key=lambda lemma: lemma2senseorder(wn, lemma, synset_id)) return e -def save(wn, change_list=None): - entry_yaml = {c: {} for c in char_range('a', 'z')} - entry_yaml['0'] = {} - for entry in wn.entries: - e = {} - if entry.forms: - e['form'] = [f.written_form for f in entry.forms] - - sb_map = defaultdict(lambda: []) -# for sb in entry.syntactic_behaviours: -# sb_name = frames_inv[sb.subcategorization_frame] -# for sense in sb.senses: -# sb_map[sense].append(sb_name) -# - e['sense'] = [sense_to_yaml(wn, s, sb_map) for s in entry.senses] - if entry.pronunciation: - e['pronunciation'] = [] - for p in entry.pronunciation: - if p.variety: - e['pronunciation'].append({'value':p.value, 'variety': p.variety}) - else: - e['pronunciation'].append({'value':p.value}) - - first = entry.lemma.written_form[0].lower() - if first not in char_range('a', 'z'): - first = '0' - if entry.lemma.written_form not in entry_yaml[first]: - entry_yaml[first][entry.lemma.written_form] = {} - if entry.lemma.part_of_speech.value in entry_yaml[first][entry.lemma.written_form]: - print( - "Duplicate: %s - %s" % - (entry.lemma.written_form, - entry.lemma.part_of_speech.value)) - entry_yaml[first][entry.lemma.written_form][entry.lemma.part_of_speech.value] = e - - for c in char_range('a', 'z'): - if not change_list or c in change_list.entry_files: - with codecs.open("src/yaml/entries-%s.yaml" % c, "w", "utf-8") as outp: - outp.write(yaml.dump(entry_yaml[c], default_flow_style=False, - allow_unicode=True)) - if not change_list or '0' in change_list.entry_files: - with codecs.open("src/yaml/entries-0.yaml", "w", "utf-8") as outp: - outp.write(yaml.dump(entry_yaml['0'], default_flow_style=False, - allow_unicode=True)) - - synset_yaml = {} - for synset in wn.synsets: - s = {} - if synset.ili and synset.ili != "in": - s["ili"] = synset.ili - s["partOfSpeech"] = synset.part_of_speech.value - s["definition"] = [ - definition_to_yaml( - wn, d) for d in synset.definitions] - if synset.examples: - s["example"] = [example_to_yaml(wn, x) for x in synset.examples] - if synset.source: - s["source"] = synset.source - for r in synset.synset_relations: - if r.rel_type not in ignored_symmetric_synset_rels: - if r.rel_type.value not in s: - s[r.rel_type.value] = [r.target[KEY_PREFIX_LEN:]] - else: - s[r.rel_type.value].append(r.target[KEY_PREFIX_LEN:]) - if synset.lex_name not in synset_yaml: - synset_yaml[synset.lex_name] = {} - synset_yaml[synset.lex_name][synset.id[KEY_PREFIX_LEN:]] = s - s["members"] = [wn.id2entry[m].lemma.written_form for m in synset.members] - - for key, synsets in synset_yaml.items(): - if not change_list or key in change_list.lexfiles: - with codecs.open("src/yaml/%s.yaml" % key, "w", "utf-8") as outp: - outp.write(yaml.dump(synsets, default_flow_style=False, - allow_unicode=True)) - - with open("src/yaml/frames.yaml", "w") as outp: - outp.write(yaml.dump(frames, default_flow_style=False, - allow_unicode=True)) +def main(): + if len(sys.argv) > 1: + year = sys.argv[1] + else: + year = "2024" + wn = load(year) + with codecs.open("wn.xml", "w", "utf-8") as outp: + wn.to_xml(outp, True) + + +if __name__ == "__main__": + main() + diff --git a/scripts/merge-synset.py b/scripts/merge-synset.py deleted file mode 100644 index 59829b8d..00000000 --- a/scripts/merge-synset.py +++ /dev/null @@ -1,75 +0,0 @@ -import sys -import wordnet -import argparse -import re -import change_manager -import csv -from merge import wn_merge - - -def main(): - parser = argparse.ArgumentParser( - description="Merge a synset - delete one or more synset and merge all properties. This may create weird or contradictory results so should be used with care") - parser.add_argument('synsets', metavar='SYNSET_ID', type=str, nargs="*", - help="The ID of the synset to change") - parser.add_argument( - '--reason', - type=str, - nargs="?", - help="The reason for this change including issue number") - parser.add_argument('--lex_file', type=str, - help="The lex file to write the new synset to") - - args = parser.parse_args() - - wn = change_manager.load_wordnet() - - if not args.synsets: - args.synsets = [] - print("Enter synsets (empty line to finish)") - while True: - id1 = input("Enter synset ID: oewn-") - if id1: - args.synsets.append("oewn-" + id1) - else: - break - - if not args.synsets: - print("Need at least one synset to merge") - exit(-1) - - synsets = [wn.synset_by_id(ss) for ss in args.synsets] - - if any(s is None for s in synsets): - print("Cannot find synset") - exit(-1) - - if any(s.part_of_speech != synsets[0].part_of_speech for s in synsets): - print("Merging across parts of speech is not correct!") - exit(-1) - - if not args.lex_file and any( - s.lex_name != synsets[0].lex_name for s in synsets): - print("Merging across lex files: " + - ", ".join(s.lex_name for s in synsets)) - args.lex_file = input("Lex file : ") - elif not args.lex_file: - args.lex_file = synsets[0].lex_name - - if not args.reason: - args.reason = input("Reason for deletion (#IssueNo): ") - - new_id = change_manager.merge_synset( - wn, synsets, args.reason, args.lex_file) - - wn_merge() - wn = change_manager.load_wordnet() - - for synset in synsets: - change_manager.delete_synset(wn, synset, - [new_id], - args.reason) - - -if __name__ == "__main__": - main() diff --git a/scripts/merge.py b/scripts/merge.py deleted file mode 100644 index c1385b3b..00000000 --- a/scripts/merge.py +++ /dev/null @@ -1,121 +0,0 @@ -import xml.etree.ElementTree as ET -from glob import glob - - -def merge_entry(e1, e2): - i = 0 - for c in e1: - if c.tag == "Lemma" or c.tag == "Form" or c.tag == "Sense" or c.tag == "Pronunciation": - i += 1 - for c in e2: - if c.tag == "Sense": - e1.insert(i, c) - i += 1 - if c.tag == "SyntacticBehaviour": - src = [c2 for c2 in e1 if c2.tag == "SyntacticBehaviour" and c2.attrib["subcategorizationFrame"] - == c.attrib["subcategorizationFrame"]] - if not src: - e1.insert(i, c) - i += 1 - else: - for s in src: - s.attrib["senses"] += " " + c.attrib["senses"] - return e1 - - -def order_entry(e): - f = ET.Element('LexicalEntry') - f.attrib = e.attrib - senses = [] - for c in e: - if c.tag == "Lemma" or c.tag == "Form" or c.tag == "Pronunciation": - f.append(c) - elif c.tag == "Sense": - senses.append(c) - senses.sort(key=lambda x: int(x.attrib["n"])) - for c in senses: - del c.attrib["n"] - f.append(c) - for c in e: - if c.tag == "SyntacticBehaviour": - f.append(c) - return f - - -def indent(elem, level=0): - i = "\n" + level * " " - j = "\n" + (level + 1) * " " - if len(elem): - elem.text = j - n = 1 - for subelem in elem: - if n != len(elem): - indent(subelem, level + 1) - subelem.tail = j - else: - subelem.tail = i - n += 1 - else: - elem.tail = i - return elem - - -def wn_merge(): - with open("wn.xml", "w", encoding="utf-8") as out: - out.write(""" - - - """) - lex_entries = {} - - ET.register_namespace("dc", "https://globalwordnet.github.io/schemas/dc/") - - for wn_part in sorted(glob("src/xml/wn-*.xml")): - tree = ET.parse(wn_part).getroot() - for element in tree[0]: - if(element.tag == "LexicalEntry"): - id = element.attrib["id"] - if id in lex_entries: - lex_entries[id] = merge_entry(lex_entries[id], element) - else: - lex_entries[id] = element - for (k, e) in lex_entries.items(): - out.write("\n ") - out.write( - ET.tostring( - indent( - order_entry(e), - level=2)).decode() .replace( - " xmlns:dc=\"https://globalwordnet.github.io/schemas/dc/\"", - "")) - out.write("\n ") - - for wn_part in glob("src/xml/wn-*.xml"): - tree = ET.parse(wn_part).getroot() - for element in tree[0]: - if(element.tag == "Synset"): - out.write(ET.tostring(element).decode() .replace( - " xmlns:dc=\"https://globalwordnet.github.io/schemas/dc/\"", "")) - tree = ET.parse("src/xml/wn-verb.body.xml").getroot() - for element in tree[0]: - if element.tag == "SyntacticBehaviour": - out.write(ET.tostring(element).decode() .replace( - " xmlns:dc=\"https://globalwordnet.github.io/schemas/dc/\"", "")) - out.write(""" - -""") - - -def main(): - wn_merge() - - -if __name__ == "__main__": - main() diff --git a/scripts/remove_examples_dquots.py b/scripts/remove_examples_dquots.py deleted file mode 100644 index 568f563a..00000000 --- a/scripts/remove_examples_dquots.py +++ /dev/null @@ -1,16 +0,0 @@ -import wordnet -import change_manager -from merge import wn_merge - -#wn_merge() -wn = change_manager.parse_wordnet("wn.xml") - -for synset in wn.synsets: - synset.examples = [ - wordnet.Example(ex.text[1:-1], ex.source) - if ex.text.startswith("\"") and ex.text.endswith("\"") - else ex - for ex in synset.examples] - - -change_manager.save(wn) diff --git a/scripts/split-synset.py b/scripts/split-synset.py deleted file mode 100644 index bd04c068..00000000 --- a/scripts/split-synset.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -import wordnet -import argparse -import re -import change_manager -import csv - - -def main(): - parser = argparse.ArgumentParser( - description="Split a synset - delete the synset and add two (or more) new synsets") - parser.add_argument('synset', metavar='SYNSET_ID', type=str, nargs="?", - help="The ID of the synset to change") - parser.add_argument( - '--definition', - type=str, - action='append', - help="The definition of the new synsets (repeat for each synset)") - parser.add_argument( - '--reason', - type=str, - nargs="?", - help="The reason for this change including issue number") - - args = parser.parse_args() - - wn = change_manager.load_wordnet() - - if not args.synset: - args.synset = "oewn-" + input("Enter synset ID: oewn-") - synset = wn.synset_by_id(args.synset) - - if not synset: - print("Cannot find synset") - exit(-1) - - if not args.definition: - args.definition = [] - print("Enter definitions (empty line to finish)") - while True: - d1 = input("Definition: ") - if d1: - args.definition.append(d1) - else: - break - - if not args.definition: - print("No new definitions") - exit(-1) - - if not args.reason: - args.reason = input("Reason for deletion (#IssueNo): ") - - new_ids = [] - for definition in args.definition: - new_ids.append( - change_manager.add_synset( - wn, - definition, - synset.lexfile, - synset.pos)) - - change_manager.delete_synset( - wn, synset, [ - wn.synset_for_id(new_id) for new_id in new_ids], args.reason) - change_manager.save_all_xml(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/to-yaml.py b/scripts/to-yaml.py deleted file mode 100644 index 025e65c1..00000000 --- a/scripts/to-yaml.py +++ /dev/null @@ -1,12 +0,0 @@ -from change_manager import parse_wordnet -from wordnet_yaml import save - - -def main(): - print("Don't run this, it will cause data loss!") - #wn = parse_wordnet("wn.xml") - #save(wn) - - -if __name__ == "__main__": - main() diff --git a/scripts/validate.py b/scripts/validate.py index fde287b5..5c5c51f2 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -1,10 +1,11 @@ -from wordnet import * +from wordnet import (parse_wordnet, SynsetRelType, PartOfSpeech, SenseRelType, + Synset, inverse_synset_rels, inverse_sense_rels, equal_pos) import re import sys import glob import sense_keys -from collections import Counter from sense_keys import unmap_sense_key +from collections import Counter def check_symmetry(wn, fix): errors = [] diff --git a/scripts/wordnet.py b/scripts/wordnet.py index 19109c58..60633e09 100644 --- a/scripts/wordnet.py +++ b/scripts/wordnet.py @@ -16,6 +16,7 @@ def __init__(self, id, label, language, email, license, version, url): self.license = license self.version = version self.url = url + self.citation = None self.entries = [] self.synsets = [] self.frames = [] @@ -105,10 +106,15 @@ def to_xml(self, xml_file, part=True): xml_file.write("""\n""") if part: xml_file.write( - """\n""") + """\n""") else: xml_file.write( - """\n""") + """\n""") + if self.citation: + citation_text = f""" + citation="{self.citation}" """ + else: + citation_text = "" xml_file.write( """ """ % (self.id, @@ -125,11 +131,12 @@ def to_xml(self, xml_file, part=True): self.email, self.license, self.version, + citation_text, self.url)) - for entry in self.entries: + for entry in sorted(self.entries, key=lambda x: x.id): entry.to_xml(xml_file, self.comments) - for synset in self.synsets: + for synset in sorted(self.synsets, key=lambda x: x.id): synset.to_xml(xml_file, self.comments) for synbeh in self.frames: synbeh.to_xml(xml_file) From fb0b8cee890d7f919a2f50673243ab19b5f3d1a5 Mon Sep 17 00:00:00 2001 From: "John P. McCrae" Date: Mon, 30 Sep 2024 13:25:52 +0100 Subject: [PATCH 2/2] Remove reference to wordnet_yaml --- scripts/sense_keys.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/scripts/sense_keys.py b/scripts/sense_keys.py index e874684c..f270acf0 100644 --- a/scripts/sense_keys.py +++ b/scripts/sense_keys.py @@ -2,7 +2,6 @@ from glob import glob import re from sys import exit -from wordnet_yaml import unmap_sense_key lex_filenums = { "src/xml/wn-adj.all.xml": 0, @@ -140,3 +139,22 @@ def get_sense_key(wn, e, s, wn_file): head_id = "" return "%s%%%d:%02d:%02d:%s:%s" % (lemma, ss_type, lex_filenum, lex_id, head_word, head_id) + +def unmap_sense_key(sk, KEY_PREFIX_LEN=5): + """ + Maps an OEWN sense key to a WN sense key + """ + if "__" in sk: + e = sk.split("__") + oewn_key = e[0][KEY_PREFIX_LEN:] + r = "__".join(e[1:]) + return (oewn_key.replace("-ap-", "'").replace("-sl-", "/").replace("-ex-", "!") + .replace("-cm-",",").replace("-cl-",":").replace("-pl-","+") + + "%" + r.replace(".", ":").replace("-sp-","_")) + else: + return (sk[KEY_PREFIX_LEN:].replace("__", "%").replace("-ap-", "'") + .replace("-sl-", "/").replace("-ex-", "!").replace("-cm-",",") + .replace("-cl-",":").replace("-pl-","+")) + + +