Commit 499c7f5

generate and rate wug outputs
Colin Wilson committed Mar 24, 2024
1 parent 6370870 commit 499c7f5
Showing 3 changed files with 199 additions and 193 deletions.
Binary file modified data/tiny_config.pkl
388 changes: 197 additions & 191 deletions mingen/01_prepare_data.py
@@ -38,201 +38,207 @@ def format_strings(dat, extra_seg_fixes=None):
return dat


def main():
# Select language and transcription conventions.
parser = configargparse.ArgParser(
config_file_parser_class=configargparse.YAMLConfigFileParser)
parser.add('--language',
type=str,
choices=['eng', 'eng2', 'eng3', 'deu', 'nld', 'tiny'],
default='tiny')
args = parser.parse_args()
LANGUAGE = args.language

    ddata = (Path.home() /
             'Languages/00Dictionaries/UniMorph/sigmorphon2021/2021Task0/part2')
if LANGUAGE == 'tiny':
ddata = Path.home() / 'Code/Python/mingen/data'

if LANGUAGE in ['eng', 'eng2', 'eng3']:
wordform_omit = None
wug_morphosyn = 'V;PST;'
# Simplify or split diphthongs, zap diacritics, fix unicode.
config.seg_fixes = {
'eɪ': 'e', 'oʊ': 'o', 'əʊ': 'o', 'aɪ': 'a ɪ', 'aʊ': 'a ʊ', \
'ɔɪ': 'ɔ ɪ', 'ɝ': 'ɛ ɹ', 'ˠ': '', 'm̩': 'm', 'n̩': 'n', 'l̩': 'l', \
'ɜ': 'ə', 'uːɪ': 'uː ɪ', 'ɔ̃': 'ɔ', 'ː': '', 'r': 'ɹ', 'ɡ': 'g'}
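    # Illustrative effect of these fixes (assuming they are applied to
    # space-segmented transcriptions inside format_strings): 'aɪ' is split
    # to 'a ɪ', 'ɝ' becomes 'ɛ ɹ', length marks 'ː' are deleted, and
    # 'r'/'ɡ' are normalized to 'ɹ'/'g'.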
# Albright & Hayes 2003 training data: split diphthongs and
# rhoticized vowels, ~Britishize æ -> ɑ, fix regular past.
albrighthayes_seg_fixes = \
{'tʃ': 't ʃ', 'dʒ': 'd ʒ', 'æ': 'ɑ', 'ɜ˞': 'ɛ ɹ', \
'ə˞': 'ɛ ɹ', 'ɚ': 'ɛ ɹ', '([td]) ə d$': '\\1 ɪ d'}
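    # The final entry is a regex substitution: a regular past such as
    # 'w ɑ n t ə d' would end up as 'w ɑ n t ɪ d' (illustrative, assuming
    # the fixes are applied with regex replacement).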
if LANGUAGE in ['eng2', 'eng3']:
config.seg_fixes |= albrighthayes_seg_fixes
config.remove_prefix = None

if LANGUAGE == 'deu':
wordform_omit = '[+]'
wug_morphosyn = '^V.PTCP;PST$'
# Split diphthongs, fix unicode
config.seg_fixes = {'ai̯': 'a i', 'au̯': 'a u', 'oi̯': 'o i', \
'iːə': 'iː ə', 'eːə': 'eː ə', 'ɛːə': 'ɛː ə', 'ɡ': 'g'}
config.remove_prefix = 'g ə'
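    # e.g. a participle transcribed 'g ə ʃ p iː l t' (gespielt) would have
    # its 'g ə' prefix stripped (illustrative; the stripping happens wherever
    # remove_prefix is consumed, outside this hunk).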

if LANGUAGE == 'nld':
wordform_omit = '[+]'
wug_morphosyn = 'V;PST;PL'
# Split diphthongs
config.seg_fixes = {'ɑʊ': 'ɑ ʊ', 'ɛɪ': 'ɛ ɪ', 'ʊɪ': 'ʊ ɪ', '[+]': ''}
config.remove_prefix = None

if LANGUAGE == 'tiny':
wordform_omit = None
wug_morphosyn = 'V;3;SG'
config.seg_fixes = {}
config.remove_prefix = None
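    # Summary of the settings above: eng/eng2/eng3 target the past tense
    # ('V;PST;'), deu the past participle ('^V.PTCP;PST$', dropping forms
    # that contain '+' and removing the 'g ə' prefix), nld the plural past
    # ('V;PST;PL', also dropping '+'), and tiny the 3sg present ('V;3;SG').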

# # # # # # # # # #
# Training data.
fdat = ddata / f'{LANGUAGE}.train'
if LANGUAGE == 'eng2':
fdat = Path('../albrighthayes2003') / 'CELEXFull_unimorph.tsv'
if LANGUAGE == 'eng3':
fdat = Path('../albrighthayes2003') / 'CELEXPrefixStrip_unimorph.tsv'
dat = pd.read_csv(fdat, sep='\t', \
names=['wordform1', 'wordform2', 'morphosyn',
'wordform1_orth', 'wordform2_orth'])
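    # Expected layout (no header row): two wordforms, a UniMorph tag, and,
    # where available, their orthographic forms; pandas fills the missing
    # trailing columns of three-field files with NaN.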

# Filter rows by characters in wordforms.
if wordform_omit is not None:
dat = dat[~(dat.wordform1.str.contains(wordform_omit))]
dat = dat[~(dat.wordform2.str.contains(wordform_omit))]
dat = dat.reset_index()
print(dat)

# Keep rows with wug-tested morphosyn (todo: could be list).
dat = dat[(dat.morphosyn.str.contains(wug_morphosyn))]
dat = dat.drop('morphosyn', axis=1)
dat = dat.drop_duplicates().reset_index()
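    # Note on the filter above: str.contains treats the pattern as a regex
    # by default, so 'V;PST;' matches any tag containing that substring,
    # while the anchored '^V.PTCP;PST$' (deu) must match the whole tag.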

# Format strings and save.
dat = format_strings(dat)
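    # format_strings (defined above) is assumed to produce the 'stem' and
    # 'output' columns that the segment inventory below is built from.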
dat.to_csv(config.save_dir / f'{LANGUAGE}_dat_train.tsv',
sep='\t',
index=False)
config.dat_train = dat
print('Training data')
print(dat)
print()

# # # # # # # # # #
# Wug dev set.
WUG_DEV = LANGUAGE
if LANGUAGE in ['eng2', 'eng3']:
WUG_DEV = 'eng'
fwug_dev = ddata / f'{WUG_DEV}.judgements.dev'
wug_dev = pd.read_csv(
fwug_dev,
sep='\t',
comment='#',
names=['wordform1', 'wordform2', 'morphosyn', 'human_rating'])
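    # comment='#' makes pandas ignore the remainder of any line after a '#',
    # so commented-out rows in the judgements file are skipped.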
wug_dev = wug_dev.drop('morphosyn', axis=1)

wug_dev = format_strings(wug_dev)
config.wug_dev = wug_dev
wug_dev.to_csv(config.save_dir / f'{LANGUAGE}_wug_dev.tsv',
sep='\t',
index=False)
print('Wug dev data')
print(wug_dev)
print()

# # # # # # # # # #
# Wug test set.
WUG_TST = LANGUAGE
if LANGUAGE in ['eng2', 'eng3']:
WUG_TST = 'eng'
fwug_tst = ddata / f'{WUG_TST}.judgements.tst'

wug_tst = pd.read_csv(fwug_tst,
sep='\t',
names=['wordform1', 'wordform2', 'morphosyn'])
wug_tst = wug_tst.drop('morphosyn', axis=1)
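    # Unlike the dev file, the test judgements are read without a
    # human_rating column; ratings for these items are presumably generated
    # downstream (cf. the commit message).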

wug_tst = format_strings(wug_tst)
config.wug_tst = wug_tst
wug_tst.to_csv(config.save_dir / f'{LANGUAGE}_wug_tst.tsv',
sep='\t',
index=False)
print('Wug test data')
print(wug_tst)
print()

# # # # # # # # # #
# Albright-Hayes wugs.
if LANGUAGE in ['eng', 'eng2', 'eng3']:
falbrighthayes = Path('../albrighthayes2003') / 'Wug_unimorph.tsv'
wug_albrighthayes = pd.read_csv(
falbrighthayes,
sep='\t',
comment='#',
names=['wordform1', 'wordform2', 'morphosyn', 'human_rating'])

wug_albrighthayes = format_strings( \
wug_albrighthayes,
extra_seg_fixes=albrighthayes_seg_fixes)
config.wug_albrighthayes = wug_albrighthayes
wug_albrighthayes.to_csv( \
config.save_dir / 'albrighthayes2003_wug.tsv',
sep='\t',
index=False)
print('Albright-Hayes wug data')
print(wug_albrighthayes)
print()

# # # # # # # # # #
# Phonological features
segments = set()
for stem in dat['stem']:
segments |= set(stem.split())
for output in dat['output']:
segments |= set(output.split())
segments -= {config.bos, config.eos}
    segments = sorted(segments)
print(f'Segments that appear in training data: '
f'{segments} (n = {len(segments)})')
print()
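    # Illustrative example: rows with stem 'b l ɪ ŋ' and output 'b l ɪ ŋ z'
    # would contribute the segments ['b', 'l', 'z', 'ŋ', 'ɪ'] (sorted by
    # codepoint).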

#tensormorph.config.feature_dir = Path.home(
#) / 'Code/Python/tensormorph_redup/ftrs'
#tensormorph.config.fdata = config.save_dir / f'{LANGUAGE}.ftr'
#fm = tensormorph.phon_features.import_features('hayes_features.csv', segments)
# Import features from file
feature_matrix = features.import_features(
Path.home() / 'Code/Python/transmorph/features/hayes_features.csv',
segments)

# Fix up features for mingen.
ftr_matrix = feature_matrix.ftr_matrix
    ftr_matrix = ftr_matrix.drop('sym', axis=1)  # 'sym' redundant with X (Sigma*)
config.phon_ftrs = ftr_matrix
config.ftr_names = list(ftr_matrix.columns.values)
config.syms = list(ftr_matrix.index)

# Map from symbols to feature-value dictionaries and feature vectors.
config.sym2ftrs = {}
config.sym2ftr_vec = {}
for i, sym in enumerate(config.syms):
ftrs = config.phon_ftrs.iloc[i, :].to_dict()
config.sym2ftrs[sym] = ftrs
config.sym2ftr_vec[sym] = tuple(ftrs.values())
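    # Illustrative lookups (actual feature names and values come from
    # hayes_features.csv): config.sym2ftrs['t'] might map to
    # {'syllabic': '-', 'voice': '-', ...}, and config.sym2ftr_vec['t']
    # to the corresponding tuple ('-', '-', ...).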

# # # # # # # # # #
# Save config.
config_save = {}
for key in dir(config):
if re.search('__', key):
continue
config_save[key] = getattr(config, key)

with open(config.save_dir / f'{LANGUAGE}_config.pkl', 'wb') as f:
pickle.dump(config_save, f)
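    # Downstream scripts can restore these settings with a plain pickle
    # load, e.g. (sketch):
    #   with open(config.save_dir / f'{LANGUAGE}_config.pkl', 'rb') as f:
    #       saved = pickle.load(f)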


if __name__ == "__main__":
main()
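
Usage sketch (assuming the bundled tiny data files are present under Code/Python/mingen/data):

    python mingen/01_prepare_data.py --language tiny

This writes tiny_dat_train.tsv, tiny_wug_dev.tsv, tiny_wug_tst.tsv, and tiny_config.pkl to config.save_dir, consistent with the modified data/tiny_config.pkl in this commit (assuming save_dir points at data/).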