Merge pull request #46 from SupervisedStylometry/glove
implementing glove
Jean-Baptiste-Camps authored Feb 16, 2024
2 parents dbe264b + e89c602 commit 296fc21
Showing 11 changed files with 136 additions and 94 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -21,6 +21,7 @@ mySVM.joblib
.ipynb_checkpoints/*
*.ipynb_checkpoints*
data
models
*.json
*.log
*.txt
48 changes: 19 additions & 29 deletions main.py
@@ -1,6 +1,7 @@
import superstyl.preproc.tuyau as tuy
import superstyl.preproc.features_extract as fex
from superstyl.preproc.text_count import count_process
import superstyl.preproc.embedding as embed
import pandas
import json
# from multiprocessing import Pool
@@ -11,10 +12,6 @@

# TODO: eliminate features that occur only n times ?
# Do the Moisl Selection ?
# Z-scores, etc. ?
# Vector-length normalisation ?

# TODO: free up memory as the script goes by deleting unnecessary objects

if __name__ == '__main__':

@@ -27,7 +24,6 @@
parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
parser.add_argument('-s', nargs='+', help="paths to files")
parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
@@ -44,8 +40,19 @@
parser.add_argument('--identify_lang', action='store_true',
help="if true, should the language of each text be guessed, using langdetect (default is False)",
default=False)
parser.add_argument('--embedding', action="store", help="optional path to a word2vec embedding in txt format to compute frequencies among a set of semantic neighbours (i.e., pseudo-paronyms)",
default=False)
parser.add_argument('--neighbouring_size', action="store", help="size of the semantic neighbourhood in the embedding (n closest neighbours)",
default=10, type=int)
args = parser.parse_args()

embeddedFreqs = False
if args.embedding:
print(".......loading embedding.......")
args.absolute_freqs = True # we need absolute freqs as a basis for embedded frequencies
model = embed.load_embeddings(args.embedding)
embeddedFreqs = True

print(".......loading texts.......")

if args.sampling:
@@ -54,7 +61,8 @@
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct,
keep_sym=args.keep_sym, max_samples=args.max_samples)

print(".......getting features.......")

@@ -80,23 +88,18 @@
feat_list = [m[0] for m in my_feats]
myTexts = fex.get_counts(myTexts, feat_list=feat_list, feats=args.t, n=args.n, relFreqs=not args.absolute_freqs)

if args.embedding:
print(".......embedding counts.......")
myTexts = embed.get_embedded_counts(myTexts, feat_list, model, topn=args.neighbouring_size)

unique_texts = [text["name"] for text in myTexts]

print(".......feeding data frame.......")

#feats = pandas.DataFrame(columns=list(feat_list), index=unique_texts)


# with Pool(args.p) as pool:
# print(args.p)
# target = zip(myTexts, [feat_list] * len(myTexts))
# with tqdm.tqdm(total=len(myTexts)) as pbar:
# for text, local_freqs in pool.map(count_process, target):

loc = {}

for t in tqdm.tqdm(myTexts):
text, local_freqs = count_process((t, feat_list))
text, local_freqs = count_process((t, feat_list), embeddedFreqs=embeddedFreqs)
loc[text["name"]] = local_freqs
# Saving metadata for later
metadata = pandas.DataFrame(columns=['author', 'lang'], index=unique_texts, data =
@@ -110,19 +113,6 @@
# Free some more
del loc

print(".......applying normalisations.......")
# And here is the place to implement selection and normalisation
if args.z_scores:
feat_stats = pandas.DataFrame(columns=["mean", "std"], index=list(feat_list))
feat_stats.loc[:,"mean"] = list(feats.mean(axis=0))
feat_stats.loc[:, "std"] = list(feats.std(axis=0))
feat_stats.to_csv("feat_stats.csv")

for col in list(feats.columns):
feats[col] = (feats[col] - feats[col].mean()) / feats[col].std()

# TODO: vector-length normalisation? -> No, in pipeline

print(".......saving results.......")
# frequence based selection
# WOW, pandas is a great tool, almost as good as using R
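
A hypothetical end-to-end sketch of the embedding path added above (not part of the commit): the embedding file name and the toy document are assumptions, and absolute counts are supplied directly rather than computed by get_counts.

import superstyl.preproc.embedding as embed
from superstyl.preproc.text_count import count_process

# toy document with precomputed absolute counts (illustrative values)
myTexts = [{"name": "Letter1", "aut": "Smith", "text": "this is the text", "lang": "en",
            "wordCounts": {"the": 1, "this": 1}}]
feat_list = ["the"]

model = embed.load_embeddings("vectors.txt")  # assumed word2vec-format text file
myTexts = embed.get_embedded_counts(myTexts, feat_list, model, topn=10)
# with embeddedFreqs=True, count_process reads the 'embedded' dict instead of 'wordCounts'
text, local_freqs = count_process((myTexts[0], feat_list), embeddedFreqs=True)
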
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ lxml>=4.9.1
nltk>=3.6.6
numpy>=1.26.4
pybind11>=2.8.1
scikit-learn>=1.2.1
scikit-learn>=1.3.0
scipy>=1.10.0
six>=1.16.0
tqdm>=4.64.1
@@ -15,3 +15,4 @@ argparse>=1.4.0
regex>=2022.10.31
matplotlib>=3.6.2
imbalanced-learn>=0.8.1
gensim>=4.3.2
26 changes: 0 additions & 26 deletions stats.py

This file was deleted.

67 changes: 67 additions & 0 deletions superstyl/preproc/embedding.py
@@ -0,0 +1,67 @@
import numpy as np
from scipy import spatial
import gensim.models

def load_embeddings(path):
"""
Load word2vec-format embeddings from a txt file and return a gensim KeyedVectors model.
:param path: the path to the embeddings txt file
:return: the loaded gensim KeyedVectors model
"""
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
return model

def find_similar_words(model, word, topn=10):
"""
Find and return the most similar words to a given word based on cosine similarity.
:param model: the embedding model
:param word: the word for which the closest neighbours are retrieved
:param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
:return a list of the topn closest words
"""
if word not in model:
return None

else:
return [s[0] for s in model.most_similar(word, topn=topn)]

# For tests
# myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}},
# {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en',
# 'wordCounts': {'the': 1, 'also': 1}}]
# feat_list = ['the']
# feat = "the"
def get_embedded_counts(myTexts, feat_list, model, topn=10):
"""
Replace absolute frequencies by frequencies relative to a given semantic neighbourhood
(i.e., a sort of relative frequency among 'paronyms'), using a GloVe embedding (cf. Eder, 2022).
:param myTexts: the document collection
:param feat_list: a list of features to be selected
:param model: the embeddings model
:param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
:return: the myTexts collection with, for each text, an 'embedded' dictionary holding these semantic relative frequencies
"""

for feat in feat_list:
similars = find_similar_words(model, feat, topn=topn)
if similars is None:
# IN THAT CASE, we do not include it in the embedded freqs
continue

else:
for i in enumerate(myTexts):

if feat in myTexts[i[0]]["wordCounts"].keys():

if "embedded" not in myTexts[i[0]].keys():
# then, initialise
myTexts[i[0]]["embedded"] = {}

total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()])
myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total

return myTexts




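A worked illustration of the computation in get_embedded_counts (purely hypothetical values; the neighbour list would normally come from find_similar_words):

# mirrors the inner computation of get_embedded_counts for a single feature
wordCounts = {"the": 2, "this": 1}   # absolute counts for one text
similars = ["this", "that"]          # assumed nearest neighbours of "the"
total = sum(wordCounts[s] for s in ["the"] + similars if s in wordCounts)  # 2 + 1 = 3
embedded_the = wordCounts["the"] / total  # 2 / 3: frequency of "the" within its neighbourhood
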
1 change: 1 addition & 0 deletions superstyl/preproc/features_extract.py
@@ -75,6 +75,7 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected
:param feats: the type of feats (words, chars, etc.)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
9 changes: 7 additions & 2 deletions superstyl/preproc/text_count.py
@@ -1,8 +1,13 @@

def count_process(args):
def count_process(args, embeddedFreqs=False):
if embeddedFreqs:
key = "embedded"
else:
key = "wordCounts"

text, feat_list = args
local_freqs = list([0] * len(feat_list))
for word, value in text["wordCounts"].items():
for word, value in text[key].items():
if word in feat_list:
local_freqs[feat_list.index(word)] = value
return text, local_freqs
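
A hypothetical call of the revised count_process (not in the commit), showing how the embeddedFreqs switch selects which dictionary the feature vector is read from:

from superstyl.preproc.text_count import count_process

text = {"name": "Letter1", "wordCounts": {"the": 2}, "embedded": {"the": 0.67}}
feat_list = ["the", "of"]
_, vec = count_process((text, feat_list), embeddedFreqs=True)  # vec == [0.67, 0]
_, raw = count_process((text, feat_list))                      # raw == [2, 0]
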
67 changes: 34 additions & 33 deletions superstyl/preproc/tuyau.py
@@ -92,8 +92,35 @@ def normalise(text, keep_punct=False, keep_sym=False):

return out

def max_sampling(myTexts, max_samples=10):
"""
Randomly select max_samples samples for authors or classes that have more than max_samples
:param myTexts: the input myTexts object
:param max_samples: the maximum number of samples for any class
:return: a myTexts object, with the resulting selection of samples
"""
autsCounts = dict()
for text in myTexts:
if text['aut'] not in autsCounts.keys():
autsCounts[text['aut']] = 1

else:
autsCounts[text['aut']] += 1

for autCount in autsCounts.items():
if autCount[1] > max_samples:
# get random selection
toBeSelected = [text for text in myTexts if text['aut'] == autCount[0]]
toBeSelected = random.sample(toBeSelected, k=max_samples)
# Great, now remove all texts from this author from our samples
myTexts = [text for text in myTexts if text['aut'] != autCount[0]]
# and now concat
myTexts = myTexts + toBeSelected

return myTexts


def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False, max_samples=10):
"""
Loads a collection of documents into a 'myTexts' object for further processing.
TODO: a proper class
@@ -102,11 +129,11 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_
:param format: format of the source files (implemented values: txt [default], xml)
:param keep_punct: whether or not to keep punctuation and caps.
:param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
:param max_samples: the maximum number of samples for any class
:return: a myTexts object
"""

myTexts = []
# langCerts = []

for path in paths:
name = path.split('/')[-1]
@@ -127,20 +154,9 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_

myTexts.append({"name": name, "aut": aut, "text": text, "lang": lang})

# if cert < 1:
# langCerts.append((lang, name, cert))

# directory = "train_txt/" + lang + "/" + aut + "/"

# if not os.path.exists(directory):
# os.makedirs(directory)

# with open(directory + name + ".txt", "w") as out:
# out.write(text)
if max_samples is not None:
myTexts = max_sampling(myTexts, max_samples=max_samples)

# with open("lang_certs.csv", 'w') as out:
# for line in langCerts:
# out.write("{}\t{}\t{}\t\n".format(line[0], line[1], float(line[2])))
return myTexts


@@ -219,8 +235,9 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
:param feature: type of tokens to extract (default is tokens, not lemmas or POS)
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
:param keep_punct: whether or not to keep punctuation and caps.
:param max_samples: maximum number of samples per author.
:param max_samples: maximum number of samples per author/class.
:param identify_lang: whether or not try to identify lang (default: False)
:return: a myTexts object
"""
myTexts = []
for path in paths:
@@ -246,22 +263,6 @@
myTexts.append({"name": name, "aut": aut, "text": text, "lang": lang})

if max_samples is not None:
autsCounts = dict()
for text in myTexts:
if text['aut'] not in autsCounts.keys():
autsCounts[text['aut']] = 1

else:
autsCounts[text['aut']] += 1

for autCount in autsCounts.items():
if autCount[1] > max_samples:
# get random selection
toBeSelected = [text for text in myTexts if text['aut'] is autCount[0]]
toBeSelected = random.sample(toBeSelected, k=max_samples)
# Great, now remove all texts from this author from our samples
myTexts = [text for text in myTexts if text['aut'] is not autCount[0]]
# and now concat
myTexts = myTexts + toBeSelected
myTexts = max_sampling(myTexts, max_samples=max_samples)

return myTexts
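
A hypothetical usage sketch of the factored-out max_sampling helper (not part of the commit; the toy texts are illustrative):

import superstyl.preproc.tuyau as tuy

myTexts = [{"name": f"Smith_{i}", "aut": "Smith", "text": "...", "lang": "en"} for i in range(5)]
myTexts += [{"name": "Jones_0", "aut": "Jones", "text": "...", "lang": "en"}]

sampled = tuy.max_sampling(myTexts, max_samples=2)
# keeps 2 randomly chosen texts by Smith plus the single text by Jones
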
2 changes: 1 addition & 1 deletion superstyl/svm.py
@@ -110,7 +110,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True

if kernel == "LinearSVC":
# try a faster one
estimators.append(('model', sk.LinearSVC(class_weight=cw)))
estimators.append(('model', sk.LinearSVC(class_weight=cw, dual="auto")))
# classif = sk.LinearSVC()

else:
2 changes: 2 additions & 0 deletions tests/test_main.py
@@ -98,6 +98,8 @@ def test_get_counts(self):
#TODO: a lot more tests


# TODO: tests for SVM, etc.
# Test all options of main commands, see if they are accepted or not

if __name__ == '__main__':
unittest.main()
4 changes: 2 additions & 2 deletions train_svm.py
@@ -33,8 +33,8 @@
default=False
)
parser.add_argument('--kernel', action='store',
help="type of kernel to use (default LinearSVC; possible alternatives, linear, polynomial, rbf, sigmoid)",
default="LinearSVC", choices=['LinearSVC', 'linear', 'polynomial', 'rbf', 'sigmoid'], type=str)
help="type of kernel to use (default and recommended choice is LinearSVC; possible alternatives are linear, sigmoid, rbf and poly, as per sklearn.svm.SVC)",
default="LinearSVC", choices=['LinearSVC', 'linear', 'sigmoid', 'rbf', 'poly'], type=str)
parser.add_argument('--final', action='store_true', help="final analysis on unknown dataset (no evaluation)?", default=False)
parser.add_argument('--get_coefs', action='store_true', help="switch to write to disk and plots the most important coefficients for the training feats for each class",
default=False)
