Merge pull request #46 from SupervisedStylometry/glove
implementing glove
Jean-Baptiste-Camps authored Feb 16, 2024
2 parents dbe264b + e89c602 commit 296fc21
Showing 11 changed files with 136 additions and 94 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -21,6 +21,7 @@ mySVM.joblib
.ipynb_checkpoints/*
*.ipynb_checkpoints*
data
models
*.json
*.log
*.txt
48 changes: 19 additions & 29 deletions main.py
@@ -1,6 +1,7 @@
import superstyl.preproc.tuyau as tuy
import superstyl.preproc.features_extract as fex
from superstyl.preproc.text_count import count_process
import superstyl.preproc.embedding as embed
import pandas
import json
# from multiprocessing import Pool
@@ -11,10 +12,6 @@

# TODO: eliminate features that occur only n times ?
# Do the Moisl Selection ?
# Z-scores, etc. ?
# Vector-length normalisation ?

# TODO: free up memory as the script goes by deleting unnecessary objects

if __name__ == '__main__':

@@ -27,7 +24,6 @@
parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
parser.add_argument('-s', nargs='+', help="paths to files")
parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
@@ -44,8 +40,19 @@
parser.add_argument('--identify_lang', action='store_true',
help="if true, should the language of each text be guessed, using langdetect (default is False)",
default=False)
parser.add_argument('--embedding', action="store", help="optional path to a word2vec embedding in txt format to compute frequencies among a set of semantic neighbours (i.e., pseudo-paronyms)",
default=False)
parser.add_argument('--neighbouring_size', action="store", help="size of the semantic neighbourhood in the embedding (n closest neighbours)",
default=10, type=int)
args = parser.parse_args()

embeddedFreqs = False
if args.embedding:
print(".......loading embedding.......")
args.absolute_freqs = True # we need absolute freqs as a basis for embedded frequencies
model = embed.load_embeddings(args.embedding)
embeddedFreqs = True

print(".......loading texts.......")

if args.sampling:
@@ -54,7 +61,8 @@
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct,
keep_sym=args.keep_sym, max_samples=args.max_samples)

print(".......getting features.......")

@@ -80,23 +88,18 @@
feat_list = [m[0] for m in my_feats]
myTexts = fex.get_counts(myTexts, feat_list=feat_list, feats=args.t, n=args.n, relFreqs=not args.absolute_freqs)

if args.embedding:
print(".......embedding counts.......")
myTexts = embed.get_embedded_counts(myTexts, feat_list, model, topn=args.neighbouring_size)

unique_texts = [text["name"] for text in myTexts]

print(".......feeding data frame.......")

#feats = pandas.DataFrame(columns=list(feat_list), index=unique_texts)


# with Pool(args.p) as pool:
# print(args.p)
# target = zip(myTexts, [feat_list] * len(myTexts))
# with tqdm.tqdm(total=len(myTexts)) as pbar:
# for text, local_freqs in pool.map(count_process, target):

loc = {}

for t in tqdm.tqdm(myTexts):
text, local_freqs = count_process((t, feat_list))
text, local_freqs = count_process((t, feat_list), embeddedFreqs=embeddedFreqs)
loc[text["name"]] = local_freqs
# Saving metadata for later
metadata = pandas.DataFrame(columns=['author', 'lang'], index=unique_texts, data =
@@ -110,19 +113,6 @@
# Free some more
del loc

print(".......applying normalisations.......")
# And here is the place to implement selection and normalisation
if args.z_scores:
feat_stats = pandas.DataFrame(columns=["mean", "std"], index=list(feat_list))
feat_stats.loc[:,"mean"] = list(feats.mean(axis=0))
feat_stats.loc[:, "std"] = list(feats.std(axis=0))
feat_stats.to_csv("feat_stats.csv")

for col in list(feats.columns):
feats[col] = (feats[col] - feats[col].mean()) / feats[col].std()

# TODO: vector-length normalisation? -> No, in pipeline

print(".......saving results.......")
# frequence based selection
# WOW, pandas is a great tool, almost as good as using R
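
A hypothetical end-to-end sketch of the embedding path added above (not part of the commit): the embedding file name and the toy document are assumptions, and absolute counts are supplied directly rather than computed by get_counts.

import superstyl.preproc.embedding as embed
from superstyl.preproc.text_count import count_process

# toy document with precomputed absolute counts (illustrative values)
myTexts = [{"name": "Letter1", "aut": "Smith", "text": "this is the text", "lang": "en",
            "wordCounts": {"the": 1, "this": 1}}]
feat_list = ["the"]

model = embed.load_embeddings("vectors.txt")  # assumed word2vec-format text file
myTexts = embed.get_embedded_counts(myTexts, feat_list, model, topn=10)
# with embeddedFreqs=True, count_process reads the 'embedded' dict instead of 'wordCounts'
text, local_freqs = count_process((myTexts[0], feat_list), embeddedFreqs=True)
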
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ lxml>=4.9.1
nltk>=3.6.6
numpy>=1.26.4
pybind11>=2.8.1
scikit-learn>=1.2.1
scikit-learn>=1.3.0
scipy>=1.10.0
six>=1.16.0
tqdm>=4.64.1
@@ -15,3 +15,4 @@ argparse>=1.4.0
regex>=2022.10.31
matplotlib>=3.6.2
imbalanced-learn>=0.8.1
gensim>=4.3.2
26 changes: 0 additions & 26 deletions stats.py

This file was deleted.

67 changes: 67 additions & 0 deletions superstyl/preproc/embedding.py
@@ -0,0 +1,67 @@
import numpy as np
from scipy import spatial
import gensim.models

def load_embeddings(path):
"""
Load word2vec-format embeddings from a txt file and return a gensim KeyedVectors model.
:param path: the path to the embeddings txt file
:return: the loaded gensim KeyedVectors model
"""
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
return model

def find_similar_words(model, word, topn=10):
"""
Find and return the most similar words to a given word based on cosine similarity.
:param model: the embedding model
:param word: the word for which the closest neighbours are retrieved
:param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
:return a list of the topn closest words
"""
if word not in model:
return None

else:
return [s[0] for s in model.most_similar(word, topn=topn)]

# For tests
# myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}},
# {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en',
# 'wordCounts': {'the': 1, 'also': 1}}]
# feat_list = ['the']
# feat = "the"
def get_embedded_counts(myTexts, feat_list, model, topn=10):
"""
Replace absolute frequencies by frequencies relative to a given semantic neighbourhood
(i.e., a sort of relative frequency among 'paronyms'), using a GloVe embedding (cf. Eder, 2022).
:param myTexts: the document collection
:param feat_list: a list of features to be selected
:param model: the embeddings model
:param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
:return: the myTexts collection with, for each text, an 'embedded' dictionary holding these semantic relative frequencies
"""

for feat in feat_list:
similars = find_similar_words(model, feat, topn=topn)
if similars is None:
# IN THAT CASE, we do not include it in the embedded freqs
continue

else:
for i in enumerate(myTexts):

if feat in myTexts[i[0]]["wordCounts"].keys():

if "embedded" not in myTexts[i[0]].keys():
# then, initialise
myTexts[i[0]]["embedded"] = {}

total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()])
myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total

return myTexts




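A worked illustration of the computation in get_embedded_counts (purely hypothetical values; the neighbour list would normally come from find_similar_words):

# mirrors the inner computation of get_embedded_counts for a single feature
wordCounts = {"the": 2, "this": 1}   # absolute counts for one text
similars = ["this", "that"]          # assumed nearest neighbours of "the"
total = sum(wordCounts[s] for s in ["the"] + similars if s in wordCounts)  # 2 + 1 = 3
embedded_the = wordCounts["the"] / total  # 2 / 3: frequency of "the" within its neighbourhood
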
1 change: 1 addition & 0 deletions superstyl/preproc/features_extract.py
@@ -75,6 +75,7 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected
:param feats: the type of feats (words, chars, etc.)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
9 changes: 7 additions & 2 deletions superstyl/preproc/text_count.py
@@ -1,8 +1,13 @@

def count_process(args):
def count_process(args, embeddedFreqs=False):
if embeddedFreqs:
key = "embedded"
else:
key = "wordCounts"

text, feat_list = args
local_freqs = list([0] * len(feat_list))
for word, value in text["wordCounts"].items():
for word, value in text[key].items():
if word in feat_list:
local_freqs[feat_list.index(word)] = value
return text, local_freqs
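
A hypothetical call of the revised count_process (not in the commit), showing how the embeddedFreqs switch selects which dictionary the feature vector is read from:

from superstyl.preproc.text_count import count_process

text = {"name": "Letter1", "wordCounts": {"the": 2}, "embedded": {"the": 0.67}}
feat_list = ["the", "of"]
_, vec = count_process((text, feat_list), embeddedFreqs=True)  # vec == [0.67, 0]
_, raw = count_process((text, feat_list))                      # raw == [2, 0]
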
67 changes: 34 additions & 33 deletions superstyl/preproc/tuyau.py
@@ -92,8 +92,35 @@ def normalise(text, keep_punct=False, keep_sym=False):

return out

def max_sampling(myTexts, max_samples=10):
"""
Randomly select max_samples samples for authors or classes that have more than max_samples
:param myTexts: the input myTexts object
:param max_samples: the maximum number of samples for any class
:return: a myTexts object, with the resulting selection of samples
"""
autsCounts = dict()
for text in myTexts:
if text['aut'] not in autsCounts.keys():
autsCounts[text['aut']] = 1

else:
autsCounts[text['aut']] += 1

for autCount in autsCounts.items():
if autCount[1] > max_samples:
# get random selection
toBeSelected = [text for text in myTexts if text['aut'] == autCount[0]]
toBeSelected = random.sample(toBeSelected, k=max_samples)
# Great, now remove all texts from this author from our samples
myTexts = [text for text in myTexts if text['aut'] != autCount[0]]
# and now concat
myTexts = myTexts + toBeSelected

return myTexts


def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False, max_samples=10):
"""
Loads a collection of documents into a 'myTexts' object for further processing.
TODO: a proper class
@@ -102,11 +129,11 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_
:param format: format of the source files (implemented values: txt [default], xml)
:param keep_punct: whether or not to keep punctuation and caps.
:param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
:param max_samples: the maximum number of samples for any class
:return: a myTexts object
"""

myTexts = []
# langCerts = []

for path in paths:
name = path.split('/')[-1]
@@ -127,20 +154,9 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_

myTexts.append({"name": name, "aut": aut, "text": text, "lang": lang})

# if cert < 1:
# langCerts.append((lang, name, cert))

# directory = "train_txt/" + lang + "/" + aut + "/"

# if not os.path.exists(directory):
# os.makedirs(directory)

# with open(directory + name + ".txt", "w") as out:
# out.write(text)
if max_samples is not None:
myTexts = max_sampling(myTexts, max_samples=max_samples)

# with open("lang_certs.csv", 'w') as out:
# for line in langCerts:
# out.write("{}\t{}\t{}\t\n".format(line[0], line[1], float(line[2])))
return myTexts


@@ -219,8 +235,9 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
:param feature: type of tokens to extract (default is tokens, not lemmas or POS)
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
:param keep_punct: whether or not to keep punctuation and caps.
:param max_samples: maximum number of samples per author.
:param max_samples: maximum number of samples per author/class.
:param identify_lang: whether or not try to identify lang (default: False)
:return: a myTexts object
"""
myTexts = []
for path in paths:
@@ -246,22 +263,6 @@
myTexts.append({"name": name, "aut": aut, "text": text, "lang": lang})

if max_samples is not None:
autsCounts = dict()
for text in myTexts:
if text['aut'] not in autsCounts.keys():
autsCounts[text['aut']] = 1

else:
autsCounts[text['aut']] += 1

for autCount in autsCounts.items():
if autCount[1] > max_samples:
# get random selection
toBeSelected = [text for text in myTexts if text['aut'] is autCount[0]]
toBeSelected = random.sample(toBeSelected, k=max_samples)
# Great, now remove all texts from this author from our samples
myTexts = [text for text in myTexts if text['aut'] is not autCount[0]]
# and now concat
myTexts = myTexts + toBeSelected
myTexts = max_sampling(myTexts, max_samples=max_samples)

return myTexts
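
A hypothetical usage sketch of the factored-out max_sampling helper (not part of the commit; the toy texts are illustrative):

import superstyl.preproc.tuyau as tuy

myTexts = [{"name": f"Smith_{i}", "aut": "Smith", "text": "...", "lang": "en"} for i in range(5)]
myTexts += [{"name": "Jones_0", "aut": "Jones", "text": "...", "lang": "en"}]

sampled = tuy.max_sampling(myTexts, max_samples=2)
# keeps 2 randomly chosen texts by Smith plus the single text by Jones
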
2 changes: 1 addition & 1 deletion superstyl/svm.py
@@ -110,7 +110,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True

if kernel == "LinearSVC":
# try a faster one
estimators.append(('model', sk.LinearSVC(class_weight=cw)))
estimators.append(('model', sk.LinearSVC(class_weight=cw, dual="auto")))
# classif = sk.LinearSVC()

else:
2 changes: 2 additions & 0 deletions tests/test_main.py
@@ -98,6 +98,8 @@ def test_get_counts(self):
#TODO: a lot more tests


# TODO: tests for SVM, etc.
# Test all options of main commands, see if they are accepted or not

if __name__ == '__main__':
unittest.main()
4 changes: 2 additions & 2 deletions train_svm.py
@@ -33,8 +33,8 @@
default=False
)
parser.add_argument('--kernel', action='store',
help="type of kernel to use (default LinearSVC; possible alternatives, linear, polynomial, rbf, sigmoid)",
default="LinearSVC", choices=['LinearSVC', 'linear', 'polynomial', 'rbf', 'sigmoid'], type=str)
help="type of kernel to use (default and recommended choice is LinearSVC; possible alternatives are linear, sigmoid, rbf and poly, as per sklearn.svm.SVC)",
default="LinearSVC", choices=['LinearSVC', 'linear', 'sigmoid', 'rbf', 'poly'], type=str)
parser.add_argument('--final', action='store_true', help="final analysis on unknown dataset (no evaluation)?", default=False)
parser.add_argument('--get_coefs', action='store_true', help="switch to write to disk and plots the most important coefficients for the training feats for each class",
default=False)
