Commit: switched to gensim and w2vec

Jean-Baptiste-Camps committed Feb 15, 2024
1 parent 4e2ca36 commit 30c7a9c
Showing 2 changed files with 15 additions and 27 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -15,3 +15,4 @@ argparse>=1.4.0
 regex>=2022.10.31
 matplotlib>=3.6.2
 imbalanced-learn>=0.8.1
+gensim>=4.3.2
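
The pinned gensim version supplies the KeyedVectors loader used in the rewritten embedding.py below. As an illustrative check, not part of the commit, that an environment satisfies the new pin:

    import gensim

    # The pin asks for gensim>=4.3.2; this just surfaces the installed version.
    print(gensim.__version__)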
41 changes: 14 additions & 27 deletions superstyl/preproc/embedding.py
@@ -1,61 +1,49 @@
 import numpy as np
-from scipy import spatial
+import gensim.models

-def load_glove_embeddings(path):
+def load_embeddings(path):
     """
-    Load GloVe embeddings from a txt file and return a dictionary mapping words to vectors.
+    Load word2vec embeddings from a txt file and return a dictionary mapping words to vectors.
     :param path: the path to the embeddings txt file
     :return a dictionary of words and vectors
     """
-    # path = "glove.6B.100d.txt"
-    embeddings_dict = {}
-    with open(path, 'r', encoding='utf-8') as f:
-        for line in f:
-            values = line.split()
-            word = values[0]
-            vector = np.asarray(values[1:], "float32")
-            embeddings_dict[word] = vector
-    return embeddings_dict
+    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
+    return model

-def find_similar_words(embeddings_dict, word, topn=10):
+def find_similar_words(model, word, topn=10):
     """
     Find and return the most similar words to a given word based on cosine similarity.
-    :param embeddings_dict: the dictionnary of embeddings
+    :param model: the embedding model
     :param word: the word for which closest are retrieved
     :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
-    :return a list of the topn closest words (including original word itself)
+    :return a list of the topn closest words
     """
-    if word not in embeddings_dict:
+    if word not in model:
         return None

     else:
-        similarities = {}
-        target_embedding = embeddings_dict[word]
-        for other_word, other_embedding in embeddings_dict.items():
-            similarity = 1 - spatial.distance.cosine(target_embedding, other_embedding)
-            similarities[other_word] = similarity
-        sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
-        return [s[0] for s in sorted_similarities[0:topn]]
+        return [s[0] for s in model.most_similar(word, topn=topn)]

 # For tests
 # myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}},
 #            {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en',
 #             'wordCounts': {'the': 1, 'also': 1}}]
 # feat_list = ['the']
 # feat = "the"
-def get_embedded_counts(myTexts, feat_list, embeddings_dict, topn=10):
+def get_embedded_counts(myTexts, feat_list, model, topn=10):
     """
     Replace absolute frequencies by frequencies relative to a given semantic neighbouring
     (i.e., some sort of relative frequency among 'paronyms'), using a Glove embedding (cf. Eder, 2022).
     :param myTexts: the document collection
     :param feat_list: a list of features to be selected
-    :param embeddings_dict: the dictionnary of embeddings
+    :param model: the embeddings model
     :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
     :return: the myTexts collection with, for each text, a 'wordCounts' dictionary with said semantic relative frequencies
     """

     for feat in feat_list:
-        similars = find_similar_words(embeddings_dict, feat, topn=topn)
+        similars = find_similar_words(model, feat, topn=topn)
         if similars is None:
             # IN THAT CASE, we do not include it in the embedded freqs
             continue
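
For context, a minimal usage sketch of the gensim API this first hunk adopts; the file name vectors.txt and the query word are hypothetical, and the file is assumed to be in the word2vec text format (a "vocab_size dim" header line, then one "word v1 v2 ..." line per word):

    import gensim.models

    # Load plain-text word2vec vectors, as the new load_embeddings() does
    model = gensim.models.KeyedVectors.load_word2vec_format("vectors.txt", binary=False)

    # Membership test and neighbour lookup, mirroring find_similar_words()
    if "the" in model:
        # most_similar() returns (word, cosine similarity) pairs, best first
        print([w for w, _ in model.most_similar("the", topn=10)])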
@@ -69,10 +57,9 @@ def get_embedded_counts(myTexts, feat_list, embeddings_dict, topn=10):
             # then, initialise
             myTexts[i[0]]["embedded"] = {}

-        total = sum([myTexts[i[0]]["wordCounts"][s] for s in similars if s in myTexts[i[0]]["wordCounts"].keys()])
+        total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()])
         myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total


     return myTexts
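
The one-line change in this second hunk adds the target word itself to the denominator, so its embedded frequency is computed relative to the word plus its neighbours rather than the neighbours alone; it also prevents a zero denominator when the target word occurs in a text but none of its neighbours do. A sketch with made-up counts:

    # Hypothetical counts illustrating the [feat]+similars change
    wordCounts = {"the": 10, "this": 5, "a": 5}
    feat, similars = "the", ["this", "a"]

    old_total = sum(wordCounts[s] for s in similars if s in wordCounts)           # 5 + 5 = 10
    new_total = sum(wordCounts[s] for s in [feat] + similars if s in wordCounts)  # 10 + 5 + 5 = 20

    print(wordCounts[feat] / old_total)  # 1.0 under the old code
    print(wordCounts[feat] / new_total)  # 0.5 with feat included in the total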

