Skip to content

Commit

Permalink
solve bug in relative frequencies in feature list extract
Browse files Browse the repository at this point in the history
  • Loading branch information
Jean-Baptiste-Camps committed Feb 20, 2024
1 parent d4c2b3c commit 3f84d04
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 60 deletions.
55 changes: 30 additions & 25 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@
import nltk


def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
def count_words(text, feats = "words", n = 1):
"""
Get word counts from a text
:param text: the source text
:param feat_list: a list of features to be selected
:param feats: the type of feats (words, chars, etc.)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
:return: feature frequencies in text
:return: absolute feature frequencies in the text, as a Counter
"""

if feats == "words":
Expand All @@ -26,28 +24,23 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
if n > 1:
tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]

counts = {}
counts = Counter()
counts.update(tokens)

for t in tokens:
if t not in counts.keys():
counts[t] = 1

else:
counts[t] = counts[t] + 1
return counts

if relFreqs:
total = sum(counts.values())
for t in counts.keys():
if counts[t] > 0:
counts[t] = counts[t] / total
else:
counts[t] = 0
def relative_frequencies(wordCounts):
"""
For a counter of word counts, return the relative frequencies
:param wordCounts: a dictionary of word counts
:return: a counter of word relative frequencies
"""

if feat_list:
# and keep only the ones in the feature list
counts = {f: counts[f] for f in feat_list if f in counts.keys()}
total = sum(wordCounts.values())
for t in wordCounts.keys():
wordCounts[t] = wordCounts[t] / total

return counts
return wordCounts


def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
Expand All @@ -61,10 +54,13 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
my_feats = Counter()

for text in myTexts:
counts = count_words(text["text"], feats=feats, n=n, relFreqs=relFreqs)
counts = count_words(text["text"], feats=feats, n=n)

my_feats.update(counts)

if relFreqs:
my_feats = relative_frequencies(my_feats)

# sort them
my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

Expand All @@ -83,7 +79,16 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
"""

for i in enumerate(myTexts):
myTexts[i[0]]["wordCounts"] = count_words(
myTexts[i[0]]["text"], feat_list=feat_list, feats=feats, n=n, relFreqs=relFreqs)

counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n)

if relFreqs:
counts = relative_frequencies(counts)

if feat_list:
# and keep only the ones in the feature list
counts = {f: counts[f] for f in feat_list if f in counts.keys()}

myTexts[i[0]]["wordCounts"] = counts

return myTexts
56 changes: 21 additions & 35 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,42 +145,28 @@ def test_detect_lang(self):
# Now, lower level features,
# from features_extract
def test_counts(self):
# Scenario: given a text, extract a list of the features that appear in it, with their counts in absolute frequency
# GIVEN
text = "the cat the dog the squirrel the cat the cat"
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False),
{'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=1, relFreqs=True),
{'the': 0.5, 'cat': 0.3, 'dog': 0.1, 'squirrel': 0.1}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=False),
{'the': 5, 'cat': 3}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=True),
{'the': 0.5, 'cat': 0.3}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=False),
{'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=True),
{'the_cat': 3/9, 'cat_the': 2/9, 'the_dog': 1/9, 'dog_the': 1/9, 'the_squirrel': 1/9, 'squirrel_the': 1/9}
)
# WHEN
results = superstyl.preproc.features_extract.count_words(text, feats = "words", n = 1)
# THEN
expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}
self.assertEqual(results, expected)

# WHEN
results = superstyl.preproc.features_extract.count_words(text, feats="words", n=2)
# THEN
expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}
self.assertEqual(results, expected)

# GIVEN
text = "the yo yo"
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="chars", n=3, relFreqs=False),
{'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1}
)
self.assertEqual(
superstyl.preproc.features_extract.count_words(text, feat_list=['the'], feats="chars", n=3, relFreqs=True),
{'the': 1/7}
)
# WHEN
results = superstyl.preproc.features_extract.count_words(text, feats="chars", n=3)
# THEN
expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1}
self.assertEqual(results, expected)

def test_max_sampling(self):
# FEATURE: randomly select a maximum number of samples by author/class
Expand Down Expand Up @@ -214,8 +200,8 @@ def test_get_feature_list(self):
results = superstyl.preproc.features_extract.get_feature_list(myTexts, feats="words", n=1, relFreqs=True)
# THEN
# TODO: BUG (sum of relative frequencies?)
expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12), ('Voici', 1/12),
('le', 1/12), ('texte', 1/12), ('also', 1/12)]
expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12), ('also', 1/12), ('Voici', 1/12),
('le', 1/12), ('texte', 1/12)]
self.assertEqual(results, expected)

# WHEN
Expand Down

0 comments on commit 3f84d04

Please sign in to comment.