Fix bug in relative frequency computation during feature list extraction

SupervisedStylometry · Feb 20, 2024 · 3f84d04 · 3f84d04
1 parent d4c2b3c
commit 3f84d04
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 60 deletions.
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
@@ -4,15 +4,13 @@
 import nltk
 
 
-def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
+def count_words(text, feats = "words", n = 1):
     """
     Get word counts from  a text
     :param text: the source text
-    :param feat_list: a list of features to be selected
     :param feats: the type of feats (words, chars, etc.)
     :param n: the length of n-grams
-    :param relFreqs: whether to compute relative freqs
-    :return: feature frequencies in text
+    :return: features absolute frequencies in text as a counter
     """
 
     if feats == "words":
@@ -26,28 +24,23 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         if n > 1:
             tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
 
-    counts = {}
+    counts = Counter()
+    counts.update(tokens)
 
-    for t in tokens:
-        if t not in counts.keys():
-            counts[t] = 1
-
-        else:
-            counts[t] = counts[t] + 1
+    return counts
 
-    if relFreqs:
-        total = sum(counts.values())
-        for t in counts.keys():
-            if counts[t] > 0:
-                counts[t] = counts[t] / total
-            else:
-                counts[t] = 0
+def relative_frequencies(wordCounts):
+    """
+    For a counter of word counts, return the relative frequencies
+    :param wordCounts: a dictionary of word counts
+    :return a counter of word relative frequencies
+    """
 
-    if feat_list:
-        # and keep only the ones in the feature list
-        counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+    total = sum(wordCounts.values())
+    for t in wordCounts.keys():
+        wordCounts[t] = wordCounts[t] / total
 
-    return counts
+    return wordCounts
 
 
 def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
@@ -61,10 +54,13 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
     my_feats = Counter()
 
     for text in myTexts:
-        counts = count_words(text["text"], feats=feats, n=n, relFreqs=relFreqs)
+        counts = count_words(text["text"], feats=feats, n=n)
 
         my_feats.update(counts)
 
+    if relFreqs:
+        my_feats = relative_frequencies(my_feats)
+
     # sort them
     my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]
 
@@ -83,7 +79,16 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
     """
 
     for i in enumerate(myTexts):
-        myTexts[i[0]]["wordCounts"] = count_words(
-            myTexts[i[0]]["text"], feat_list=feat_list, feats=feats, n=n, relFreqs=relFreqs)
+
+        counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n)
+
+        if relFreqs:
+            counts = relative_frequencies(counts)
+
+        if feat_list:
+            # and keep only the ones in the feature list
+            counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+
+        myTexts[i[0]]["wordCounts"] = counts
 
     return myTexts
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -145,42 +145,28 @@ def test_detect_lang(self):
     # Now, lower level features,
     # from features_extract
     def test_counts(self):
+        # Scenario: given a text, extract a list of the features that appear in it, with their counts in absolute frequency
+        # GIVEN
         text = "the cat the dog the squirrel the cat the cat"
-        superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False)
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False),
-            {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=1, relFreqs=True),
-            {'the': 0.5, 'cat': 0.3, 'dog': 0.1, 'squirrel': 0.1}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=False),
-            {'the': 5, 'cat': 3}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=True),
-            {'the': 0.5, 'cat': 0.3}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=False),
-            {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=True),
-            {'the_cat': 3/9, 'cat_the': 2/9, 'the_dog': 1/9, 'dog_the': 1/9, 'the_squirrel': 1/9, 'squirrel_the': 1/9}
-        )
+        # WHEN
+        results = superstyl.preproc.features_extract.count_words(text, feats = "words", n = 1)
+        # THEN
+        expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}
+        self.assertEqual(results, expected)
 
+        # WHEN
+        results = superstyl.preproc.features_extract.count_words(text, feats="words", n=2)
+        # THEN
+        expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}
+        self.assertEqual(results, expected)
+
+        # GIVEN
         text = "the yo yo"
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="chars", n=3, relFreqs=False),
-            {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1}
-        )
-        self.assertEqual(
-            superstyl.preproc.features_extract.count_words(text, feat_list=['the'], feats="chars", n=3, relFreqs=True),
-            {'the': 1/7}
-        )
+        # WHEN
+        results = superstyl.preproc.features_extract.count_words(text, feats="chars", n=3)
+        # THEN
+        expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1}
+        self.assertEqual(results, expected)
 
     def test_max_sampling(self):
         # FEATURE: randomly select a maximum number of samples by author/class
@@ -214,8 +200,8 @@ def test_get_feature_list(self):
         results = superstyl.preproc.features_extract.get_feature_list(myTexts, feats="words", n=1, relFreqs=True)
         # THEN
         # TODO: BUG (sum of relative frequencies?)
-        expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12), ('Voici', 1/12),
-                    ('le', 1/12), ('texte', 1/12), ('also', 1/12)]
+        expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12),  ('also', 1/12), ('Voici', 1/12),
+                    ('le', 1/12), ('texte', 1/12)]
         self.assertEqual(results, expected)
 
         # WHEN