diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index 8fda4661..99641be6 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -4,15 +4,13 @@ import nltk -def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False): +def count_words(text, feats = "words", n = 1): """ Get word counts from a text :param text: the source text - :param feat_list: a list of features to be selected :param feats: the type of feats (words, chars, etc.) :param n: the length of n-grams - :param relFreqs: whether to compute relative freqs - :return: feature frequencies in text + :return: features absolute frequencies in text as a counter """ if feats == "words": @@ -26,28 +24,23 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False): if n > 1: tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))] - counts = {} + counts = Counter() + counts.update(tokens) - for t in tokens: - if t not in counts.keys(): - counts[t] = 1 - - else: - counts[t] = counts[t] + 1 + return counts - if relFreqs: - total = sum(counts.values()) - for t in counts.keys(): - if counts[t] > 0: - counts[t] = counts[t] / total - else: - counts[t] = 0 +def relative_frequencies(wordCounts): + """ + For a counter of word counts, return the relative frequencies + :param wordCounts: a dictionary of word counts + :return a counter of word relative frequencies + """ - if feat_list: - # and keep only the ones in the feature list - counts = {f: counts[f] for f in feat_list if f in counts.keys()} + total = sum(wordCounts.values()) + for t in wordCounts.keys(): + wordCounts[t] = wordCounts[t] / total - return counts + return wordCounts def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): @@ -61,10 +54,13 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): my_feats = Counter() for text in myTexts: - counts = count_words(text["text"], feats=feats, n=n, relFreqs=relFreqs) + counts = count_words(text["text"], feats=feats, n=n) my_feats.update(counts) + if relFreqs: + my_feats = relative_frequencies(my_feats) + # sort them my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)] @@ -83,7 +79,16 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False): """ for i in enumerate(myTexts): - myTexts[i[0]]["wordCounts"] = count_words( - myTexts[i[0]]["text"], feat_list=feat_list, feats=feats, n=n, relFreqs=relFreqs) + + counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n) + + if relFreqs: + counts = relative_frequencies(counts) + + if feat_list: + # and keep only the ones in the feature list + counts = {f: counts[f] for f in feat_list if f in counts.keys()} + + myTexts[i[0]]["wordCounts"] = counts return myTexts diff --git a/tests/test_main.py b/tests/test_main.py index d95a19c0..76cf29f7 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -145,42 +145,28 @@ def test_detect_lang(self): # Now, lower level features, # from features_extract def test_counts(self): + # Scenario: given a text, extract a list of the features that appear in it, with their counts in absolute frequency + # GIVEN text = "the cat the dog the squirrel the cat the cat" - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False), - {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=1, relFreqs=True), - {'the': 0.5, 'cat': 0.3, 'dog': 0.1, 'squirrel': 0.1} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=False), - {'the': 5, 'cat': 3} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=True), - {'the': 0.5, 'cat': 0.3} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=False), - {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=True), - {'the_cat': 3/9, 'cat_the': 2/9, 'the_dog': 1/9, 'dog_the': 1/9, 'the_squirrel': 1/9, 'squirrel_the': 1/9} - ) + # WHEN + results = superstyl.preproc.features_extract.count_words(text, feats = "words", n = 1) + # THEN + expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} + self.assertEqual(results, expected) + # WHEN + results = superstyl.preproc.features_extract.count_words(text, feats="words", n=2) + # THEN + expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} + self.assertEqual(results, expected) + + # GIVEN text = "the yo yo" - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="chars", n=3, relFreqs=False), - {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} - ) - self.assertEqual( - superstyl.preproc.features_extract.count_words(text, feat_list=['the'], feats="chars", n=3, relFreqs=True), - {'the': 1/7} - ) + # WHEN + results = superstyl.preproc.features_extract.count_words(text, feats="chars", n=3) + # THEN + expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} + self.assertEqual(results, expected) def test_max_sampling(self): # FEATURE: randomly select a maximum number of samples by author/class @@ -214,8 +200,8 @@ def test_get_feature_list(self): results = superstyl.preproc.features_extract.get_feature_list(myTexts, feats="words", n=1, relFreqs=True) # THEN # TODO: BUG (sum of relative frequencies?) - expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12), ('Voici', 1/12), - ('le', 1/12), ('texte', 1/12), ('also', 1/12)] + expected = [('This', 2/12), ('is', 2/12), ('the', 2/12), ('text', 2/12), ('also', 1/12), ('Voici', 1/12), + ('le', 1/12), ('texte', 1/12)] self.assertEqual(results, expected) # WHEN