diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
index 2d3d93d..7eef8e7 100644
--- a/pypln/backend/celery_task.py
+++ b/pypln/backend/celery_task.py
@@ -34,6 +34,7 @@
mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
database = mongo_client[config.MONGODB_DBNAME]
document_collection = database[config.MONGODB_COLLECTION]
+corpora_collection = database[config.MONGODB_CORPORA_COLLECTION]
class DocumentNotFound(Exception):
pass
@@ -69,3 +70,34 @@ def process(self, document):
and must return a dictionary with the keys to be saved in the database.
"""
raise NotImplementedError
+
+class PyPLNCorpusTask(Task):
+ """
+ This is the base class for a Corpus task. It is very similar to
+ `PyPLNTask`, but it needs a corpus_id and a list of document_ids.
+ """
+
+ def run(self, corpus_id, document_ids):
+ """
+ This method is called by Celery, and should not be overridden.
+ It will call the `process` method with a list of dictionaries
+ containing all the documents and will update de database with results.
+ """
+ documents = document_collection.find({"_id": {"$in": document_ids}})
+ if documents is None:
+ self.retry(exc=DocumentNotFound('Documents with ids "{}" '
+ 'not found in database'.format(document_ids)))
+ result = self.process(documents)
+ corpora_collection.update({"corpus_id": corpus_id}, {"$set": result},
+ upsert=True)
+ return corpus_id, document_ids
+
+ def process(self, documents):
+ """
+ This method should be implemented by subclasses. It is responsible for
+ performing the analysis itself. It will receive a list of dictionaries
+ as a paramenter (containing all the documents and the analysis that are
+ ready for it) and must return a dictionary with the new keys to be
+ saved in the corpora analysis collection.
+ """
+ raise NotImplementedError
diff --git a/pypln/backend/config.py b/pypln/backend/config.py
index ec1d48e..e6250b2 100644
--- a/pypln/backend/config.py
+++ b/pypln/backend/config.py
@@ -31,6 +31,8 @@ def split_uris(uri):
cast=split_uris)
MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis')
+MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION',
+ default='corpora_analysis')
ELASTICSEARCH_CONFIG = {
'hosts': config('ELASTICSEARCH_HOSTS',
diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py
index 0125bde..a755fc4 100644
--- a/pypln/backend/workers/__init__.py
+++ b/pypln/backend/workers/__init__.py
@@ -29,8 +29,9 @@
from palavras_semantic_tagger import SemanticTagger
from word_cloud import WordCloud
from elastic_indexer import ElasticIndexer
+from corpus_freqdist import CorpusFreqDist
__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
'Bigrams', 'PalavrasRaw', 'Lemmatizer', 'NounPhrase', 'SemanticTagger',
- 'WordCloud', 'ElasticIndexer']
+ 'WordCloud', 'ElasticIndexer', 'CorpusFreqDist']
diff --git a/pypln/backend/workers/corpus_freqdist.py b/pypln/backend/workers/corpus_freqdist.py
new file mode 100644
index 0000000..217aceb
--- /dev/null
+++ b/pypln/backend/workers/corpus_freqdist.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see .
+from pypln.backend.celery_task import PyPLNCorpusTask
+
+from collections import Counter
+
+class CorpusFreqDist(PyPLNCorpusTask):
+
+ def process(self, documents):
+ result = Counter()
+ for document in documents:
+ d = {}
+ for word, count in document['freqdist']:
+ d[word] = count
+ result += Counter(d)
+ return {'freqdist': result.most_common()}
diff --git a/tests/test_celery_corpus_task.py b/tests/test_celery_corpus_task.py
new file mode 100644
index 0000000..c37d22a
--- /dev/null
+++ b/tests/test_celery_corpus_task.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see .
+from pypln.backend.celery_task import PyPLNCorpusTask
+from mock import MagicMock
+from utils import TaskTest
+
+
+class FakeCorpusTask(PyPLNCorpusTask):
+ def process(self, documents):
+ return {'result': sum([d["input"] for d in documents])}
+
+class TestCeleryCorpusTask(TaskTest):
+ def test_task_should_only_get_the_correct_documents(self):
+ # This is just preparing the expected input in the database
+ wrong_doc_id = self.collection.insert({'input': 999}, w=1)
+ correct_doc_id_1 = self.collection.insert({'input': 1}, w=1)
+ correct_doc_id_2 = self.collection.insert({'input': 1}, w=1)
+ fake_corpus_id = 1
+
+ FakeCorpusTask.process = MagicMock(return_value={'result': 2})
+
+ corpus_task = FakeCorpusTask()
+
+ corpus_task.delay(fake_corpus_id, [correct_doc_id_1, correct_doc_id_2])
+
+ corpus_task.process.assert_called()
+
+ # We need to compare the call args because it's called with a mongo
+ # cursor, not a list.
+ # We're getting [0][0] because we want the args (not kwargs) for the
+ # first call to the method.
+ call_args = list(corpus_task.process.call_args[0][0])
+ for arg in call_args:
+ self.assertEqual(arg['input'], 1)
+
+ def test_task_is_saving_the_result_to_mongo_with_the_corpus_id(self):
+ expected_result = 42
+ doc_id_1 = self.collection.insert({'input': 21}, w=1)
+ doc_id_2 = self.collection.insert({'input': 21}, w=1)
+ fake_corpus_id = 1
+
+ FakeCorpusTask().delay(fake_corpus_id, [doc_id_1, doc_id_2])
+
+ resulting_corpus_analysis = self.corpora_collection.find_one(
+ {'corpus_id': fake_corpus_id})['result']
+
+ self.assertEqual(resulting_corpus_analysis, expected_result)
diff --git a/tests/test_worker_corpus_freqdist.py b/tests/test_worker_corpus_freqdist.py
new file mode 100644
index 0000000..68fc0c9
--- /dev/null
+++ b/tests/test_worker_corpus_freqdist.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see .
+from pypln.backend.workers import CorpusFreqDist
+from utils import TaskTest
+
+
+class TestCorpusFreqDistWorker(TaskTest):
+ def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self):
+
+ freqdist_1 = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1],
+ [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]]
+
+ freqdist_2 = [[u'the', 2], [u'brown', 1], [u'lazy', 1],
+ [u'over', 1], [u'fox', 1], [u'dog', 1], [u'.', 1],
+ [u'quick', 1], [u'jumps', 1]]
+
+ corpus_fd = [(u'the', 4), (u'is', 2), (u'.', 2), (u'blue', 1),
+ (u'brown', 1), (u'lazy', 1), (u'fox', 1), (u'jumps', 1),
+ (u'sun', 1), (u'dog', 1), (u'sky', 1), (u',', 1),
+ (u'yellow', 1), (u'quick', 1), (u'over', 1)]
+
+ result = CorpusFreqDist().process([{'freqdist': freqdist_1},
+ {'freqdist': freqdist_2}])
+
+ resulting_corpus_fd = result['freqdist']
+
+ self.assertEqual(resulting_corpus_fd, corpus_fd)
diff --git a/tests/utils.py b/tests/utils.py
index a2168c2..a8f4845 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -34,6 +34,7 @@ def setUp(self):
app.conf.update(CELERY_ALWAYS_EAGER=True)
self.db = pymongo.Connection()[self.db_name]
self.collection = self.db[config.MONGODB_COLLECTION]
+ self.corpora_collection = self.db[config.MONGODB_CORPORA_COLLECTION]
def tearDown(self):
self.collection.remove({})