diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py index 2d3d93d..7eef8e7 100644 --- a/pypln/backend/celery_task.py +++ b/pypln/backend/celery_task.py @@ -34,6 +34,7 @@ mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS) database = mongo_client[config.MONGODB_DBNAME] document_collection = database[config.MONGODB_COLLECTION] +corpora_collection = database[config.MONGODB_CORPORA_COLLECTION] class DocumentNotFound(Exception): pass @@ -69,3 +70,34 @@ def process(self, document): and must return a dictionary with the keys to be saved in the database. """ raise NotImplementedError + +class PyPLNCorpusTask(Task): + """ + This is the base class for a Corpus task. It is very similar to + `PyPLNTask`, but it needs a corpus_id and a list of document_ids. + """ + + def run(self, corpus_id, document_ids): + """ + This method is called by Celery, and should not be overridden. + It will call the `process` method with a list of dictionaries + containing all the documents and will update de database with results. + """ + documents = document_collection.find({"_id": {"$in": document_ids}}) + if documents is None: + self.retry(exc=DocumentNotFound('Documents with ids "{}" ' + 'not found in database'.format(document_ids))) + result = self.process(documents) + corpora_collection.update({"corpus_id": corpus_id}, {"$set": result}, + upsert=True) + return corpus_id, document_ids + + def process(self, documents): + """ + This method should be implemented by subclasses. It is responsible for + performing the analysis itself. It will receive a list of dictionaries + as a paramenter (containing all the documents and the analysis that are + ready for it) and must return a dictionary with the new keys to be + saved in the corpora analysis collection. + """ + raise NotImplementedError diff --git a/pypln/backend/config.py b/pypln/backend/config.py index ec1d48e..e6250b2 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -31,6 +31,8 @@ def split_uris(uri): cast=split_uris) MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln') MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis') +MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION', + default='corpora_analysis') ELASTICSEARCH_CONFIG = { 'hosts': config('ELASTICSEARCH_HOSTS', diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 0125bde..a755fc4 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -29,8 +29,9 @@ from palavras_semantic_tagger import SemanticTagger from word_cloud import WordCloud from elastic_indexer import ElasticIndexer +from corpus_freqdist import CorpusFreqDist __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', 'Bigrams', 'PalavrasRaw', 'Lemmatizer', 'NounPhrase', 'SemanticTagger', - 'WordCloud', 'ElasticIndexer'] + 'WordCloud', 'ElasticIndexer', 'CorpusFreqDist'] diff --git a/pypln/backend/workers/corpus_freqdist.py b/pypln/backend/workers/corpus_freqdist.py new file mode 100644 index 0000000..217aceb --- /dev/null +++ b/pypln/backend/workers/corpus_freqdist.py @@ -0,0 +1,32 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . +from pypln.backend.celery_task import PyPLNCorpusTask + +from collections import Counter + +class CorpusFreqDist(PyPLNCorpusTask): + + def process(self, documents): + result = Counter() + for document in documents: + d = {} + for word, count in document['freqdist']: + d[word] = count + result += Counter(d) + return {'freqdist': result.most_common()} diff --git a/tests/test_celery_corpus_task.py b/tests/test_celery_corpus_task.py new file mode 100644 index 0000000..c37d22a --- /dev/null +++ b/tests/test_celery_corpus_task.py @@ -0,0 +1,63 @@ +# coding: utf-8 +# +# Copyright 2015 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . +from pypln.backend.celery_task import PyPLNCorpusTask +from mock import MagicMock +from utils import TaskTest + + +class FakeCorpusTask(PyPLNCorpusTask): + def process(self, documents): + return {'result': sum([d["input"] for d in documents])} + +class TestCeleryCorpusTask(TaskTest): + def test_task_should_only_get_the_correct_documents(self): + # This is just preparing the expected input in the database + wrong_doc_id = self.collection.insert({'input': 999}, w=1) + correct_doc_id_1 = self.collection.insert({'input': 1}, w=1) + correct_doc_id_2 = self.collection.insert({'input': 1}, w=1) + fake_corpus_id = 1 + + FakeCorpusTask.process = MagicMock(return_value={'result': 2}) + + corpus_task = FakeCorpusTask() + + corpus_task.delay(fake_corpus_id, [correct_doc_id_1, correct_doc_id_2]) + + corpus_task.process.assert_called() + + # We need to compare the call args because it's called with a mongo + # cursor, not a list. + # We're getting [0][0] because we want the args (not kwargs) for the + # first call to the method. + call_args = list(corpus_task.process.call_args[0][0]) + for arg in call_args: + self.assertEqual(arg['input'], 1) + + def test_task_is_saving_the_result_to_mongo_with_the_corpus_id(self): + expected_result = 42 + doc_id_1 = self.collection.insert({'input': 21}, w=1) + doc_id_2 = self.collection.insert({'input': 21}, w=1) + fake_corpus_id = 1 + + FakeCorpusTask().delay(fake_corpus_id, [doc_id_1, doc_id_2]) + + resulting_corpus_analysis = self.corpora_collection.find_one( + {'corpus_id': fake_corpus_id})['result'] + + self.assertEqual(resulting_corpus_analysis, expected_result) diff --git a/tests/test_worker_corpus_freqdist.py b/tests/test_worker_corpus_freqdist.py new file mode 100644 index 0000000..68fc0c9 --- /dev/null +++ b/tests/test_worker_corpus_freqdist.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . +from pypln.backend.workers import CorpusFreqDist +from utils import TaskTest + + +class TestCorpusFreqDistWorker(TaskTest): + def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): + + freqdist_1 = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1], + [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]] + + freqdist_2 = [[u'the', 2], [u'brown', 1], [u'lazy', 1], + [u'over', 1], [u'fox', 1], [u'dog', 1], [u'.', 1], + [u'quick', 1], [u'jumps', 1]] + + corpus_fd = [(u'the', 4), (u'is', 2), (u'.', 2), (u'blue', 1), + (u'brown', 1), (u'lazy', 1), (u'fox', 1), (u'jumps', 1), + (u'sun', 1), (u'dog', 1), (u'sky', 1), (u',', 1), + (u'yellow', 1), (u'quick', 1), (u'over', 1)] + + result = CorpusFreqDist().process([{'freqdist': freqdist_1}, + {'freqdist': freqdist_2}]) + + resulting_corpus_fd = result['freqdist'] + + self.assertEqual(resulting_corpus_fd, corpus_fd) diff --git a/tests/utils.py b/tests/utils.py index a2168c2..a8f4845 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -34,6 +34,7 @@ def setUp(self): app.conf.update(CELERY_ALWAYS_EAGER=True) self.db = pymongo.Connection()[self.db_name] self.collection = self.db[config.MONGODB_COLLECTION] + self.corpora_collection = self.db[config.MONGODB_CORPORA_COLLECTION] def tearDown(self): self.collection.remove({})