From 3af9bef64e6174e7e1b425b4aaae1c95172b82bb Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 22 Jun 2016 20:42:12 -0300 Subject: [PATCH 01/10] Remove mocks for GridFSDataRetriever wich no longer exists --- pypln/web/backend_adapter/tests.py | 4 ++-- pypln/web/core/storage.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py index 28e18a8..15f9740 100644 --- a/pypln/web/backend_adapter/tests.py +++ b/pypln/web/backend_adapter/tests.py @@ -68,14 +68,14 @@ def test_should_create_indexing_pipelines_for_document(self, extractor): extractor.assert_called_with() extractor.return_value.si.assert_called_with(ObjectId(self.document.blob.name)) - @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True) + @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True) def test_should_add_index_name_to_the_document_in_mongo(self, gridfs_data_retriever): create_indexing_pipeline(self.document) mongo_document = self.get_mongo_doc(self.document) self.assertEqual(mongo_document['index_name'], self.document.index_name) - @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True) + @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True) def test_should_add_doc_type_to_the_document_in_mongo(self, gridfs_data_retriever): create_indexing_pipeline(self.document) diff --git a/pypln/web/core/storage.py b/pypln/web/core/storage.py index abd5130..f4103d1 100644 --- a/pypln/web/core/storage.py +++ b/pypln/web/core/storage.py @@ -28,7 +28,6 @@ from django.conf import settings from django.utils.encoding import filepath_to_uri from pymongo import Connection -from gridfs import GridFS, NoFile class MongoDBBase64Storage(Storage): From 0b716e91935b74ab5993f83be213be042bc795ad Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 12 Jul 2016 12:23:27 -0300 Subject: [PATCH 02/10] Adapts `manage` helper function to the new settings scheme --- contrib/postactivate | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/contrib/postactivate b/contrib/postactivate index 9482463..b3634e7 100755 --- a/contrib/postactivate +++ b/contrib/postactivate @@ -29,6 +29,10 @@ function manage_with_settings() { PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py $* --settings=pypln.web.settings.$SETTINGS; } +function manage() { + PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py $* +} + alias manage_dev="manage_with_settings development" alias manage_test="manage_with_settings test" alias run_tests="manage_test test pypln.web.core.tests" From 8aac6cac3eeb00e3b2a6cb4cf7757b96cd68f478 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 12 Jul 2016 15:27:03 -0300 Subject: [PATCH 03/10] Adds 'properties' to Corpus model This adds a fixture for a corpus freqdist, which generates a warning (since django tries to load a fixture for something that is not a model). For now I'll leave this warning. But maybe we should fix this before merging this branch. --- .../fixtures/mongodb/corpora_analysis.json | 16 ++++++++++++++ pypln/web/core/models.py | 9 ++++++++ pypln/web/core/tests/test_models.py | 21 ++++++++++++++++++- pypln/web/core/tests/utils.py | 7 +++++++ pypln/web/settings.py | 1 + 5 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 pypln/web/core/fixtures/mongodb/corpora_analysis.json diff --git a/pypln/web/core/fixtures/mongodb/corpora_analysis.json b/pypln/web/core/fixtures/mongodb/corpora_analysis.json new file mode 100644 index 0000000..955bf97 --- /dev/null +++ b/pypln/web/core/fixtures/mongodb/corpora_analysis.json @@ -0,0 +1,16 @@ +[ + { + "_id" : { "$oid": "5785005257bc3a1070d8cdbf" }, + "corpus_id" : 1, + "freqdist" : [ + [ "á", 1 ], + [ "non-ascii", 1 ], + [ ".", 1 ], + [ "char", 1 ], + [ "file", 1 ], + [ "test", 1 ], + [ ":", 1 ], + [ "with", 1 ] + ] + } +] diff --git a/pypln/web/core/models.py b/pypln/web/core/models.py index df4e177..6808d12 100644 --- a/pypln/web/core/models.py +++ b/pypln/web/core/models.py @@ -21,6 +21,7 @@ from django.contrib.auth.models import User from django.dispatch import receiver from django.db import models +import pymongo from rest_framework.reverse import reverse from rest_framework.authtoken.models import Token @@ -28,6 +29,7 @@ from pypln.web.core.storage import MongoDBBase64Storage mongodb_storage = MongoDBBase64Storage() +corpus_collection = pymongo.Connection(host=settings.MONGODB_URIS)[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION] class Corpus(models.Model): @@ -43,6 +45,13 @@ class Meta: def __unicode__(self): return self.name + @property + def properties(self): + corpus_analysis = corpus_collection.find_one({"corpus_id": self.id}) + if corpus_analysis is None: + return {} + return corpus_analysis + class Document(models.Model): blob = models.FileField(upload_to='/', storage=mongodb_storage) diff --git a/pypln/web/core/tests/test_models.py b/pypln/web/core/tests/test_models.py index 753be0d..bf92df1 100644 --- a/pypln/web/core/tests/test_models.py +++ b/pypln/web/core/tests/test_models.py @@ -27,7 +27,7 @@ from pypln.web.core.models import Corpus, Document from pypln.web.core.tests.utils import TestWithMongo -__all__ = ["CorpusModelTest", "DocumentModelTest"] +__all__ = ["CorpusModelTest", "CorpusPropertiesTest", "DocumentModelTest"] class CorpusModelTest(TestCase): fixtures = ['users'] @@ -46,6 +46,25 @@ def test_different_users_can_have_corpora_with_the_same_name(self): self.assertEqual(corpus_1.name, corpus_2.name) +class CorpusPropertiesTest(TestWithMongo): + fixtures = ['users', 'corpora', 'corpora_analysis'] + + def test_returns_keyerror_when_key_does_not_exist(self): + expected_data = u'Test file with non-ascii char: á.' + corpus = Corpus.objects.all()[0] + with self.assertRaises(KeyError): + corpus.properties['analysis_that_does_not_exist'] + + def test_get_freqdist_from_store(self): + expected_data = [ + [u"á", 1], [u"non-ascii", 1], [u".", 1], + [u"char", 1], [u"file", 1], [u"test", 1], [u":", 1], + [u"with", 1 ] + ] + corpus = Corpus.objects.all()[0] + self.assertEqual(corpus.properties['freqdist'], expected_data) + + class DocumentModelTest(TestWithMongo): fixtures = ['users', 'corpora', 'documents'] diff --git a/pypln/web/core/tests/utils.py b/pypln/web/core/tests/utils.py index 4ed906f..8bc22ed 100644 --- a/pypln/web/core/tests/utils.py +++ b/pypln/web/core/tests/utils.py @@ -42,6 +42,13 @@ def _pre_setup(self, *args, **kwargs): mongodb_storage.save(os.path.basename(doc.blob.name), StringIO(u"Test file with non-ascii char: á.".encode('utf-8'))) + if hasattr(self, 'fixtures') and self.fixtures is not None and 'corpora_analysis' in self.fixtures: + filename = os.path.join(settings.PROJECT_ROOT, 'core/fixtures/mongodb/corpora_analysis.json') + with open(filename, 'r') as mongo_fixture: + for obj in json_util.loads(mongo_fixture.read()): + mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj) + + def _post_teardown(self, *args, **kwargs): mongodb_storage._connection.drop_database(mongodb_storage._db.name) super(TestWithMongo, self)._post_teardown(*args, **kwargs) diff --git a/pypln/web/settings.py b/pypln/web/settings.py index 9c36239..36152b6 100644 --- a/pypln/web/settings.py +++ b/pypln/web/settings.py @@ -58,6 +58,7 @@ def split_uris(uri): MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln') MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis') +MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION', default='corpora_analysis') ALLOWED_HOSTS = config('ALLOWED_HOSTS', cast=Csv()) From da84551adbe4c71693b47199b32c15f036091d98 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 12 Jul 2016 18:24:01 -0300 Subject: [PATCH 04/10] Makes sure TestWithMongo writes to mongo before running the test We were getting some errors when tests ran because the write operation didn't finish before trying to retrieve data. --- pypln/web/core/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/web/core/tests/utils.py b/pypln/web/core/tests/utils.py index 8bc22ed..43fb1d2 100644 --- a/pypln/web/core/tests/utils.py +++ b/pypln/web/core/tests/utils.py @@ -46,7 +46,7 @@ def _pre_setup(self, *args, **kwargs): filename = os.path.join(settings.PROJECT_ROOT, 'core/fixtures/mongodb/corpora_analysis.json') with open(filename, 'r') as mongo_fixture: for obj in json_util.loads(mongo_fixture.read()): - mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj) + mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj, w=1) def _post_teardown(self, *args, **kwargs): From 8b1ef1a6fe32d5fa191976c170eba6a215bf1de5 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 12 Jul 2016 18:26:32 -0300 Subject: [PATCH 05/10] WIP: adds CorpusFreqDist view with reading capability The view will also be used to run the analyisis, but is not yet working for that. --- .../core/tests/views/test_corpus_analysis.py | 172 ++++++++++++++++++ pypln/web/core/urls.py | 3 + pypln/web/core/views.py | 14 ++ 3 files changed, 189 insertions(+) create mode 100644 pypln/web/core/tests/views/test_corpus_analysis.py diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py new file mode 100644 index 0000000..4bc9f1b --- /dev/null +++ b/pypln/web/core/tests/views/test_corpus_analysis.py @@ -0,0 +1,172 @@ +# -*- coding:utf-8 -*- +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . +import json + +from django.contrib.auth.models import User +from django.core.urlresolvers import reverse +from mock import patch + +from pypln.web.core.models import Corpus, User +from pypln.web.core.tests.utils import TestWithMongo + +__all__ = ["CorpusFreqDistViewTest"] + + +class CorpusFreqDistViewTest(TestWithMongo): + fixtures = ['users', 'corpora', 'documents', 'corpora_analysis'] + + def test_requires_login(self): + response = self.client.get(reverse('corpus-freqdist', + kwargs={'pk': 2})) + self.assertEqual(response.status_code, 403) + +# def test_shows_corpus_correctly(self): +# self.client.login(username="user", password="user") +# corpus = Corpus.objects.filter(owner__username="user")[0] +# response = self.client.get(reverse('corpus-detail', +# kwargs={'pk': corpus.id})) +# +# self.assertEqual(response.status_code, 200) +# self.assertEqual(response.renderer_context['view'].get_object(), corpus) + + + def test_returns_404_for_inexistent_corpus(self): + self.client.login(username="user", password="user") + response = self.client.get(reverse('corpus-freqdist', + kwargs={'pk': 9999})) + self.assertEqual(response.status_code, 404) + + def test_returns_404_if_user_is_not_the_owner_of_the_corpus(self): + self.client.login(username="user", password="user") + corpus = Corpus.objects.filter(owner__username="admin")[0] + response = self.client.get(reverse('corpus-freqdist', + kwargs={'pk': corpus.id})) + self.assertEqual(response.status_code, 404) + + def test_shows_corpus_freqdist_correctly(self): + self.client.login(username="admin", password="admin") + corpus = Corpus.objects.filter(owner__username="admin")[0] + response = self.client.get(reverse('corpus-freqdist', + kwargs={'pk': corpus.id})) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.renderer_context['view'].get_object(), + corpus) + expected_data = corpus.properties['freqdist'] + self.assertEqual(response.data['value'], expected_data) + + + @patch('pypln.web.backend_adapter.pipelines.corpus_freqdist') + def _test_queue_freqdist_analysis_for_a_corpus_that_didnt_have_one(self, + corpus_freqdist): + self.user = User.objects.get(username="user") + self.assertEqual(len(self.user.document_set.all()), 1) + self.client.login(username="user", password="user") + + corpus = self.user.corpus_set.all()[0] + response = self.client.put(reverse('corpus-freqdist', + kwargs={"pk": corpus.id})) + + self.assertEqual(response.status_code, 201) + self.assertTrue(corpus_freqdist.called) + + document_ids = corpus.document_set.all() + + corpus_freqdist.assert_called_with(corpus.id, document_ids) + +# def test_edit_corpus(self): +# self.client.login(username="user", password="user") +# corpus = Corpus.objects.filter(owner__username="user")[0] +# response = self.client.put(reverse('corpus-detail', +# kwargs={'pk': corpus.id}), json.dumps({"name": "New name", +# "description": "New description"}), content_type="application/json") +# +# self.assertEqual(response.status_code, 200) +# updated_corpus = Corpus.objects.filter(owner__username="user")[0] +# self.assertEqual(updated_corpus.name, "New name") +# self.assertEqual(updated_corpus.description, "New description") +# +# def test_cant_change_name_to_one_that_already_exists_for_this_user(self): +# self.client.login(username="user", password="user") +# user = User.objects.get(username="user") +# conflicting_corpus = Corpus.objects.create(name="Conflicting name", +# owner=user, description="This corpus is here to create a conflict") +# +# corpus = Corpus.objects.filter(owner__username="user")[0] +# response = self.client.put(reverse('corpus-detail', +# kwargs={'pk': corpus.id}), json.dumps({"name": "Conflicting name", +# "description": "New description"}), content_type="application/json") +# +# self.assertEqual(response.status_code, 400) +# not_updated_corpus = Corpus.objects.filter(owner__username="user")[0] +# self.assertEqual(not_updated_corpus.name, "User Test Corpus") +# self.assertEqual(not_updated_corpus.description, "This corpus belongs to the user 'user'") +# +# def test_cant_edit_other_peoples_corpora(self): +# """ +# A PUT request to another person's corpus actually raises Http404, as +# if the document did not exist. Since rest_framework uses PUT-as-create, +# this means a new object is created with the provided information. +# """ +# self.client.login(username="user", password="user") +# corpus = Corpus.objects.filter(owner__username="admin")[0] +# response = self.client.put(reverse('corpus-detail', +# kwargs={'pk': corpus.id}), json.dumps({"name": "New name", +# "description": "New description"}), content_type="application/json") +# self.assertEqual(response.status_code, 404) +# +# reloaded_corpus = Corpus.objects.filter(owner__username="admin")[0] +# self.assertNotEqual(reloaded_corpus.name, "New name") +# self.assertNotEqual(reloaded_corpus.description, "New description") +# +# def test_cant_change_the_owner_of_a_corpus(self): +# self.client.login(username="user", password="user") +# corpus = Corpus.objects.filter(owner__username="user")[0] +# # We try to set 'admin' as the owner (id=1) +# response = self.client.put(reverse('corpus-detail', +# kwargs={'pk': corpus.id}), json.dumps({"name": "Corpus", +# "description": "description", "owner": 1}), +# content_type="application/json") +# +# self.assertEqual(response.status_code, 200) +# # but the view sets the request user as the owner anyway +# self.assertEqual(response.data["owner"], "user") +# +# def test_delete_a_corpus(self): +# self.client.login(username="user", password="user") +# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) +# +# corpus = Corpus.objects.filter(owner__username="user")[0] +# response = self.client.delete(reverse('corpus-detail', +# kwargs={'pk': corpus.id})) +# +# self.assertEqual(response.status_code, 204) +# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 0) +# +# def test_cant_delete_other_peoples_corpora(self): +# self.client.login(username="user", password="user") +# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) +# +# corpus = Corpus.objects.filter(owner__username="admin")[0] +# response = self.client.delete(reverse('corpus-detail', +# kwargs={'pk': corpus.id})) +# +# self.assertEqual(response.status_code, 404) +# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) +#""" diff --git a/pypln/web/core/urls.py b/pypln/web/core/urls.py index 77206cd..c893fa8 100644 --- a/pypln/web/core/urls.py +++ b/pypln/web/core/urls.py @@ -20,6 +20,7 @@ from django.conf.urls import patterns, url, include from rest_framework.urlpatterns import format_suffix_patterns from pypln.web.core.views import CorpusList, CorpusDetail, CorpusDocumentList +from pypln.web.core.views import CorpusFreqDist from pypln.web.core.views import DocumentList, DocumentDetail from pypln.web.core.views import PropertyList, PropertyDetail @@ -28,6 +29,8 @@ url(r'^user/api-token/$', 'auth_token', name='auth_token'), url(r'^corpora/$', CorpusList.as_view(), name='corpus-list'), url(r'^corpora/(?P\d+)/$', CorpusDetail.as_view(), name='corpus-detail'), + url(r'^corpora/(?P\d+)/freqdist/$', CorpusFreqDist.as_view(), + name='corpus-freqdist'), url(r'^corpora/(?P\d+)/documents/$', CorpusDocumentList.as_view(), name='corpus-document-list'), url(r'^documents/$', DocumentList.as_view(), name='document-list'), diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py index fc10cd9..ce9fc06 100644 --- a/pypln/web/core/views.py +++ b/pypln/web/core/views.py @@ -116,6 +116,20 @@ def get_queryset(self): def perform_update(self, serializer): instance = serializer.save(owner=self.request.user) +class CorpusFreqDist(generics.RetrieveUpdateAPIView): + """ + """ + model = Corpus + permission_classes = (permissions.IsAuthenticated, ) + + class CorpusFreqDistSerializer(serializers.Serializer): + value = serializers.ReadOnlyField(source="properties.freqdist") + + serializer_class = CorpusFreqDistSerializer + + def get_queryset(self): + return Corpus.objects.filter(owner=self.request.user) + class DocumentList(generics.ListCreateAPIView): """ Lists all documents available to the current user and creates new documents. From 1dc3ae16a4253498dc7893fa85ce30bf77c109e4 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 12 Jul 2016 18:43:46 -0300 Subject: [PATCH 06/10] Returns 404 if corpus does not have a freqdist yet This also includes a change to the corpus analysis fixture, to keep the data with the "user" user. --- .../fixtures/mongodb/corpora_analysis.json | 2 +- pypln/web/core/tests/test_models.py | 2 +- .../core/tests/views/test_corpus_analysis.py | 19 ++++++++----------- pypln/web/core/views.py | 7 +++++++ 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pypln/web/core/fixtures/mongodb/corpora_analysis.json b/pypln/web/core/fixtures/mongodb/corpora_analysis.json index 955bf97..ff77cb0 100644 --- a/pypln/web/core/fixtures/mongodb/corpora_analysis.json +++ b/pypln/web/core/fixtures/mongodb/corpora_analysis.json @@ -1,7 +1,7 @@ [ { "_id" : { "$oid": "5785005257bc3a1070d8cdbf" }, - "corpus_id" : 1, + "corpus_id" : 2, "freqdist" : [ [ "á", 1 ], [ "non-ascii", 1 ], diff --git a/pypln/web/core/tests/test_models.py b/pypln/web/core/tests/test_models.py index bf92df1..8f039fe 100644 --- a/pypln/web/core/tests/test_models.py +++ b/pypln/web/core/tests/test_models.py @@ -61,7 +61,7 @@ def test_get_freqdist_from_store(self): [u"char", 1], [u"file", 1], [u"test", 1], [u":", 1], [u"with", 1 ] ] - corpus = Corpus.objects.all()[0] + corpus = Corpus.objects.get(pk=2) self.assertEqual(corpus.properties['freqdist'], expected_data) diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py index 4bc9f1b..4d50a47 100644 --- a/pypln/web/core/tests/views/test_corpus_analysis.py +++ b/pypln/web/core/tests/views/test_corpus_analysis.py @@ -36,16 +36,6 @@ def test_requires_login(self): kwargs={'pk': 2})) self.assertEqual(response.status_code, 403) -# def test_shows_corpus_correctly(self): -# self.client.login(username="user", password="user") -# corpus = Corpus.objects.filter(owner__username="user")[0] -# response = self.client.get(reverse('corpus-detail', -# kwargs={'pk': corpus.id})) -# -# self.assertEqual(response.status_code, 200) -# self.assertEqual(response.renderer_context['view'].get_object(), corpus) - - def test_returns_404_for_inexistent_corpus(self): self.client.login(username="user", password="user") response = self.client.get(reverse('corpus-freqdist', @@ -59,9 +49,16 @@ def test_returns_404_if_user_is_not_the_owner_of_the_corpus(self): kwargs={'pk': corpus.id})) self.assertEqual(response.status_code, 404) - def test_shows_corpus_freqdist_correctly(self): + def test_returns_404_if_corpus_has_no_freqdist_yet(self): self.client.login(username="admin", password="admin") corpus = Corpus.objects.filter(owner__username="admin")[0] + response = self.client.get(reverse('corpus-freqdist', + kwargs={'pk': corpus.id})) + self.assertEqual(response.status_code, 404) + + def test_shows_corpus_freqdist_correctly(self): + self.client.login(username="user", password="user") + corpus = Corpus.objects.filter(owner__username="user")[0] response = self.client.get(reverse('corpus-freqdist', kwargs={'pk': corpus.id})) diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py index ce9fc06..773406e 100644 --- a/pypln/web/core/views.py +++ b/pypln/web/core/views.py @@ -130,6 +130,13 @@ class CorpusFreqDistSerializer(serializers.Serializer): def get_queryset(self): return Corpus.objects.filter(owner=self.request.user) + def get_object(self, *args, **kwargs): + corpus = super(CorpusFreqDist, self).get_object(*args, **kwargs) + if corpus.properties.has_key("freqdist"): + return corpus + else: + raise Http404("FreqDist for Corpus {} is not yet available".format(corpus)) + class DocumentList(generics.ListCreateAPIView): """ Lists all documents available to the current user and creates new documents. From 3a23140c6e9269a762a3a585f7249950b72e2f0e Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Fri, 15 Jul 2016 16:07:00 -0300 Subject: [PATCH 07/10] Adds a function to queue a corpus freq dist task in the backend --- pypln/web/backend_adapter/pipelines.py | 4 ++++ pypln/web/backend_adapter/tests.py | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pypln/web/backend_adapter/pipelines.py b/pypln/web/backend_adapter/pipelines.py index 4ffbf3f..fddb8f1 100644 --- a/pypln/web/backend_adapter/pipelines.py +++ b/pypln/web/backend_adapter/pipelines.py @@ -44,6 +44,10 @@ def create_indexing_pipeline(doc): {"index_name": doc.index_name, "doc_type": doc.doc_type}}) (Extractor().si(doc_id) | ElasticIndexer().si(doc_id))() +def corpus_freqdist(corpus): + blob_ids = list(corpus.document_set.values_list('blob', flat=True)) + CorpusFreqDist().si(corpus.pk, blob_ids) + def get_config_from_router(api, timeout=5): client = Client() client.connect(api) diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py index 15f9740..f3afd53 100644 --- a/pypln/web/backend_adapter/tests.py +++ b/pypln/web/backend_adapter/tests.py @@ -26,13 +26,13 @@ from mock import patch from pypln.web.backend_adapter.pipelines import (create_indexing_pipeline, - call_default_pipeline, create_pipeline_from_document) -from pypln.web.core.models import IndexedDocument, Document, mongodb_storage + call_default_pipeline, create_pipeline_from_document, corpus_freqdist) +from pypln.web.core.models import IndexedDocument, Document, mongodb_storage, Corpus from pypln.web.core.tests.utils import TestWithMongo __all__ = ["CreatePipelineTest", "CreateIndexingPipelineTest", - "CreatePipelineFromDocumentTest"] + "CreatePipelineFromDocumentTest", "CorpusFreqDistTest"] class CreatePipelineTest(TestWithMongo): @@ -91,3 +91,16 @@ def test_create_pipeline_from_document_instantiates_a_document_id(self, fake_cal doc = Document.objects.all()[0] create_pipeline_from_document(doc) fake_call_default_pipeline.assert_called_with(ObjectId(doc.blob.name)) + + +class CorpusFreqDistTest(TestWithMongo): + fixtures = ['users', 'corpora', 'documents'] + + @patch('pypln.web.backend_adapter.pipelines.CorpusFreqDist', autospec=True) + def test_should_call_CorpusFreqDist_with_document_ids(self, + corpus_freqdist_worker): + corpus = Corpus.objects.get(pk=2) + ids = [u"562526d9798ebd4616b23bb1"] + corpus_freqdist(corpus) + corpus_freqdist_worker.assert_called_with() + corpus_freqdist_worker.return_value.si.assert_called_with(corpus.pk, ids) From 959025212197c8ca270e74ea89bc23ab02b8f4e0 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 18 Jul 2016 21:41:19 -0300 Subject: [PATCH 08/10] Queue a FreqDist analysis for a corpus when a PUT request is received --- .../core/tests/views/test_corpus_analysis.py | 115 ++++-------------- pypln/web/core/views.py | 19 ++- 2 files changed, 40 insertions(+), 94 deletions(-) diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py index 4d50a47..71704a0 100644 --- a/pypln/web/core/tests/views/test_corpus_analysis.py +++ b/pypln/web/core/tests/views/test_corpus_analysis.py @@ -68,102 +68,37 @@ def test_shows_corpus_freqdist_correctly(self): expected_data = corpus.properties['freqdist'] self.assertEqual(response.data['value'], expected_data) - - @patch('pypln.web.backend_adapter.pipelines.corpus_freqdist') - def _test_queue_freqdist_analysis_for_a_corpus_that_didnt_have_one(self, - corpus_freqdist): - self.user = User.objects.get(username="user") - self.assertEqual(len(self.user.document_set.all()), 1) - self.client.login(username="user", password="user") + @patch('pypln.web.core.views.corpus_freqdist') + def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self,corpus_freqdist): + """ + This is a regression test. There used to be a bug that returned 404 + before queueing the analysis if the corpus didn't have a freqdist + analysis yet. + """ + self.user = User.objects.get(username="admin") + self.client.login(username="admin", password="admin") corpus = self.user.corpus_set.all()[0] response = self.client.put(reverse('corpus-freqdist', kwargs={"pk": corpus.id})) - self.assertEqual(response.status_code, 201) + self.assertFalse(corpus.properties.has_key("freqdist")) + + self.assertEqual(response.status_code, 200) self.assertTrue(corpus_freqdist.called) + corpus_freqdist.assert_called_with(corpus) - document_ids = corpus.document_set.all() + @patch('pypln.web.core.views.corpus_freqdist') + def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,corpus_freqdist): + self.user = User.objects.get(username="user") + self.client.login(username="user", password="user") - corpus_freqdist.assert_called_with(corpus.id, document_ids) + corpus = self.user.corpus_set.all()[0] + response = self.client.put(reverse('corpus-freqdist', + kwargs={"pk": corpus.id})) -# def test_edit_corpus(self): -# self.client.login(username="user", password="user") -# corpus = Corpus.objects.filter(owner__username="user")[0] -# response = self.client.put(reverse('corpus-detail', -# kwargs={'pk': corpus.id}), json.dumps({"name": "New name", -# "description": "New description"}), content_type="application/json") -# -# self.assertEqual(response.status_code, 200) -# updated_corpus = Corpus.objects.filter(owner__username="user")[0] -# self.assertEqual(updated_corpus.name, "New name") -# self.assertEqual(updated_corpus.description, "New description") -# -# def test_cant_change_name_to_one_that_already_exists_for_this_user(self): -# self.client.login(username="user", password="user") -# user = User.objects.get(username="user") -# conflicting_corpus = Corpus.objects.create(name="Conflicting name", -# owner=user, description="This corpus is here to create a conflict") -# -# corpus = Corpus.objects.filter(owner__username="user")[0] -# response = self.client.put(reverse('corpus-detail', -# kwargs={'pk': corpus.id}), json.dumps({"name": "Conflicting name", -# "description": "New description"}), content_type="application/json") -# -# self.assertEqual(response.status_code, 400) -# not_updated_corpus = Corpus.objects.filter(owner__username="user")[0] -# self.assertEqual(not_updated_corpus.name, "User Test Corpus") -# self.assertEqual(not_updated_corpus.description, "This corpus belongs to the user 'user'") -# -# def test_cant_edit_other_peoples_corpora(self): -# """ -# A PUT request to another person's corpus actually raises Http404, as -# if the document did not exist. Since rest_framework uses PUT-as-create, -# this means a new object is created with the provided information. -# """ -# self.client.login(username="user", password="user") -# corpus = Corpus.objects.filter(owner__username="admin")[0] -# response = self.client.put(reverse('corpus-detail', -# kwargs={'pk': corpus.id}), json.dumps({"name": "New name", -# "description": "New description"}), content_type="application/json") -# self.assertEqual(response.status_code, 404) -# -# reloaded_corpus = Corpus.objects.filter(owner__username="admin")[0] -# self.assertNotEqual(reloaded_corpus.name, "New name") -# self.assertNotEqual(reloaded_corpus.description, "New description") -# -# def test_cant_change_the_owner_of_a_corpus(self): -# self.client.login(username="user", password="user") -# corpus = Corpus.objects.filter(owner__username="user")[0] -# # We try to set 'admin' as the owner (id=1) -# response = self.client.put(reverse('corpus-detail', -# kwargs={'pk': corpus.id}), json.dumps({"name": "Corpus", -# "description": "description", "owner": 1}), -# content_type="application/json") -# -# self.assertEqual(response.status_code, 200) -# # but the view sets the request user as the owner anyway -# self.assertEqual(response.data["owner"], "user") -# -# def test_delete_a_corpus(self): -# self.client.login(username="user", password="user") -# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) -# -# corpus = Corpus.objects.filter(owner__username="user")[0] -# response = self.client.delete(reverse('corpus-detail', -# kwargs={'pk': corpus.id})) -# -# self.assertEqual(response.status_code, 204) -# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 0) -# -# def test_cant_delete_other_peoples_corpora(self): -# self.client.login(username="user", password="user") -# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) -# -# corpus = Corpus.objects.filter(owner__username="admin")[0] -# response = self.client.delete(reverse('corpus-detail', -# kwargs={'pk': corpus.id})) -# -# self.assertEqual(response.status_code, 404) -# self.assertEqual(len(Corpus.objects.filter(owner__username="user")), 1) -#""" + self.assertTrue(corpus.properties.has_key("freqdist")) + + self.assertEqual(response.status_code, 200) + self.assertTrue(corpus_freqdist.called) + corpus_freqdist.assert_called_with(corpus) diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py index 773406e..d6985c5 100644 --- a/pypln/web/core/views.py +++ b/pypln/web/core/views.py @@ -28,7 +28,7 @@ from rest_framework.response import Response from rest_framework import serializers -from pypln.web.backend_adapter.pipelines import create_pipeline_from_document +from pypln.web.backend_adapter.pipelines import create_pipeline_from_document, corpus_freqdist from pypln.web.core.models import Corpus, Document from pypln.web.core.serializers import CorpusSerializer, DocumentSerializer from pypln.web.core.serializers import PropertyListSerializer @@ -118,6 +118,13 @@ def perform_update(self, serializer): class CorpusFreqDist(generics.RetrieveUpdateAPIView): """ + Shows FreqDist for the corpus + + `GET` requests will show the last calculated FreqDist for the corpus + + `PUT` requests will queue a new task for calculating the Corpus + FreqDist using the documents currently contained in the corpus + """ model = Corpus permission_classes = (permissions.IsAuthenticated, ) @@ -130,13 +137,17 @@ class CorpusFreqDistSerializer(serializers.Serializer): def get_queryset(self): return Corpus.objects.filter(owner=self.request.user) - def get_object(self, *args, **kwargs): - corpus = super(CorpusFreqDist, self).get_object(*args, **kwargs) + def retrieve(self, *args, **kwargs): + corpus = self.get_object() if corpus.properties.has_key("freqdist"): - return corpus + return super(CorpusFreqDist, self).retrieve(self, *args, **kwargs) else: raise Http404("FreqDist for Corpus {} is not yet available".format(corpus)) + def perform_update(self, serializer): + corpus_freqdist(serializer.instance) + + class DocumentList(generics.ListCreateAPIView): """ Lists all documents available to the current user and creates new documents. From c199d1ec168b5b51fddd4a77501a293f00ad32a2 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 19 Jul 2016 00:20:25 -0300 Subject: [PATCH 09/10] Sends a list of ObjectId's to the task This is what the backend expects. --- pypln/web/backend_adapter/pipelines.py | 4 ++-- pypln/web/backend_adapter/tests.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/web/backend_adapter/pipelines.py b/pypln/web/backend_adapter/pipelines.py index fddb8f1..207bc7a 100644 --- a/pypln/web/backend_adapter/pipelines.py +++ b/pypln/web/backend_adapter/pipelines.py @@ -45,8 +45,8 @@ def create_indexing_pipeline(doc): (Extractor().si(doc_id) | ElasticIndexer().si(doc_id))() def corpus_freqdist(corpus): - blob_ids = list(corpus.document_set.values_list('blob', flat=True)) - CorpusFreqDist().si(corpus.pk, blob_ids) + blob_ids = map(ObjectId, corpus.document_set.values_list('blob', flat=True)) + CorpusFreqDist().delay(corpus.pk, blob_ids) def get_config_from_router(api, timeout=5): client = Client() diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py index f3afd53..1448147 100644 --- a/pypln/web/backend_adapter/tests.py +++ b/pypln/web/backend_adapter/tests.py @@ -100,7 +100,7 @@ class CorpusFreqDistTest(TestWithMongo): def test_should_call_CorpusFreqDist_with_document_ids(self, corpus_freqdist_worker): corpus = Corpus.objects.get(pk=2) - ids = [u"562526d9798ebd4616b23bb1"] + ids = [ObjectId("562526d9798ebd4616b23bb1")] corpus_freqdist(corpus) corpus_freqdist_worker.assert_called_with() - corpus_freqdist_worker.return_value.si.assert_called_with(corpus.pk, ids) + corpus_freqdist_worker.return_value.delay.assert_called_with(corpus.pk, ids) From fa253f928d4816fc96cc494625a4c6875bbeaf44 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 10 Aug 2016 17:24:39 -0300 Subject: [PATCH 10/10] Clarifies the name of the function that queues freqdist analysis for corpora --- pypln/web/backend_adapter/pipelines.py | 2 +- pypln/web/backend_adapter/tests.py | 4 ++-- .../core/tests/views/test_corpus_analysis.py | 18 ++++++++++-------- pypln/web/core/views.py | 5 +++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pypln/web/backend_adapter/pipelines.py b/pypln/web/backend_adapter/pipelines.py index 207bc7a..f77505e 100644 --- a/pypln/web/backend_adapter/pipelines.py +++ b/pypln/web/backend_adapter/pipelines.py @@ -44,7 +44,7 @@ def create_indexing_pipeline(doc): {"index_name": doc.index_name, "doc_type": doc.doc_type}}) (Extractor().si(doc_id) | ElasticIndexer().si(doc_id))() -def corpus_freqdist(corpus): +def calculate_corpus_freqdist(corpus): blob_ids = map(ObjectId, corpus.document_set.values_list('blob', flat=True)) CorpusFreqDist().delay(corpus.pk, blob_ids) diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py index 1448147..936963e 100644 --- a/pypln/web/backend_adapter/tests.py +++ b/pypln/web/backend_adapter/tests.py @@ -26,7 +26,7 @@ from mock import patch from pypln.web.backend_adapter.pipelines import (create_indexing_pipeline, - call_default_pipeline, create_pipeline_from_document, corpus_freqdist) + call_default_pipeline, create_pipeline_from_document, calculate_corpus_freqdist) from pypln.web.core.models import IndexedDocument, Document, mongodb_storage, Corpus from pypln.web.core.tests.utils import TestWithMongo @@ -101,6 +101,6 @@ def test_should_call_CorpusFreqDist_with_document_ids(self, corpus_freqdist_worker): corpus = Corpus.objects.get(pk=2) ids = [ObjectId("562526d9798ebd4616b23bb1")] - corpus_freqdist(corpus) + calculate_corpus_freqdist(corpus) corpus_freqdist_worker.assert_called_with() corpus_freqdist_worker.return_value.delay.assert_called_with(corpus.pk, ids) diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py index 71704a0..dcc85de 100644 --- a/pypln/web/core/tests/views/test_corpus_analysis.py +++ b/pypln/web/core/tests/views/test_corpus_analysis.py @@ -68,8 +68,9 @@ def test_shows_corpus_freqdist_correctly(self): expected_data = corpus.properties['freqdist'] self.assertEqual(response.data['value'], expected_data) - @patch('pypln.web.core.views.corpus_freqdist') - def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self,corpus_freqdist): + @patch('pypln.web.core.views.calculate_corpus_freqdist') + def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self, + calculate_corpus_freqdist): """ This is a regression test. There used to be a bug that returned 404 before queueing the analysis if the corpus didn't have a freqdist @@ -85,11 +86,12 @@ def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self, self.assertFalse(corpus.properties.has_key("freqdist")) self.assertEqual(response.status_code, 200) - self.assertTrue(corpus_freqdist.called) - corpus_freqdist.assert_called_with(corpus) + self.assertTrue(calculate_corpus_freqdist.called) + calculate_corpus_freqdist.assert_called_with(corpus) - @patch('pypln.web.core.views.corpus_freqdist') - def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,corpus_freqdist): + @patch('pypln.web.core.views.calculate_corpus_freqdist') + def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self, + calculate_corpus_freqdist): self.user = User.objects.get(username="user") self.client.login(username="user", password="user") @@ -100,5 +102,5 @@ def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,corpus_freqdist) self.assertTrue(corpus.properties.has_key("freqdist")) self.assertEqual(response.status_code, 200) - self.assertTrue(corpus_freqdist.called) - corpus_freqdist.assert_called_with(corpus) + self.assertTrue(calculate_corpus_freqdist.called) + calculate_corpus_freqdist.assert_called_with(corpus) diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py index d6985c5..f4b0600 100644 --- a/pypln/web/core/views.py +++ b/pypln/web/core/views.py @@ -28,7 +28,8 @@ from rest_framework.response import Response from rest_framework import serializers -from pypln.web.backend_adapter.pipelines import create_pipeline_from_document, corpus_freqdist +from pypln.web.backend_adapter.pipelines import (create_pipeline_from_document, + calculate_corpus_freqdist) from pypln.web.core.models import Corpus, Document from pypln.web.core.serializers import CorpusSerializer, DocumentSerializer from pypln.web.core.serializers import PropertyListSerializer @@ -145,7 +146,7 @@ def retrieve(self, *args, **kwargs): raise Http404("FreqDist for Corpus {} is not yet available".format(corpus)) def perform_update(self, serializer): - corpus_freqdist(serializer.instance) + calculate_corpus_freqdist(serializer.instance) class DocumentList(generics.ListCreateAPIView):