Revert changes from health check and document classifier #452

Merged · 10 commits · Oct 17, 2023
8 changes: 3 additions & 5 deletions .gitignore
@@ -286,13 +286,11 @@ sde_indexing_helper/media/
**/.ipynb_checkpoints/
**/*.xlsx


# config details for the api access
# Config details for the api access
config_generation/config.py


#model's inference files
# Model's inference files
Document_Classifier_inference/model.pt

#Database Backup
# Database backup
backup.json
18 changes: 9 additions & 9 deletions Document_Classifier_inference/README.md
@@ -1,18 +1,18 @@
# Automated Document Tagging


# Project Description:
This purpose of this tag the content of a given url onto one of the five classes "Image","Documentation","Software and Tools",
"Mission and Instruments", and "Data".

#Datasets:
Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588
The purpose of this is to tag the content of a given URL as one of the six classes "Image", "Documentation", "Software and Tools",
"Mission and Instruments", "Training and Education", and "Data".

# to run the repository:
* location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
* saved weight_name: model.pt
* prediction sample:python3 main.py predicts --config_file config.json --url "url_link"
# Datasets:

Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588

# To run the inference pipeline:

- location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
- saved weight_name: model.pt
- prediction sample: `python3 main.py predicts --config_file config.json --url "url_link"`

For more details, contact [email protected]
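
A minimal sketch of calling the classifier programmatically, mirroring the `batch_predicts` call that appears in the `sde_collections/views.py` hunk below; the example URL is hypothetical:

```python
# Sketch only: programmatic classification, assuming the batch_predicts
# signature used in sde_collections/views.py before this revert.
# It takes a config path and a list of URLs and returns a dict mapping
# each URL to a predicted document type, plus the URLs that resolved
# to PDF responses.
from Document_Classifier_inference.main import batch_predicts

urls = ["https://science.example.gov/dataset"]  # hypothetical URL
prediction, pdf_lists = batch_predicts(
    "Document_Classifier_inference/config.json", urls
)
for url, document_type in prediction.items():
    print(url, "->", document_type)
```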
4 changes: 0 additions & 4 deletions sde_collections/serializers.py
@@ -40,8 +40,6 @@ class CandidateURLSerializer(serializers.ModelSerializer):
generated_title_id = serializers.SerializerMethodField(read_only=True)
match_pattern_type = serializers.SerializerMethodField(read_only=True)
candidate_urls_count = serializers.SerializerMethodField(read_only=True)
inferenced_by = serializers.CharField(read_only=True)
is_pdf = serializers.BooleanField(required=False)

def get_candidate_urls_count(self, obj):
titlepattern = obj.titlepattern_urls.last()
@@ -69,8 +67,6 @@ class Meta:
"document_type",
"document_type_display",
"visited",
"inferenced_by",
"is_pdf",
)


7 changes: 0 additions & 7 deletions sde_collections/urls.py
@@ -13,7 +13,6 @@
PushToGithubView,
RequiredUrlsDeleteView,
TitlePatternViewSet,
HealthCheckView,
)

router = routers.DefaultRouter()
@@ -33,11 +32,6 @@
PushToGithubView.as_view(),
name="push-to-github",
),
path(
"api/health-check/<int:pk>",
view=HealthCheckView.as_view(),
name="health-check"
),
path(
"delete-required-url/<int:pk>",
view=RequiredUrlsDeleteView.as_view(),
@@ -54,5 +48,4 @@
# Update an existing CandidateURL instance: /candidate-urls/{id}/
# Delete an existing CandidateURL instance: /candidate-urls/{id}/
path("api/", include(router.urls)),
path("api/model_inference", views.model_inference, name="model_inference"),
]
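
For reference, a hedged sketch of hitting the router-generated CandidateURL endpoints documented in the comments above; the paths follow from `path("api/", include(router.urls))`, and the id and payload are hypothetical:

```python
# Sketch only: exercising the router-documented CandidateURL endpoints
# with Django's test client. The pk and field value are hypothetical.
from django.test import Client

client = Client()
client.get("/api/candidate-urls/")  # list all CandidateURL instances
client.patch(
    "/api/candidate-urls/1/",  # update an existing instance (pk=1)
    data='{"visited": true}',
    content_type="application/json",
)
```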
117 changes: 14 additions & 103 deletions sde_collections/views.py
@@ -1,31 +1,22 @@
import csv
import re
from io import StringIO

from django.contrib.auth import get_user_model
from django.contrib.auth.mixins import LoginRequiredMixin
from django.db import models
from django.http import HttpResponse
from django.shortcuts import redirect
from django.urls import reverse
from django.utils import timezone
from django.views import View
from django.views.generic.detail import DetailView
from django.views.generic.edit import DeleteView
from django.views.generic.list import ListView
from rest_framework import generics, status, viewsets
from rest_framework.response import Response
from rest_framework.views import APIView

from Document_Classifier_inference.main import batch_predicts

from .forms import CollectionGithubIssueForm, RequiredUrlForm
from .models.candidate_url import CandidateURL
from .models.collection import Collection, RequiredUrls
from .models.collection_choice_fields import (
CurationStatusChoices,
WorkflowStatusChoices,
)
from .models.collection_choice_fields import CurationStatusChoices, WorkflowStatusChoices
from .models.pattern import DocumentTypePattern, ExcludePattern, TitlePattern
from .serializers import (
CandidateURLBulkCreateSerializer,
@@ -36,46 +27,10 @@
TitlePatternSerializer,
)
from .tasks import push_to_github_task
from .utils.health_check import health_check

User = get_user_model()


def model_inference(request):
if request.method == "POST":
collection_id = request.POST.get("collection_id")
candidate_urls = CandidateURL.objects.filter(
collection_id=Collection.objects.get(pk=collection_id),
).exclude(document_type__in=[1, 2, 3, 4, 5, 6])
# This list of URLs is to be inferred
to_infer_url_list = [candidate_url.url for candidate_url in candidate_urls]
if to_infer_url_list:
collection_id = candidate_urls[0].collection_id
prediction, pdf_lists = batch_predicts(
"Document_Classifier_inference/config.json", to_infer_url_list
)
# Update document_type for corresponding URLs
for candidate_url in candidate_urls:
new_document_type = prediction.get(candidate_url.url)
if new_document_type is not None:
candidate_url.document_type = new_document_type
candidate_url.inferenced_by = "model"
candidate_url.save() # Updating the changes in candidateurl table
# Create a new DocumentTypePattern entry for each URL and its document_type
DocumentTypePattern.objects.create(
collection_id=candidate_url.collection_id,
match_pattern=candidate_url.url.replace("https://", ""),
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
document_type=new_document_type,
) # Adding the new record in documenttypepattern table
if (
candidate_url.url in pdf_lists
): # flag urls whose response was a pdf
candidate_url.is_pdf = True
candidate_url.save()
return HttpResponse(status=204)


class CollectionListView(LoginRequiredMixin, ListView):
"""
Display a list of collections in the system
@@ -139,8 +94,7 @@ def post(self, request, *args, **kwargs):
else:
if "claim_button" in request.POST:
user = self.request.user
collection.curation_status = CurationStatusChoices.BEING_CURATED
collection.workflow_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
collection.curation_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
collection.curated_by = user
collection.curation_started = timezone.now()
collection.save()
@@ -310,28 +264,20 @@ def get_queryset(self):

def create(self, request, *args, **kwargs):
document_type = request.POST.get("document_type")
inferencer = request.POST.get("inferencer")
collection_id = request.POST.get("collection")
match_pattern = request.POST.get("match_pattern")
candidate_url = CandidateURL.objects.get(
collection_id=Collection.objects.get(id=collection_id),
url="https://" + match_pattern,
)
if not int(document_type) == 0: # 0=none
candidate_url.inferenced_by = inferencer
candidate_url.save()
return super().create(request, *args, **kwargs)
try:
candidate_url.inferenced_by = ""
candidate_url.save()
DocumentTypePattern.objects.get(
collection_id=Collection.objects.get(id=collection_id),
match_pattern=match_pattern,
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
).delete()
return Response(status=status.HTTP_204_NO_CONTENT)
except DocumentTypePattern.DoesNotExist:
return Response(status=status.HTTP_204_NO_CONTENT)
else:
collection_id = request.POST.get("collection")
match_pattern = request.POST.get("match_pattern")
try:
DocumentTypePattern.objects.get(
collection_id=Collection.objects.get(id=collection_id),
match_pattern=match_pattern,
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
).delete()
return Response(status=status.HTTP_200_OK)
except DocumentTypePattern.DoesNotExist:
return Response(status=status.HTTP_204_NO_CONTENT)
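
For context, a hedged sketch of the POST payload the removed branch consumed; the field names come from the `request.POST.get(...)` calls above, while the route and ids are hypothetical:

```python
# Sketch only: the payload shape the removed create() logic read.
# Route and ids are hypothetical; field names are as read in the view.
from django.test import Client

client = Client()
client.post(
    "/api/document-type-patterns/",  # hypothetical router path
    data={
        "collection": 1,  # Collection pk
        "match_pattern": "example.gov/page",  # stored without "https://"
        "document_type": 0,  # 0 = none: deletes the individual-URL pattern
        "inferencer": "user",  # recorded on the URL only when type != 0
    },
)
```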


class CollectionViewSet(viewsets.ModelViewSet):
@@ -353,38 +299,3 @@ def post(self, request):
{"Success": "Started pushing collections to github"},
status=status.HTTP_200_OK,
)


class HealthCheckView(View):
"""
This view checks whether the rules in the indexer db have been correctly
reflected in our prod/test sinequa instances, and generates a report.
"""

def get(self, *args, **kwargs):
collection = Collection.objects.get(pk=kwargs.get("pk"))
sync_check_report = health_check(collection, server_name="production")
field_names = [
"id",
"collection_name",
"config_folder",
"curation_status",
"workflow_status",
"pattern_name",
"pattern",
"scraped_title",
"non_compliant_url",
]

# download the report in CSV format
csv_data = StringIO()
writer = csv.DictWriter(csv_data, fieldnames=field_names)
writer.writeheader()
for item in sync_check_report:
writer.writerow(item)

http_response = HttpResponse(content_type="text/csv")
http_response["Content-Disposition"] = 'attachment; filename="report.csv"'
http_response.write(csv_data.getvalue())

return http_response
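
For context, a minimal sketch of downloading the report the removed view produced, assuming the `api/health-check/<int:pk>` route deleted in the urls.py hunk above and a hypothetical collection with pk=1:

```python
# Sketch only: fetching the now-removed health-check CSV report.
# The path mirrors the deleted urls.py entry; pk=1 is hypothetical.
from django.test import Client

client = Client()
response = client.get("/api/health-check/1")
assert response["Content-Disposition"] == 'attachment; filename="report.csv"'
print(response.content.decode())  # one row per non-compliant pattern/url
```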