diff --git a/.gitignore b/.gitignore index f601cebf..74a95675 100644 --- a/.gitignore +++ b/.gitignore @@ -286,13 +286,11 @@ sde_indexing_helper/media/ **/.ipynb_checkpoints/ **/*.xlsx - -# config details for the api access +# Config details for the api access config_generation/config.py - -#model's inference files +# Model's inference files Document_Classifier_inference/model.pt -#Database Backup +# Database backup backup.json diff --git a/Document_Classifier_inference/README.md b/Document_Classifier_inference/README.md index 601e34f9..fefa838b 100644 --- a/Document_Classifier_inference/README.md +++ b/Document_Classifier_inference/README.md @@ -1,18 +1,18 @@ # Automated Document Tagging - # Project Description: -This purpose of this tag the content of a given url onto one of the five classes "Image","Documentation","Software and Tools", -"Mission and Instruments", and "Data". -#Datasets: -Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588 +This purpose of this is to tag the content of a given url onto one of the six classes "Image","Documentation","Software and Tools", +"Mission and Instruments", "Training and Education", and "Data". -# to run the repository: -* location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs -* saved weight_name: model.pt -* prediction sample:python3 main.py predicts --config_file config.json --url "url_link" +# Datasets: + +Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588 +# To run the inference pipeline: +- location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs +- saved weight_name: model.pt +- prediction sample: `python3 main.py predicts --config_file config.json --url "url_link"` For more details: contact rd0081@uah.edu diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 745fcc52..8d4805e1 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -40,8 +40,6 @@ class CandidateURLSerializer(serializers.ModelSerializer): generated_title_id = serializers.SerializerMethodField(read_only=True) match_pattern_type = serializers.SerializerMethodField(read_only=True) candidate_urls_count = serializers.SerializerMethodField(read_only=True) - inferenced_by = serializers.CharField(read_only=True) - is_pdf = serializers.BooleanField(required=False) def get_candidate_urls_count(self, obj): titlepattern = obj.titlepattern_urls.last() @@ -69,8 +67,6 @@ class Meta: "document_type", "document_type_display", "visited", - "inferenced_by", - "is_pdf", ) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 13c1fbea..831ad842 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -13,7 +13,6 @@ PushToGithubView, RequiredUrlsDeleteView, TitlePatternViewSet, - HealthCheckView, ) router = routers.DefaultRouter() @@ -33,11 +32,6 @@ PushToGithubView.as_view(), name="push-to-github", ), - path( - "api/health-check/", - view=HealthCheckView.as_view(), - name="health-check" - ), path( "delete-required-url/", view=RequiredUrlsDeleteView.as_view(), @@ -54,5 +48,4 @@ # Update an existing CandidateURL instance: /candidate-urls/{id}/ # Delete an existing CandidateURL instance: /candidate-urls/{id}/ path("api/", include(router.urls)), - path("api/model_inference", views.model_inference, name="model_inference"), ] diff --git a/sde_collections/views.py b/sde_collections/views.py index a1d69ac9..de68c7eb 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -1,15 +1,11 @@ -import csv import re -from io import StringIO from django.contrib.auth import get_user_model from django.contrib.auth.mixins import LoginRequiredMixin from django.db import models -from django.http import HttpResponse from django.shortcuts import redirect from django.urls import reverse from django.utils import timezone -from django.views import View from django.views.generic.detail import DetailView from django.views.generic.edit import DeleteView from django.views.generic.list import ListView @@ -17,15 +13,10 @@ from rest_framework.response import Response from rest_framework.views import APIView -from Document_Classifier_inference.main import batch_predicts - from .forms import CollectionGithubIssueForm, RequiredUrlForm from .models.candidate_url import CandidateURL from .models.collection import Collection, RequiredUrls -from .models.collection_choice_fields import ( - CurationStatusChoices, - WorkflowStatusChoices, -) +from .models.collection_choice_fields import CurationStatusChoices, WorkflowStatusChoices from .models.pattern import DocumentTypePattern, ExcludePattern, TitlePattern from .serializers import ( CandidateURLBulkCreateSerializer, @@ -36,46 +27,10 @@ TitlePatternSerializer, ) from .tasks import push_to_github_task -from .utils.health_check import health_check User = get_user_model() -def model_inference(request): - if request.method == "POST": - collection_id = request.POST.get("collection_id") - candidate_urls = CandidateURL.objects.filter( - collection_id=Collection.objects.get(pk=collection_id), - ).exclude(document_type__in=[1, 2, 3, 4, 5, 6]) - # These list of urls are to be inferred - to_infer_url_list = [candidate_url.url for candidate_url in candidate_urls] - if to_infer_url_list: - collection_id = candidate_urls[0].collection_id - prediction, pdf_lists = batch_predicts( - "Document_Classifier_inference/config.json", to_infer_url_list - ) - # Update document_type for corresponding URLs - for candidate_url in candidate_urls: - new_document_type = prediction.get(candidate_url.url) - if new_document_type is not None: - candidate_url.document_type = new_document_type - candidate_url.inferenced_by = "model" - candidate_url.save() # Updating the changes in candidateurl table - # Create a new DocumentTypePattern entry for each URL and its document_type - DocumentTypePattern.objects.create( - collection_id=candidate_url.collection_id, - match_pattern=candidate_url.url.replace("https://", ""), - match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, - document_type=new_document_type, - ) # Adding the new record in documenttypepattern table - if ( - candidate_url.url in pdf_lists - ): # flagging created for url with pdf response - candidate_url.is_pdf = True - candidate_url.save() - return HttpResponse(status=204) - - class CollectionListView(LoginRequiredMixin, ListView): """ Display a list of collections in the system @@ -139,8 +94,7 @@ def post(self, request, *args, **kwargs): else: if "claim_button" in request.POST: user = self.request.user - collection.curation_status = CurationStatusChoices.BEING_CURATED - collection.workflow_status = WorkflowStatusChoices.CURATION_IN_PROGRESS + collection.curation_status = WorkflowStatusChoices.CURATION_IN_PROGRESS collection.curated_by = user collection.curation_started = timezone.now() collection.save() @@ -310,28 +264,20 @@ def get_queryset(self): def create(self, request, *args, **kwargs): document_type = request.POST.get("document_type") - inferencer = request.POST.get("inferencer") - collection_id = request.POST.get("collection") - match_pattern = request.POST.get("match_pattern") - candidate_url = CandidateURL.objects.get( - collection_id=Collection.objects.get(id=collection_id), - url="https://" + match_pattern, - ) if not int(document_type) == 0: # 0=none - candidate_url.inferenced_by = inferencer - candidate_url.save() return super().create(request, *args, **kwargs) - try: - candidate_url.inferenced_by = "" - candidate_url.save() - DocumentTypePattern.objects.get( - collection_id=Collection.objects.get(id=collection_id), - match_pattern=match_pattern, - match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, - ).delete() - return Response(status=status.HTTP_204_NO_CONTENT) - except DocumentTypePattern.DoesNotExist: - return Response(status=status.HTTP_204_NO_CONTENT) + else: + collection_id = request.POST.get("collection") + match_pattern = request.POST.get("match_pattern") + try: + DocumentTypePattern.objects.get( + collection_id=Collection.objects.get(id=collection_id), + match_pattern=match_pattern, + match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, + ).delete() + return Response(status=status.HTTP_200_OK) + except DocumentTypePattern.DoesNotExist: + return Response(status=status.HTTP_204_NO_CONTENT) class CollectionViewSet(viewsets.ModelViewSet): @@ -353,38 +299,3 @@ def post(self, request): {"Success": "Started pushing collections to github"}, status=status.HTTP_200_OK, ) - - -class HealthCheckView(View): - """ - This view checks whether the rules in indexer db has been correctly reflected - in our prod/test sinequa instances or not and at the end generates a report. - """ - - def get(self, *args, **kwargs): - collection = Collection.objects.get(pk=kwargs.get("pk")) - sync_check_report = health_check(collection, server_name="production") - field_names = [ - "id", - "collection_name", - "config_folder", - "curation_status", - "workflow_status", - "pattern_name", - "pattern", - "scraped_title", - "non_compliant_url", - ] - - # download the report in CSV format - csv_data = StringIO() - writer = csv.DictWriter(csv_data, fieldnames=field_names) - writer.writeheader() - for item in sync_check_report: - writer.writerow(item) - - http_response = HttpResponse(content_type="text/csv") - http_response["Content-Disposition"] = 'attachment; filename="report.csv"' - http_response.write(csv_data.getvalue()) - - return http_response diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index 5295e22e..4f574069 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -4,8 +4,7 @@ var collection_id = getCollectionId(); var selected_text = ""; var INDIVIDUAL_URL = 1 var MULTI_URL_PATTERN = 2 -var candidate_urls_table; -var url_lists=null; + $(document).ready(function () { handleAjaxStartAndStop(); initializeDataTable(); @@ -20,7 +19,7 @@ function initializeDataTable() { var true_icon = 'check'; var false_icon = 'close'; - candidate_urls_table = $('#candidate_urls_table').DataTable({ + var candidate_urls_table = $('#candidate_urls_table').DataTable({ "scrollY": true, "serverSide": true, "stateSave": true, @@ -50,9 +49,6 @@ function initializeDataTable() { { "data": "generated_title_id", "visible": false, "searchable": false }, { "data": "match_pattern_type", "visible": false, "searchable": false }, { "data": "candidate_urls_count", "visible": false, "searchable": false }, - { "data": "inferenced_by", "visible": false, "searchable": false }, - { "data": "is_pdf", "visible": false, "searchable": false } - ], "createdRow": function (row, data, dataIndex) { if (data['excluded']) { @@ -60,7 +56,7 @@ function initializeDataTable() { } } }); -} + var exclude_patterns_table = $('#exclude_patterns_table').DataTable({ "scrollY": true, "serverSide": true, @@ -123,9 +119,9 @@ function initializeDataTable() { { "data": "id", "visible": false, "searchable": false }, ] }); +} function setupClickHandlers() { - handleInferenceButton(); handleAddNewPatternClick(); handleCreateDocumentTypePatternButton(); @@ -136,7 +132,7 @@ function setupClickHandlers() { handleDeleteExcludePatternButtonClick(); handleDeleteTitlePatternButtonClick(); - handleDocumentTypeSelect(); + handleDocumentTypeSelect() handleExcludeIndividualUrlClick(); handleNewTitleChange(); @@ -193,11 +189,10 @@ function getDocumentTypeColumn() { 3: 'Documentation', 4: 'Software and Tools', 5: 'Missions and Instruments', - 6: 'Training and Education' + 6: 'Training and Education', }; - var inferenceValue = row['inferenced_by']; button_text = data ? dict[data] : 'Select'; - button_color = inferenceValue === 'user' ? 'btn-success' : (inferenceValue === 'model' ? 'btn-primary' : 'btn-secondary'); + button_color = data ? 'btn-success' : 'btn-secondary'; return ` - -
@@ -41,20 +34,7 @@


- -
-

Document Type Inference

-
-
- User Inference:Press here for -
-
- Model's Inference:
@@ -285,7 +265,6 @@

SDE Collections

URL
- - - - @@ -32,7 +28,6 @@

SDE Collections

{% for collection in collections %} - @@ -47,33 +42,6 @@

SDE Collections

class="btn btn-sm {% if collection.num_candidate_urls > 0 %}btn-primary {% else %}disabled{% endif %}" role="button">{{ collection.num_candidate_urls }} - - -
Priority Name URL Division Candidate URLsNew?Status Workflow Status CuratorHas Config? Connector Type Config Folder
{{ collection.cleaning_order }} {{ collection.name }} chevron_right - {% if collection.new_collection %} - check - {% else %} - close - {% endif %} - - - {{ collection.has_sinequa_config }} {{ collection.get_connector_display }} {{ collection.config_folder }}