Revert changes from health check and document classifier #452

Merged · 10 commits · Oct 17, 2023
8 changes: 3 additions & 5 deletions .gitignore
@@ -286,13 +286,11 @@ sde_indexing_helper/media/
**/.ipynb_checkpoints/
**/*.xlsx


# config details for the api access
# Config details for the api access
config_generation/config.py


#model's inference files
# Model's inference files
Document_Classifier_inference/model.pt

#Database Backup
# Database backup
backup.json
18 changes: 9 additions & 9 deletions Document_Classifier_inference/README.md
@@ -1,18 +1,18 @@
# Automated Document Tagging


# Project Description:
This purpose of this tag the content of a given url onto one of the five classes "Image","Documentation","Software and Tools",
"Mission and Instruments", and "Data".

#Datasets:
Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588
The purpose of this is to tag the content of a given URL as one of the six classes "Image", "Documentation", "Software and Tools",
"Mission and Instruments", "Training and Education", and "Data".

# to run the repository:
* location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
* saved weight_name: model.pt
* prediction sample:python3 main.py predicts --config_file config.json --url "url_link"
# Datasets:

Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588

# To run the inference pipeline:

- location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
- saved weight_name: model.pt
- prediction sample: `python3 main.py predicts --config_file config.json --url "url_link"`

For more details, contact [email protected]
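
A minimal sketch of calling the classifier programmatically, mirroring the `batch_predicts` call that appears in the `sde_collections/views.py` hunk below; the example URL is hypothetical:

```python
# Sketch only: programmatic classification, assuming the batch_predicts
# signature used in sde_collections/views.py before this revert.
# It takes a config path and a list of URLs and returns a dict mapping
# each URL to a predicted document type, plus the URLs that resolved
# to PDF responses.
from Document_Classifier_inference.main import batch_predicts

urls = ["https://science.example.gov/dataset"]  # hypothetical URL
prediction, pdf_lists = batch_predicts(
    "Document_Classifier_inference/config.json", urls
)
for url, document_type in prediction.items():
    print(url, "->", document_type)
```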
4 changes: 0 additions & 4 deletions sde_collections/serializers.py
@@ -40,8 +40,6 @@ class CandidateURLSerializer(serializers.ModelSerializer):
generated_title_id = serializers.SerializerMethodField(read_only=True)
match_pattern_type = serializers.SerializerMethodField(read_only=True)
candidate_urls_count = serializers.SerializerMethodField(read_only=True)
inferenced_by = serializers.CharField(read_only=True)
is_pdf = serializers.BooleanField(required=False)

def get_candidate_urls_count(self, obj):
titlepattern = obj.titlepattern_urls.last()
@@ -69,8 +67,6 @@ class Meta:
"document_type",
"document_type_display",
"visited",
"inferenced_by",
"is_pdf",
)


7 changes: 0 additions & 7 deletions sde_collections/urls.py
@@ -13,7 +13,6 @@
PushToGithubView,
RequiredUrlsDeleteView,
TitlePatternViewSet,
HealthCheckView,
)

router = routers.DefaultRouter()
@@ -33,11 +32,6 @@
PushToGithubView.as_view(),
name="push-to-github",
),
path(
"api/health-check/<int:pk>",
view=HealthCheckView.as_view(),
name="health-check"
),
path(
"delete-required-url/<int:pk>",
view=RequiredUrlsDeleteView.as_view(),
@@ -54,5 +48,4 @@
# Update an existing CandidateURL instance: /candidate-urls/{id}/
# Delete an existing CandidateURL instance: /candidate-urls/{id}/
path("api/", include(router.urls)),
path("api/model_inference", views.model_inference, name="model_inference"),
]
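
For reference, a hedged sketch of hitting the router-generated CandidateURL endpoints documented in the comments above; the paths follow from `path("api/", include(router.urls))`, and the id and payload are hypothetical:

```python
# Sketch only: exercising the router-documented CandidateURL endpoints
# with Django's test client. The pk and field value are hypothetical.
from django.test import Client

client = Client()
client.get("/api/candidate-urls/")  # list all CandidateURL instances
client.patch(
    "/api/candidate-urls/1/",  # update an existing instance (pk=1)
    data='{"visited": true}',
    content_type="application/json",
)
```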
117 changes: 14 additions & 103 deletions sde_collections/views.py
@@ -1,31 +1,22 @@
import csv
import re
from io import StringIO

from django.contrib.auth import get_user_model
from django.contrib.auth.mixins import LoginRequiredMixin
from django.db import models
from django.http import HttpResponse
from django.shortcuts import redirect
from django.urls import reverse
from django.utils import timezone
from django.views import View
from django.views.generic.detail import DetailView
from django.views.generic.edit import DeleteView
from django.views.generic.list import ListView
from rest_framework import generics, status, viewsets
from rest_framework.response import Response
from rest_framework.views import APIView

from Document_Classifier_inference.main import batch_predicts

from .forms import CollectionGithubIssueForm, RequiredUrlForm
from .models.candidate_url import CandidateURL
from .models.collection import Collection, RequiredUrls
from .models.collection_choice_fields import (
CurationStatusChoices,
WorkflowStatusChoices,
)
from .models.collection_choice_fields import CurationStatusChoices, WorkflowStatusChoices
from .models.pattern import DocumentTypePattern, ExcludePattern, TitlePattern
from .serializers import (
CandidateURLBulkCreateSerializer,
@@ -36,46 +27,10 @@
TitlePatternSerializer,
)
from .tasks import push_to_github_task
from .utils.health_check import health_check

User = get_user_model()


def model_inference(request):
if request.method == "POST":
collection_id = request.POST.get("collection_id")
candidate_urls = CandidateURL.objects.filter(
collection_id=Collection.objects.get(pk=collection_id),
).exclude(document_type__in=[1, 2, 3, 4, 5, 6])
# This list of URLs is to be inferred
to_infer_url_list = [candidate_url.url for candidate_url in candidate_urls]
if to_infer_url_list:
collection_id = candidate_urls[0].collection_id
prediction, pdf_lists = batch_predicts(
"Document_Classifier_inference/config.json", to_infer_url_list
)
# Update document_type for corresponding URLs
for candidate_url in candidate_urls:
new_document_type = prediction.get(candidate_url.url)
if new_document_type is not None:
candidate_url.document_type = new_document_type
candidate_url.inferenced_by = "model"
candidate_url.save() # Updating the changes in candidateurl table
# Create a new DocumentTypePattern entry for each URL and its document_type
DocumentTypePattern.objects.create(
collection_id=candidate_url.collection_id,
match_pattern=candidate_url.url.replace("https://", ""),
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
document_type=new_document_type,
) # Adding the new record in documenttypepattern table
if (
candidate_url.url in pdf_lists
): # flag urls whose response was a pdf
candidate_url.is_pdf = True
candidate_url.save()
return HttpResponse(status=204)


class CollectionListView(LoginRequiredMixin, ListView):
"""
Display a list of collections in the system
@@ -139,8 +94,7 @@ def post(self, request, *args, **kwargs):
else:
if "claim_button" in request.POST:
user = self.request.user
collection.curation_status = CurationStatusChoices.BEING_CURATED
collection.workflow_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
collection.curation_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
collection.curated_by = user
collection.curation_started = timezone.now()
collection.save()
@@ -310,28 +264,20 @@ def get_queryset(self):

def create(self, request, *args, **kwargs):
document_type = request.POST.get("document_type")
inferencer = request.POST.get("inferencer")
collection_id = request.POST.get("collection")
match_pattern = request.POST.get("match_pattern")
candidate_url = CandidateURL.objects.get(
collection_id=Collection.objects.get(id=collection_id),
url="https://" + match_pattern,
)
if not int(document_type) == 0: # 0=none
candidate_url.inferenced_by = inferencer
candidate_url.save()
return super().create(request, *args, **kwargs)
try:
candidate_url.inferenced_by = ""
candidate_url.save()
DocumentTypePattern.objects.get(
collection_id=Collection.objects.get(id=collection_id),
match_pattern=match_pattern,
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
).delete()
return Response(status=status.HTTP_204_NO_CONTENT)
except DocumentTypePattern.DoesNotExist:
return Response(status=status.HTTP_204_NO_CONTENT)
else:
collection_id = request.POST.get("collection")
match_pattern = request.POST.get("match_pattern")
try:
DocumentTypePattern.objects.get(
collection_id=Collection.objects.get(id=collection_id),
match_pattern=match_pattern,
match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
).delete()
return Response(status=status.HTTP_200_OK)
except DocumentTypePattern.DoesNotExist:
return Response(status=status.HTTP_204_NO_CONTENT)
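
For context, a hedged sketch of the POST payload the removed branch consumed; the field names come from the `request.POST.get(...)` calls above, while the route and ids are hypothetical:

```python
# Sketch only: the payload shape the removed create() logic read.
# Route and ids are hypothetical; field names are as read in the view.
from django.test import Client

client = Client()
client.post(
    "/api/document-type-patterns/",  # hypothetical router path
    data={
        "collection": 1,  # Collection pk
        "match_pattern": "example.gov/page",  # stored without "https://"
        "document_type": 0,  # 0 = none: deletes the individual-URL pattern
        "inferencer": "user",  # recorded on the URL only when type != 0
    },
)
```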


class CollectionViewSet(viewsets.ModelViewSet):
@@ -353,38 +299,3 @@ def post(self, request):
{"Success": "Started pushing collections to github"},
status=status.HTTP_200_OK,
)


class HealthCheckView(View):
"""
This view checks whether the rules in the indexer db have been correctly
reflected in our prod/test sinequa instances, and generates a report.
"""

def get(self, *args, **kwargs):
collection = Collection.objects.get(pk=kwargs.get("pk"))
sync_check_report = health_check(collection, server_name="production")
field_names = [
"id",
"collection_name",
"config_folder",
"curation_status",
"workflow_status",
"pattern_name",
"pattern",
"scraped_title",
"non_compliant_url",
]

# download the report in CSV format
csv_data = StringIO()
writer = csv.DictWriter(csv_data, fieldnames=field_names)
writer.writeheader()
for item in sync_check_report:
writer.writerow(item)

http_response = HttpResponse(content_type="text/csv")
http_response["Content-Disposition"] = 'attachment; filename="report.csv"'
http_response.write(csv_data.getvalue())

return http_response
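
For context, a minimal sketch of downloading the report the removed view produced, assuming the `api/health-check/<int:pk>` route deleted in the urls.py hunk above and a hypothetical collection with pk=1:

```python
# Sketch only: fetching the now-removed health-check CSV report.
# The path mirrors the deleted urls.py entry; pk=1 is hypothetical.
from django.test import Client

client = Client()
response = client.get("/api/health-check/1")
assert response["Content-Disposition"] == 'attachment; filename="report.csv"'
print(response.content.decode())  # one row per non-compliant pattern/url
```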