From 5780be25ca291e1b1b79a3e5f6e3be1d84090a4f Mon Sep 17 00:00:00 2001 From: Lorenzo Vagliano Date: Wed, 13 Nov 2024 13:46:23 +0100 Subject: [PATCH] feat(export): Added year export for extraction Signed-off-by: Lorenzo Vagliano --- scoap3/management/commands/year_export.py | 40 ++++ scoap3/misc/api/serializers.py | 8 + scoap3/tasks.py | 6 + scoap3/utils/tests/test_year_export.py | 238 ++++++++++++++++++++++ scoap3/utils/tools.py | 78 +++++++ 5 files changed, 370 insertions(+) create mode 100644 scoap3/management/commands/year_export.py create mode 100644 scoap3/utils/tests/test_year_export.py diff --git a/scoap3/management/commands/year_export.py b/scoap3/management/commands/year_export.py new file mode 100644 index 000000000..e3d329320 --- /dev/null +++ b/scoap3/management/commands/year_export.py @@ -0,0 +1,40 @@ +import csv +import datetime +import logging + +from django.core.files.storage import storages +from django.core.management.base import BaseCommand, CommandParser + +from scoap3.utils.tools import year_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export article information by year" + + def add_arguments(self, parser: CommandParser) -> None: + parser.add_argument( + "--start", + type=str, + required=False, + help="Start date.", + ) + + parser.add_argument( + "--end", + type=str, + required=False, + help="End date.", + ) + + def handle(self, *args, **options): + storage = storages["default"] + result = year_export(options["start"], options["end"]) + + with storage.open( + f"scoap3_export_years_{datetime.datetime.now()}.csv", "w" + ) as f: + writer = csv.writer(f) + writer.writerow(result["header"]) + writer.writerows(result["data"]) diff --git a/scoap3/misc/api/serializers.py b/scoap3/misc/api/serializers.py index b9ef54580..8b1dc2adb 100644 --- a/scoap3/misc/api/serializers.py +++ b/scoap3/misc/api/serializers.py @@ -43,6 +43,14 @@ def get_ror(self, obj): else: return None + def to_representation(self, instance): + representation = super().to_representation(instance) + + if representation.get("ror") is None: + representation.pop("ror", None) + + return representation + class InstitutionIdentifierSerializer(serializers.ModelSerializer): class Meta: diff --git a/scoap3/tasks.py b/scoap3/tasks.py index 30e44181c..3ba1a5353 100644 --- a/scoap3/tasks.py +++ b/scoap3/tasks.py @@ -24,6 +24,7 @@ PublicationInfo, Publisher, ) +from scoap3.utils.tools import year_export logger = logging.getLogger(__name__) cc = coco.CountryConverter() @@ -434,3 +435,8 @@ def link_affiliations(folder_name, index_range): with storage.open(os.path.join(folder_name, filename)) as file: json_data = json.load(file) update_affiliations(json_data) + + +@celery_app.task(acks_late=True) +def year_data_export(start_date, end_date): + year_export(start_date, end_date) diff --git a/scoap3/utils/tests/test_year_export.py b/scoap3/utils/tests/test_year_export.py new file mode 100644 index 000000000..d6891d7b0 --- /dev/null +++ b/scoap3/utils/tests/test_year_export.py @@ -0,0 +1,238 @@ +import pytest +from django.test import TestCase + +from scoap3.articles.models import Article, ArticleIdentifier +from scoap3.authors.models import Author, AuthorIdentifier +from scoap3.misc.models import ( + Affiliation, + Country, + InstitutionIdentifier, + PublicationInfo, + Publisher, + RelatedMaterial, +) +from scoap3.utils.tools import year_export + + +@pytest.mark.django_db +@pytest.mark.vcr +class TestYearExport(TestCase): + def setUp(self): + self.publisher_1 = 
Publisher.objects.create(name="Elsevier") + self.publisher_2 = Publisher.objects.create(name="Springer") + + self.country_gb = Country.objects.create(code="GB", name="United Kingdom") + self.country_fr = Country.objects.create(code="FR", name="France") + self.country_jp = Country.objects.create(code="JP", name="Japan") + self.country_be = Country.objects.create(code="BE", name="Belgium") + self.country_br = Country.objects.create(code="BR", name="Brazil") + self.country_it = Country.objects.create(code="IT", name="Italy") + self.country_es = Country.objects.create(code="ES", name="Spain") + + self.publisher_1.save() + self.publisher_2.save() + + self.country_gb.save() + self.country_fr.save() + self.country_jp.save() + self.country_be.save() + self.country_br.save() + self.country_it.save() + self.country_es.save() + + def create_article( + self, + title, + subtitle, + abstract, + publication_date, + doi_value, + publisher, + journal_title, + author_data, + country, + affiliation_value, + ): + related_material_software_type = RelatedMaterial.objects.create( + title="Test Software material", + doi="TestMatSoftDOI", + related_material_type="software", + ) + + related_material_dataset_type = RelatedMaterial.objects.create( + title="Test Dataset material", + doi="TestMatDataDOI", + related_material_type="dataset", + ) + + article = Article.objects.create( + title=title, + subtitle=subtitle, + abstract=abstract, + publication_date=publication_date, + ) + article.related_materials.add(related_material_software_type) + article.related_materials.add(related_material_dataset_type) + + doi = ArticleIdentifier.objects.create( + article_id=article, + identifier_type="DOI", + identifier_value=doi_value, + ) + + publication_info = PublicationInfo.objects.create( + journal_title=journal_title, + article_id=article, + publisher=publisher, + ) + + author = Author.objects.create(article_id=article, **author_data) + + orcid = AuthorIdentifier.objects.create( + author_id=author, identifier_type="ORCID", identifier_value="1000-1000-1000" + ) + + affiliation = Affiliation.objects.create( + country=country, + value=affiliation_value, + organization="Example Organization", + ) + affiliation.author_id.add(author) + + ror = InstitutionIdentifier.objects.create( + affiliation_id=affiliation, + identifier_type="ROR", + identifier_value="123", + ) + + doi.save() + publication_info.save() + author.save() + affiliation.save() + ror.save() + related_material_software_type.save() + related_material_dataset_type.save() + orcid.save() + article.save() + + return article + + def test_year_export_multiple(self): + self.create_article( + title="Test Article", + subtitle="Test Subtitle", + abstract="Test Abstract", + publication_date="2024-01-01", + doi_value="TestDOI6", + publisher=self.publisher_1, + journal_title="Adv. High Energy Phys.", + author_data={ + "last_name": "ExampleSurname", + "first_name": "ExampleName", + "email": "ExampleName.ExampleSurname@gmail.com", + "author_order": 100, + }, + country=self.country_it, + affiliation_value="Example", + ) + + self.create_article( + title="Test Article 2", + subtitle="Test Subtitle 2", + abstract="Test Abstract 2", + publication_date="2024-02-02", + doi_value="TestDOI7", + publisher=self.publisher_2, + journal_title="Adv. 
High Energy Phys.", + author_data={ + "last_name": "ExampleSurname2", + "first_name": "ExampleName2", + "email": "ExampleName2.ExampleSurname2@gmail.com", + "author_order": 100, + }, + country=self.country_it, + affiliation_value="Example2", + ) + + result = year_export("2024-01-01", "2024-05-05") + expected_result = { + "header": [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ], + "data": [ + [ + 2024, + "Adv. High Energy Phys.", + "TestDOI6", + None, + None, + 1, + 1, + 1, + 1, + 1, + 1, + ], + [ + 2024, + "Adv. High Energy Phys.", + "TestDOI7", + None, + None, + 1, + 1, + 1, + 1, + 1, + 1, + ], + ], + } + + result["data"].sort(key=lambda x: x[2]) + expected_result["data"].sort(key=lambda x: x[2]) + + assert result == expected_result + + def test_year_export_no_data(self): + result = year_export("2024-01-01", "2024-05-05") + expected_result = { + "header": [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ], + "data": [], + } + + assert result == expected_result + + def tearDown(self): + Publisher.objects.all().delete() + ArticleIdentifier.objects.all().delete() + Article.objects.all().delete() + PublicationInfo.objects.all().delete() + Author.objects.all().delete() + Affiliation.objects.all().delete() + Country.objects.all().delete() + AuthorIdentifier.objects.all().delete() + RelatedMaterial.objects.all().delete() + InstitutionIdentifier.objects.all().delete() diff --git a/scoap3/utils/tools.py b/scoap3/utils/tools.py index 91d055a11..348948f3d 100644 --- a/scoap3/utils/tools.py +++ b/scoap3/utils/tools.py @@ -1,5 +1,6 @@ import logging from collections import Counter +from datetime import datetime from django.db import connection from django.db.models import Max @@ -162,6 +163,83 @@ def author_export(search_year, search_country): return {"header": result_headers, "data": result_data} +def year_export(start_date=None, end_date=None): + result_headers = [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ] + result_data = [] + + search = ArticleDocument.search() + + if start_date or end_date: + date_range = {} + if start_date: + date_range["gte"] = datetime.strptime(start_date, "%Y-%m-%d") + if end_date: + date_range["lte"] = datetime.strptime(end_date, "%Y-%m-%d") + + search = search.filter("range", publication_date=date_range) + + for article in search.scan(): + year = article.publication_date.year + journal = article.publication_info[0].journal_title + doi = get_first_doi(article) + arxiv = get_first_arxiv(article) + arxiv_category = get_arxiv_primary_category(article) + + article_data = article.to_dict() + authors = article_data.get("authors", []) + total_authors = len(authors) + + 
total_orcid = sum(1 for author in authors if author.get("orcid")) + + total_affiliations = 0 + total_ror = 0 + for author in authors: + affiliations = author.get("affiliations", []) + total_affiliations += len(affiliations) + + for affiliation in affiliations: + if affiliation.get("ror"): + total_ror += 1 + + total_related_materials_dataset = 0 + total_related_materials_software = 0 + for related_material in article.related_materials: + if related_material.related_material_type == "dataset": + total_related_materials_dataset += 1 + elif related_material.related_material_type == "software": + total_related_materials_software += 1 + + result_data.append( + [ + year, + journal, + doi, + arxiv, + arxiv_category, + total_authors, + total_orcid, + total_affiliations, + total_ror, + total_related_materials_dataset, + total_related_materials_software, + ] + ) + + return {"header": result_headers, "data": result_data} + + def update_article_db_model_sequence(new_start_sequence): max_id = Article.objects.aggregate(max_id=Max("id"))["max_id"] or 0 if new_start_sequence <= max_id:
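
A minimal usage sketch for the export added in this patch, assuming a configured Django/Celery environment for the scoap3 project; the dates below are placeholders, in the YYYY-MM-DD format that year_export() parses with datetime.strptime:

    # Run the export synchronously via the new management command; the CSV is
    # written to the project's default storage backend:
    #
    #     python manage.py year_export --start 2024-01-01 --end 2024-12-31
    #
    # Or queue the same computation on a Celery worker with the task added in
    # scoap3/tasks.py (as added here, the task computes the export without
    # writing a CSV):
    from scoap3.tasks import year_data_export

    year_data_export.delay("2024-01-01", "2024-12-31")

    # Or call the utility directly, e.g. from `python manage.py shell`, to
    # inspect the header row and per-article rows the CSV would contain:
    from scoap3.utils.tools import year_export

    result = year_export("2024-01-01", "2024-12-31")
    print(result["header"])
    for row in result["data"]:
        print(row)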