From 5780be25ca291e1b1b79a3e5f6e3be1d84090a4f Mon Sep 17 00:00:00 2001 From: Lorenzo Vagliano Date: Wed, 13 Nov 2024 13:46:23 +0100 Subject: [PATCH] feat(export): Added year export for extraction Signed-off-by: Lorenzo Vagliano --- scoap3/management/commands/year_export.py | 40 ++++ scoap3/misc/api/serializers.py | 8 + scoap3/tasks.py | 6 + scoap3/utils/tests/test_year_export.py | 238 ++++++++++++++++++++++ scoap3/utils/tools.py | 78 +++++++ 5 files changed, 370 insertions(+) create mode 100644 scoap3/management/commands/year_export.py create mode 100644 scoap3/utils/tests/test_year_export.py diff --git a/scoap3/management/commands/year_export.py b/scoap3/management/commands/year_export.py new file mode 100644 index 000000000..e3d329320 --- /dev/null +++ b/scoap3/management/commands/year_export.py @@ -0,0 +1,40 @@ +import csv +import datetime +import logging + +from django.core.files.storage import storages +from django.core.management.base import BaseCommand, CommandParser + +from scoap3.utils.tools import year_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export article information by year" + + def add_arguments(self, parser: CommandParser) -> None: + parser.add_argument( + "--start", + type=str, + required=False, + help="Start date.", + ) + + parser.add_argument( + "--end", + type=str, + required=False, + help="End date.", + ) + + def handle(self, *args, **options): + storage = storages["default"] + result = year_export(options["start"], options["end"]) + + with storage.open( + f"scoap3_export_years_{datetime.datetime.now()}.csv", "w" + ) as f: + writer = csv.writer(f) + writer.writerow(result["header"]) + writer.writerows(result["data"]) diff --git a/scoap3/misc/api/serializers.py b/scoap3/misc/api/serializers.py index b9ef54580..8b1dc2adb 100644 --- a/scoap3/misc/api/serializers.py +++ b/scoap3/misc/api/serializers.py @@ -43,6 +43,14 @@ def get_ror(self, obj): else: return None + def to_representation(self, instance): + representation = super().to_representation(instance) + + if representation.get("ror") is None: + representation.pop("ror", None) + + return representation + class InstitutionIdentifierSerializer(serializers.ModelSerializer): class Meta: diff --git a/scoap3/tasks.py b/scoap3/tasks.py index 30e44181c..3ba1a5353 100644 --- a/scoap3/tasks.py +++ b/scoap3/tasks.py @@ -24,6 +24,7 @@ PublicationInfo, Publisher, ) +from scoap3.utils.tools import year_export logger = logging.getLogger(__name__) cc = coco.CountryConverter() @@ -434,3 +435,8 @@ def link_affiliations(folder_name, index_range): with storage.open(os.path.join(folder_name, filename)) as file: json_data = json.load(file) update_affiliations(json_data) + + +@celery_app.task(acks_late=True) +def year_data_export(start_date, end_date): + year_export(start_date, end_date) diff --git a/scoap3/utils/tests/test_year_export.py b/scoap3/utils/tests/test_year_export.py new file mode 100644 index 000000000..d6891d7b0 --- /dev/null +++ b/scoap3/utils/tests/test_year_export.py @@ -0,0 +1,238 @@ +import pytest +from django.test import TestCase + +from scoap3.articles.models import Article, ArticleIdentifier +from scoap3.authors.models import Author, AuthorIdentifier +from scoap3.misc.models import ( + Affiliation, + Country, + InstitutionIdentifier, + PublicationInfo, + Publisher, + RelatedMaterial, +) +from scoap3.utils.tools import year_export + + +@pytest.mark.django_db +@pytest.mark.vcr +class TestYearExport(TestCase): + def setUp(self): + self.publisher_1 = 
Publisher.objects.create(name="Elsevier") + self.publisher_2 = Publisher.objects.create(name="Springer") + + self.country_gb = Country.objects.create(code="GB", name="United Kingdom") + self.country_fr = Country.objects.create(code="FR", name="France") + self.country_jp = Country.objects.create(code="JP", name="Japan") + self.country_be = Country.objects.create(code="BE", name="Belgium") + self.country_br = Country.objects.create(code="BR", name="Brazil") + self.country_it = Country.objects.create(code="IT", name="Italy") + self.country_es = Country.objects.create(code="ES", name="Spain") + + self.publisher_1.save() + self.publisher_2.save() + + self.country_gb.save() + self.country_fr.save() + self.country_jp.save() + self.country_be.save() + self.country_br.save() + self.country_it.save() + self.country_es.save() + + def create_article( + self, + title, + subtitle, + abstract, + publication_date, + doi_value, + publisher, + journal_title, + author_data, + country, + affiliation_value, + ): + related_material_software_type = RelatedMaterial.objects.create( + title="Test Software material", + doi="TestMatSoftDOI", + related_material_type="software", + ) + + related_material_dataset_type = RelatedMaterial.objects.create( + title="Test Dataset material", + doi="TestMatDataDOI", + related_material_type="dataset", + ) + + article = Article.objects.create( + title=title, + subtitle=subtitle, + abstract=abstract, + publication_date=publication_date, + ) + article.related_materials.add(related_material_software_type) + article.related_materials.add(related_material_dataset_type) + + doi = ArticleIdentifier.objects.create( + article_id=article, + identifier_type="DOI", + identifier_value=doi_value, + ) + + publication_info = PublicationInfo.objects.create( + journal_title=journal_title, + article_id=article, + publisher=publisher, + ) + + author = Author.objects.create(article_id=article, **author_data) + + orcid = AuthorIdentifier.objects.create( + author_id=author, identifier_type="ORCID", identifier_value="1000-1000-1000" + ) + + affiliation = Affiliation.objects.create( + country=country, + value=affiliation_value, + organization="Example Organization", + ) + affiliation.author_id.add(author) + + ror = InstitutionIdentifier.objects.create( + affiliation_id=affiliation, + identifier_type="ROR", + identifier_value="123", + ) + + doi.save() + publication_info.save() + author.save() + affiliation.save() + ror.save() + related_material_software_type.save() + related_material_dataset_type.save() + orcid.save() + article.save() + + return article + + def test_year_export_multiple(self): + self.create_article( + title="Test Article", + subtitle="Test Subtitle", + abstract="Test Abstract", + publication_date="2024-01-01", + doi_value="TestDOI6", + publisher=self.publisher_1, + journal_title="Adv. High Energy Phys.", + author_data={ + "last_name": "ExampleSurname", + "first_name": "ExampleName", + "email": "ExampleName.ExampleSurname@gmail.com", + "author_order": 100, + }, + country=self.country_it, + affiliation_value="Example", + ) + + self.create_article( + title="Test Article 2", + subtitle="Test Subtitle 2", + abstract="Test Abstract 2", + publication_date="2024-02-02", + doi_value="TestDOI7", + publisher=self.publisher_2, + journal_title="Adv. 
High Energy Phys.", + author_data={ + "last_name": "ExampleSurname2", + "first_name": "ExampleName2", + "email": "ExampleName2.ExampleSurname2@gmail.com", + "author_order": 100, + }, + country=self.country_it, + affiliation_value="Example2", + ) + + result = year_export("2024-01-01", "2024-05-05") + expected_result = { + "header": [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ], + "data": [ + [ + 2024, + "Adv. High Energy Phys.", + "TestDOI6", + None, + None, + 1, + 1, + 1, + 1, + 1, + 1, + ], + [ + 2024, + "Adv. High Energy Phys.", + "TestDOI7", + None, + None, + 1, + 1, + 1, + 1, + 1, + 1, + ], + ], + } + + result["data"].sort(key=lambda x: x[2]) + expected_result["data"].sort(key=lambda x: x[2]) + + assert result == expected_result + + def test_year_export_no_data(self): + result = year_export("2024-01-01", "2024-05-05") + expected_result = { + "header": [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ], + "data": [], + } + + assert result == expected_result + + def tearDown(self): + Publisher.objects.all().delete() + ArticleIdentifier.objects.all().delete() + Article.objects.all().delete() + PublicationInfo.objects.all().delete() + Author.objects.all().delete() + Affiliation.objects.all().delete() + Country.objects.all().delete() + AuthorIdentifier.objects.all().delete() + RelatedMaterial.objects.all().delete() + InstitutionIdentifier.objects.all().delete() diff --git a/scoap3/utils/tools.py b/scoap3/utils/tools.py index 91d055a11..348948f3d 100644 --- a/scoap3/utils/tools.py +++ b/scoap3/utils/tools.py @@ -1,5 +1,6 @@ import logging from collections import Counter +from datetime import datetime from django.db import connection from django.db.models import Max @@ -162,6 +163,83 @@ def author_export(search_year, search_country): return {"header": result_headers, "data": result_data} +def year_export(start_date=None, end_date=None): + result_headers = [ + "year", + "journal", + "doi", + "arxiv number", + "primary arxiv category", + "total number of authors", + "total number of ORCIDs linked to the authors", + "total number of affiliations", + "total number of ROR linked with the affiliations", + "total number of related materials, type dataset", + "total number of related materials, type software", + ] + result_data = [] + + search = ArticleDocument.search() + + if start_date or end_date: + date_range = {} + if start_date: + date_range["gte"] = datetime.strptime(start_date, "%Y-%m-%d") + if end_date: + date_range["lte"] = datetime.strptime(end_date, "%Y-%m-%d") + + search = search.filter("range", publication_date=date_range) + + for article in search.scan(): + year = article.publication_date.year + journal = article.publication_info[0].journal_title + doi = get_first_doi(article) + arxiv = get_first_arxiv(article) + arxiv_category = get_arxiv_primary_category(article) + + article_data = article.to_dict() + authors = article_data.get("authors", []) + total_authors = len(authors) + + 
total_orcid = sum(1 for author in authors if author.get("orcid")) + + total_affiliations = 0 + total_ror = 0 + for author in authors: + affiliations = author.get("affiliations", []) + total_affiliations += len(affiliations) + + for affiliation in affiliations: + if affiliation.get("ror"): + total_ror += 1 + + total_related_materials_dataset = 0 + total_related_materials_software = 0 + for related_material in article.related_materials: + if related_material.related_material_type == "dataset": + total_related_materials_dataset += 1 + elif related_material.related_material_type == "software": + total_related_materials_software += 1 + + result_data.append( + [ + year, + journal, + doi, + arxiv, + arxiv_category, + total_authors, + total_orcid, + total_affiliations, + total_ror, + total_related_materials_dataset, + total_related_materials_software, + ] + ) + + return {"header": result_headers, "data": result_data} + + def update_article_db_model_sequence(new_start_sequence): max_id = Article.objects.aggregate(max_id=Max("id"))["max_id"] or 0 if new_start_sequence <= max_id:
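
A minimal usage sketch for the export added in this patch, assuming a configured Django/Celery environment for the scoap3 project; the dates below are placeholders, in the YYYY-MM-DD format that year_export() parses with datetime.strptime:

    # Run the export synchronously via the new management command; the CSV is
    # written to the project's default storage backend:
    #
    #     python manage.py year_export --start 2024-01-01 --end 2024-12-31
    #
    # Or queue the same computation on a Celery worker with the task added in
    # scoap3/tasks.py (as added here, the task computes the export without
    # writing a CSV):
    from scoap3.tasks import year_data_export

    year_data_export.delay("2024-01-01", "2024-12-31")

    # Or call the utility directly, e.g. from `python manage.py shell`, to
    # inspect the header row and per-article rows the CSV would contain:
    from scoap3.utils.tools import year_export

    result = year_export("2024-01-01", "2024-12-31")
    print(result["header"])
    for row in result["data"]:
        print(row)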