Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

curation: add request checks; award acronym in community check #1104

Merged
merged 4 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion site/zenodo_rdm/curation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@
award_acronym_in_additional_description,
award_acronym_in_description,
award_acronym_in_title,
award_number_in_additional_description,
award_number_in_description,
community_data_award_acronym,
contains_high_conf_keywords,
contains_low_conf_keywords,
eu_community_request,
eu_subcommunity_declined_request,
published_before_award_start,
test_phrases_in_record,
user_verified,
Expand All @@ -31,6 +36,11 @@
"additional_desc_contains_low_conf_keywords": additional_desc_contains_low_conf_keywords,
"additional_desc_contains_high_conf_keywords": additional_desc_contains_high_conf_keywords,
"award_acronym_in_additional_description": award_acronym_in_additional_description,
"eu_community_request": eu_community_request,
"eu_subcommunity_declined_request": eu_subcommunity_declined_request,
"community_data_award_acronym": community_data_award_acronym,
"award_number_in_additional_description": award_number_in_additional_description,
"award_number_in_description": award_number_in_description,
}
"""Rules to run for EU Curation."""

Expand All @@ -45,8 +55,13 @@
"additional_desc_contains_low_conf_keywords": 0,
"additional_desc_contains_high_conf_keywords": 0,
"award_acronym_in_additional_description": 0,
"eu_community_request": False,
"eu_subcommunity_declined_request": False,
"community_data_award_acronym": 0,
"award_number_in_additional_description": 0,
"award_number_in_description": 0,
}
"""Rule scores for EU Curation."""
"""Rule scores for EU Curation (bool value implies direct approval/decline)."""


CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 100}
Expand Down
192 changes: 150 additions & 42 deletions site/zenodo_rdm/curation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,69 @@

import arrow
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_communities.proxies import current_communities
from invenio_rdm_records.requests import CommunityInclusion, CommunitySubmission
from invenio_records_resources.proxies import current_service_registry
from invenio_requests.proxies import current_requests_service
from invenio_search.engine import dsl


# NOTE(review): this looks like the pre-refactor opening of the function, left
# over from the removed side of the diff. A complete definition with the same
# name appears further down in this file and would shadow this one -- confirm
# before relying on this fragment.
def award_acronym_in_description(record):
"""Check if EU award name in record description."""
award_service = current_service_registry.get("awards")
description = record.metadata.get("description")
if not description:
return False
def _award_acronym_in_text(award, text):
"""Check for award acronym in data."""
if award.get("acronym") and (award.get("acronym").lower() in text.lower()):
return True
return False


def _award_number_in_text(award, text):
"""Check for award number in data."""
if award.get("number") and (str(award.get("number")) in text):
return True
return False


def _get_ec_awards(record):
    """Get all EC funded awards of record.

    Walks ``record.metadata["funding"]`` and, for every entry whose funder id
    is ``00k4n6c32`` and which links an award by id, resolves that award via
    the record class of the ``awards`` service.

    :param record: object exposing a ``metadata`` mapping.
    :returns: list of resolved awards; empty when no funding entry qualifies.
    """
    award_service = current_service_registry.get("awards")
    awards = []
    for funding in record.metadata.get("funding", []):
        # Tolerate entries without a funder instead of raising ``KeyError``.
        if funding.get("funder", {}).get("id") != "00k4n6c32":
            continue
        # Entries without an award id have nothing to resolve.
        if award_id := funding.get("award", {}).get("id"):
            awards.append(award_service.record_cls.pid.resolve(award_id))
    return awards


def award_acronym_in_description(record):
    """Check if EU award name in record description.

    :param record: object exposing a ``metadata`` mapping.
    :returns: ``True`` when the description contains the acronym of at least
        one EC award of the record; ``False`` otherwise, including when the
        record has no description.
    """
    description = record.metadata.get("description")
    if not description:
        # No text to search, so the awards are not looked up at all.
        return False
    return any(
        _award_acronym_in_text(award, description)
        for award in _get_ec_awards(record)
    )


def award_number_in_description(record):
    """Check if EU award number in record description.

    :param record: object exposing a ``metadata`` mapping.
    :returns: ``True`` when the description contains the number of at least
        one EC award of the record; ``False`` otherwise, including when the
        record has no description.
    """
    description = record.metadata.get("description")
    if not description:
        # No text to search, so the awards are not looked up at all.
        return False
    return any(
        _award_number_in_text(award, description)
        for award in _get_ec_awards(record)
    )


def award_acronym_in_title(record):
    """Check if EU award name in record title.

    Award lookup is delegated to ``_get_ec_awards`` so that this rule resolves
    the record's EC awards once, the same way the sibling rules do.

    :param record: object exposing a ``metadata`` mapping with a ``title``.
    :returns: ``True`` when the title contains the acronym of at least one EC
        award of the record, ``False`` otherwise.
    :raises KeyError: if the record metadata has no ``title``.
    """
    title = record.metadata["title"]
    return any(
        _award_acronym_in_text(award, title) for award in _get_ec_awards(record)
    )


Expand All @@ -62,18 +89,13 @@ def test_phrases_in_record(record):

def published_before_award_start(record):
    """Check if published before award start date.

    Award lookup is delegated to ``_get_ec_awards`` so that the record's EC
    awards are resolved once, the same way the sibling rules do.

    :param record: object exposing ``metadata`` and a ``created`` datetime.
    :returns: ``True`` when ``record.created`` is earlier than the
        ``start_date`` of at least one EC award of the record; ``False``
        otherwise (awards without a ``start_date`` are ignored).
    """
    for award in _get_ec_awards(record):
        start_date = award.get("start_date")
        if not start_date:
            continue
        # NOTE(review): ``datetime.timestamp()`` reads a naive datetime as
        # local time, while arrow parses the start date as UTC -- confirm that
        # ``record.created`` is timezone-aware or that the process runs in UTC.
        created_ts = record.created.timestamp()
        if created_ts < arrow.get(start_date).datetime.timestamp():
            return True
    return False


Expand Down Expand Up @@ -143,17 +165,103 @@ def additional_desc_contains_low_conf_keywords(record):

def award_acronym_in_additional_description(record):
    """Check if EU award name in record additional description.

    All ``description`` values found under
    ``record.metadata["additional_descriptions"]`` are joined with spaces and
    searched as one text. Award lookup is delegated to ``_get_ec_awards`` so
    that the record's EC awards are resolved once, as in the sibling rules.

    :param record: object exposing a ``metadata`` mapping.
    :returns: ``True`` when the joined text contains the acronym of at least
        one EC award of the record, ``False`` otherwise.
    """
    additional_descriptions = record.metadata.get("additional_descriptions", [])
    record_data = " ".join(x.get("description", "") for x in additional_descriptions)

    return any(
        _award_acronym_in_text(award, record_data)
        for award in _get_ec_awards(record)
    )


def award_number_in_additional_description(record):
    """Check if EU award number in record additional description.

    :param record: object exposing a ``metadata`` mapping.
    :returns: ``True`` when the space-joined additional descriptions contain
        the number of at least one EC award of the record, ``False`` otherwise.
    """
    texts = [
        entry.get("description", "")
        for entry in record.metadata.get("additional_descriptions", [])
    ]
    combined = " ".join(texts)

    return any(
        _award_number_in_text(award, combined) for award in _get_ec_awards(record)
    )


def eu_community_request(record):
    """Check if record has a declined or still-open EU community request.

    Searches, as the system identity, for community inclusion/submission
    requests whose topic is this record and whose receiver is the community
    configured under ``EU_COMMUNITY_UUID``.

    :param record: record exposing ``pid.pid_value``.
    :returns: ``True`` when any matching request is closed with status
        ``declined``, or is open and not expired; ``False`` otherwise.
    """
    community_requests = dsl.Q(
        "bool",
        must=[
            dsl.Q(
                "term",
                **{"receiver.community": current_app.config.get("EU_COMMUNITY_UUID")},
            ),
            dsl.Q("term", **{"topic.record": record.pid.pid_value}),
        ],
    )
    request_types = dsl.Q(
        "bool",
        should=[
            dsl.Q("term", **{"type": CommunityInclusion.type_id}),
            dsl.Q("term", **{"type": CommunitySubmission.type_id}),
        ],
        minimum_should_match=1,
    )
    finalq = community_requests & request_types
    results = current_requests_service.search(system_identity, extra_filter=finalq)

    for result in results:
        # A request the community already declined.
        if result["is_closed"] and result["status"] == "declined":
            return True
        # An existing open request also counts, as we respond to open
        # requests ourselves.
        if result["is_open"] and not result["is_expired"]:
            return True
    return False


def eu_subcommunity_declined_request(record):
    """Check if record was rejected from EU sub community.

    Searches, as the system identity, for closed community
    inclusion/submission requests whose topic is this record, then inspects
    the receiving community of each declined one.

    :param record: record exposing ``pid.pid_value``.
    :returns: ``True`` when a declined request was addressed to a community
        whose parent is the community configured under ``EU_COMMUNITY_UUID``;
        ``False`` otherwise.
    """
    eu_community_id = current_app.config.get("EU_COMMUNITY_UUID")

    record_requests = dsl.Q(
        "bool",
        must=[
            dsl.Q("term", **{"topic.record": record.pid.pid_value}),
            dsl.Q("term", **{"is_open": False}),
        ],
    )
    request_types = dsl.Q(
        "bool",
        should=[
            dsl.Q("term", **{"type": CommunityInclusion.type_id}),
            dsl.Q("term", **{"type": CommunitySubmission.type_id}),
        ],
        minimum_should_match=1,
    )
    finalq = record_requests & request_types
    results = current_requests_service.search(system_identity, extra_filter=finalq)

    for result in results:
        # Cheap check first: only declined requests are worth a community
        # lookup.
        if result["status"] != "declined":
            continue
        community = current_communities.service.record_cls.pid.resolve(
            result["receiver"]["community"]
        )
        if community.parent and str(community.parent.id) == eu_community_id:
            return True
    return False


def community_data_award_acronym(record):
    """Check if award acronym in community data.

    The ``title`` and ``page`` metadata of every community in
    ``record.parent.communities`` are joined with spaces and searched as one
    text.

    :param record: object exposing ``parent.communities`` and ``metadata``.
    :returns: ``True`` when the joined community text contains the acronym of
        at least one EC award of the record; ``False`` otherwise, including
        when the communities provide no text.
    """
    parts = []
    for comm in record.parent.communities:
        # ``or ""`` also covers keys stored with an explicit ``None`` value,
        # which would otherwise break string concatenation.
        parts.append(comm.metadata.get("title") or "")
        parts.append(comm.metadata.get("page") or "")

    # Space-separate every piece so that the end of one community's text
    # cannot fuse with the start of the next into a spurious match.
    comm_text = " ".join(part for part in parts if part)
    if not comm_text:
        return False

    return any(
        _award_acronym_in_text(award, comm_text) for award in _get_ec_awards(record)
    )