From 942ac3f1f3f966f50b11b33ffdbb3356c4706bb6 Mon Sep 17 00:00:00 2001 From: yashlamba Date: Tue, 17 Dec 2024 10:53:05 +0100 Subject: [PATCH 1/4] curation: add eu/sub-community request checks --- site/zenodo_rdm/curation/config.py | 8 +++- site/zenodo_rdm/curation/rules.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index bb846513..eeeaa97e 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -15,6 +15,8 @@ award_acronym_in_title, contains_high_conf_keywords, contains_low_conf_keywords, + eu_community_declined_request, + eu_subcommunity_declined_request, published_before_award_start, test_phrases_in_record, user_verified, @@ -31,6 +33,8 @@ "additional_desc_contains_low_conf_keywords": additional_desc_contains_low_conf_keywords, "additional_desc_contains_high_conf_keywords": additional_desc_contains_high_conf_keywords, "award_acronym_in_additional_description": award_acronym_in_additional_description, + "eu_community_declined_request": eu_community_declined_request, + "eu_subcommunity_declined_request": eu_subcommunity_declined_request, } """Rules to run for EU Curation.""" @@ -45,8 +49,10 @@ "additional_desc_contains_low_conf_keywords": 0, "additional_desc_contains_high_conf_keywords": 0, "award_acronym_in_additional_description": 0, + "eu_community_declined_request": False, + "eu_subcommunity_declined_request": False, } -"""Rule scores for EU Curation.""" +"""Rule scores for EU Curation (bool value implies direct approval/decline).""" CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 100} diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 4e503daf..487e66ff 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -8,7 +8,12 @@ import arrow from flask import current_app +from invenio_access.permissions import system_identity +from invenio_communities.proxies import current_communities +from invenio_rdm_records.requests import CommunityInclusion, CommunitySubmission from invenio_records_resources.proxies import current_service_registry +from invenio_requests.proxies import current_requests_service +from invenio_search.engine import dsl def award_acronym_in_description(record): @@ -157,3 +162,66 @@ def award_acronym_in_additional_description(record): ): return True return False + + +def eu_community_declined_request(record): + """Check if record was rejected from EU community.""" + community_requests = dsl.Q( + "bool", + must=[ + dsl.Q( + "term", + **{"receiver.community": current_app.config.get("EU_COMMUNITY_UUID")}, + ), + dsl.Q("term", **{"topic.record": record.pid.pid_value}), + ], + ) + request_types = dsl.Q( + "bool", + should=[ + dsl.Q("term", **{"type": CommunityInclusion.type_id}), + dsl.Q("term", **{"type": CommunitySubmission.type_id}), + ], + minimum_should_match=1, + ) + finalq = community_requests & request_types + results = current_requests_service.search(system_identity, extra_filter=finalq) + + for result in results: + if result["is_closed"] and result["status"] == "declined": + return True + if result["is_open"] and not result["is_expired"]: + return True + return False + + +def eu_subcommunity_declined_request(record): + """Check if record was rejected from EU sub community.""" + record_requests = dsl.Q( + "bool", + must=[ + dsl.Q("term", **{"topic.record": record.pid.pid_value}), + dsl.Q("term", **{"is_open": False}), + ], + ) + request_types = dsl.Q( + "bool", + should=[ + dsl.Q("term", **{"type": CommunityInclusion.type_id}), + dsl.Q("term", **{"type": CommunitySubmission.type_id}), + ], + minimum_should_match=1, + ) + finalq = record_requests & request_types + results = current_requests_service.search(system_identity, extra_filter=finalq) + + for result in results: + receiver = current_communities.service.record_cls.pid.resolve( + result["receiver"]["community"] + ) + if receiver.parent and str(receiver.parent.id) == current_app.config.get( + "EU_COMMUNITY_UUID" + ): + if result["status"] == "declined": + return True + return False From 2cf5ec245313eeee40f373bfd25e7fe381f17fe9 Mon Sep 17 00:00:00 2001 From: yashlamba Date: Tue, 17 Dec 2024 10:53:45 +0100 Subject: [PATCH 2/4] curation: add community name acronym check --- site/zenodo_rdm/curation/config.py | 3 +++ site/zenodo_rdm/curation/rules.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index eeeaa97e..9b3db292 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -13,6 +13,7 @@ award_acronym_in_additional_description, award_acronym_in_description, award_acronym_in_title, + community_name_award_acronym, contains_high_conf_keywords, contains_low_conf_keywords, eu_community_declined_request, @@ -35,6 +36,7 @@ "award_acronym_in_additional_description": award_acronym_in_additional_description, "eu_community_declined_request": eu_community_declined_request, "eu_subcommunity_declined_request": eu_subcommunity_declined_request, + "community_name_award_acronym": community_name_award_acronym, } """Rules to run for EU Curation.""" @@ -51,6 +53,7 @@ "award_acronym_in_additional_description": 0, "eu_community_declined_request": False, "eu_subcommunity_declined_request": False, + "community_name_award_acronym": 0, } """Rule scores for EU Curation (bool value implies direct approval/decline).""" diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 487e66ff..0ab05391 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -225,3 +225,24 @@ def eu_subcommunity_declined_request(record): if result["status"] == "declined": return True return False + + +def community_name_award_acronym(record): + """Check if award acronym in community name.""" + comm_text = "" + for comm in record.parent.communities: + comm_text += comm.metadata.get("title", "") + comm_text += comm.metadata.get("page", "") + + if comm_text: + award_service = current_service_registry.get("awards") + funding = record.metadata.get("funding", []) + for f in funding: + if f["funder"].get("id") == "00k4n6c32": + if award_id := f.get("award", {}).get("id"): + award = award_service.record_cls.pid.resolve(award_id) + if award.get("acronym") and ( + award.get("acronym").lower() in comm_text.lower() + ): + return True + return False From c2fbcb8667dc50ab19a428c63097cc8881c34b39 Mon Sep 17 00:00:00 2001 From: yashlamba Date: Tue, 17 Dec 2024 13:45:06 +0100 Subject: [PATCH 3/4] curation: cleanup award processing; minor fixes --- site/zenodo_rdm/curation/config.py | 6 +- site/zenodo_rdm/curation/rules.py | 103 +++++++++++++---------------- 2 files changed, 50 insertions(+), 59 deletions(-) diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index 9b3db292..a0b358fa 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -16,7 +16,7 @@ community_name_award_acronym, contains_high_conf_keywords, contains_low_conf_keywords, - eu_community_declined_request, + eu_community_request, eu_subcommunity_declined_request, published_before_award_start, test_phrases_in_record, @@ -34,7 +34,7 @@ "additional_desc_contains_low_conf_keywords": additional_desc_contains_low_conf_keywords, "additional_desc_contains_high_conf_keywords": additional_desc_contains_high_conf_keywords, "award_acronym_in_additional_description": award_acronym_in_additional_description, - "eu_community_declined_request": eu_community_declined_request, + "eu_community_request": eu_community_request, "eu_subcommunity_declined_request": eu_subcommunity_declined_request, "community_name_award_acronym": community_name_award_acronym, } @@ -51,7 +51,7 @@ "additional_desc_contains_low_conf_keywords": 0, "additional_desc_contains_high_conf_keywords": 0, "award_acronym_in_additional_description": 0, - "eu_community_declined_request": False, + "eu_community_request": False, "eu_subcommunity_declined_request": False, "community_name_award_acronym": 0, } diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 0ab05391..944b011c 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -16,39 +16,45 @@ from invenio_search.engine import dsl -def award_acronym_in_description(record): - """Check if EU award name in record description.""" - award_service = current_service_registry.get("awards") - description = record.metadata.get("description") - if not description: - return False +def _award_acronym_number_in_text(award, text): + """Check for award number/acronym in data.""" + if award.get("acronym") and (award.get("acronym") in text): + return True + if award.get("number") and (award.get("number") in text): + return True + return False + +def _get_ec_awards(record): + award_service = current_service_registry.get("awards") + awards = [] funding = record.metadata.get("funding", []) for f in funding: if f["funder"].get("id") == "00k4n6c32": if award_id := f.get("award", {}).get("id"): award = award_service.record_cls.pid.resolve(award_id) - if award.get("acronym") and ( - award.get("acronym").lower() in description.lower() - ): - return True + awards.append(award) + return awards + + +def award_acronym_in_description(record): + """Check if EU award name in record description.""" + if description := record.metadata.get("description"): + awards = _get_ec_awards(record) + for award in awards: + if _award_acronym_number_in_text(award, description): + return True return False def award_acronym_in_title(record): """Check if EU award name in record title.""" - award_service = current_service_registry.get("awards") title = record.metadata["title"] - funding = record.metadata.get("funding", []) - for f in funding: - if f["funder"].get("id") == "00k4n6c32": - if award_id := f.get("award", {}).get("id"): - award = award_service.record_cls.pid.resolve(award_id) - if award.get("acronym") and ( - award.get("acronym").lower() in title.lower() - ): - return True + awards = _get_ec_awards(record) + for award in awards: + if _award_acronym_number_in_text(award, title): + return True return False @@ -67,18 +73,13 @@ def test_phrases_in_record(record): def published_before_award_start(record): """Check if published before award start date.""" - award_service = current_service_registry.get("awards") - - funding = record.metadata.get("funding", []) - for f in funding: - if f["funder"].get("id") == "00k4n6c32": - if award_id := f.get("award", {}).get("id"): - award = award_service.record_cls.pid.resolve(award_id) - if award.get("start_date") and ( - record.created.timestamp() - < arrow.get(award.get("start_date")).datetime.timestamp() - ): - return True + awards = _get_ec_awards(record) + for award in awards: + if award.get("start_date") and ( + record.created.timestamp() + < arrow.get(award.get("start_date")).datetime.timestamp() + ): + return True return False @@ -148,23 +149,17 @@ def additional_desc_contains_low_conf_keywords(record): def award_acronym_in_additional_description(record): """Check if EU award name in record additional description.""" - award_service = current_service_registry.get("awards") additional_descriptions = record.metadata.get("additional_descriptions", []) record_data = " ".join([x.get("description", "") for x in additional_descriptions]) - funding = record.metadata.get("funding", []) - for f in funding: - if f["funder"].get("id") == "00k4n6c32": - if award_id := f.get("award", {}).get("id"): - award = award_service.record_cls.pid.resolve(award_id) - if award.get("acronym") and ( - award.get("acronym").lower() in record_data.lower() - ): - return True + awards = _get_ec_awards(record) + for award in awards: + if _award_acronym_number_in_text(award, record_data): + return True return False -def eu_community_declined_request(record): +def eu_community_request(record): """Check if record was rejected from EU community.""" community_requests = dsl.Q( "bool", @@ -188,6 +183,8 @@ def eu_community_declined_request(record): results = current_requests_service.search(system_identity, extra_filter=finalq) for result in results: + # return true if there was a declined request or an existing open request + # as we respond to open requests ourselves. if result["is_closed"] and result["status"] == "declined": return True if result["is_open"] and not result["is_expired"]: @@ -216,10 +213,10 @@ def eu_subcommunity_declined_request(record): results = current_requests_service.search(system_identity, extra_filter=finalq) for result in results: - receiver = current_communities.service.record_cls.pid.resolve( + community = current_communities.service.record_cls.pid.resolve( result["receiver"]["community"] ) - if receiver.parent and str(receiver.parent.id) == current_app.config.get( + if community.parent and str(community.parent.id) == current_app.config.get( "EU_COMMUNITY_UUID" ): if result["status"] == "declined": @@ -232,17 +229,11 @@ def community_name_award_acronym(record): comm_text = "" for comm in record.parent.communities: comm_text += comm.metadata.get("title", "") - comm_text += comm.metadata.get("page", "") + comm_text += " " + comm.metadata.get("page", "") if comm_text: - award_service = current_service_registry.get("awards") - funding = record.metadata.get("funding", []) - for f in funding: - if f["funder"].get("id") == "00k4n6c32": - if award_id := f.get("award", {}).get("id"): - award = award_service.record_cls.pid.resolve(award_id) - if award.get("acronym") and ( - award.get("acronym").lower() in comm_text.lower() - ): - return True + awards = _get_ec_awards(record) + for award in awards: + if _award_acronym_number_in_text(award, comm_text): + return True return False From 8d58f5c514302a0ceeb9ceaf267e51742956beff Mon Sep 17 00:00:00 2001 From: yashlamba Date: Tue, 17 Dec 2024 13:54:15 +0100 Subject: [PATCH 4/4] curation: add grant agreement number checks; docstrings fix --- site/zenodo_rdm/curation/config.py | 12 ++++++-- site/zenodo_rdm/curation/rules.py | 48 +++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index a0b358fa..ac936fe9 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -13,7 +13,9 @@ award_acronym_in_additional_description, award_acronym_in_description, award_acronym_in_title, - community_name_award_acronym, + award_number_in_additional_description, + award_number_in_description, + community_data_award_acronym, contains_high_conf_keywords, contains_low_conf_keywords, eu_community_request, @@ -36,7 +38,9 @@ "award_acronym_in_additional_description": award_acronym_in_additional_description, "eu_community_request": eu_community_request, "eu_subcommunity_declined_request": eu_subcommunity_declined_request, - "community_name_award_acronym": community_name_award_acronym, + "community_data_award_acronym": community_data_award_acronym, + "award_number_in_additional_description": award_number_in_additional_description, + "award_number_in_description": award_number_in_description, } """Rules to run for EU Curation.""" @@ -53,7 +57,9 @@ "award_acronym_in_additional_description": 0, "eu_community_request": False, "eu_subcommunity_declined_request": False, - "community_name_award_acronym": 0, + "community_data_award_acronym": 0, + "award_number_in_additional_description": 0, + "award_number_in_description": 0, } """Rule scores for EU Curation (bool value implies direct approval/decline).""" diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 944b011c..fd489fdf 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -16,16 +16,22 @@ from invenio_search.engine import dsl -def _award_acronym_number_in_text(award, text): - """Check for award number/acronym in data.""" - if award.get("acronym") and (award.get("acronym") in text): +def _award_acronym_in_text(award, text): + """Check for award acronym in data.""" + if award.get("acronym") and (award.get("acronym").lower() in text.lower()): return True - if award.get("number") and (award.get("number") in text): + return False + + +def _award_number_in_text(award, text): + """Check for award number in data.""" + if award.get("number") and (str(award.get("number")) in text): return True return False def _get_ec_awards(record): + """Get all EC funded awards of record.""" award_service = current_service_registry.get("awards") awards = [] funding = record.metadata.get("funding", []) @@ -42,7 +48,17 @@ def award_acronym_in_description(record): if description := record.metadata.get("description"): awards = _get_ec_awards(record) for award in awards: - if _award_acronym_number_in_text(award, description): + if _award_acronym_in_text(award, description): + return True + return False + + +def award_number_in_description(record): + """Check if EU award number in record description.""" + if description := record.metadata.get("description"): + awards = _get_ec_awards(record) + for award in awards: + if _award_number_in_text(award, description): return True return False @@ -53,7 +69,7 @@ def award_acronym_in_title(record): awards = _get_ec_awards(record) for award in awards: - if _award_acronym_number_in_text(award, title): + if _award_acronym_in_text(award, title): return True return False @@ -154,7 +170,19 @@ def award_acronym_in_additional_description(record): awards = _get_ec_awards(record) for award in awards: - if _award_acronym_number_in_text(award, record_data): + if _award_acronym_in_text(award, record_data): + return True + return False + + +def award_number_in_additional_description(record): + """Check if EU award number in record additional description.""" + additional_descriptions = record.metadata.get("additional_descriptions", []) + record_data = " ".join([x.get("description", "") for x in additional_descriptions]) + + awards = _get_ec_awards(record) + for award in awards: + if _award_number_in_text(award, record_data): return True return False @@ -224,8 +252,8 @@ def eu_subcommunity_declined_request(record): return False -def community_name_award_acronym(record): - """Check if award acronym in community name.""" +def community_data_award_acronym(record): + """Check if award acronym in community data.""" comm_text = "" for comm in record.parent.communities: comm_text += comm.metadata.get("title", "") @@ -234,6 +262,6 @@ def community_name_award_acronym(record): if comm_text: awards = _get_ec_awards(record) for award in awards: - if _award_acronym_number_in_text(award, comm_text): + if _award_acronym_in_text(award, comm_text): return True return False