From a1a6858cb1a3911a9ba2b90bd195934cc15ba8c7 Mon Sep 17 00:00:00 2001 From: Lisa-Ann B Date: Thu, 26 Sep 2024 16:00:51 -0400 Subject: [PATCH 1/7] Update provenance_schema for Publication - #476 --- src/schema/provenance_schema.yaml | 88 +++++-------------------------- 1 file changed, 13 insertions(+), 75 deletions(-) diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 8e7db7a..41882d1 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -355,7 +355,7 @@ ENTITIES: derivation: source: true target: true - properties: + properties: &shared_dataset_properties <<: *shared_properties <<: *shared_entity_properties <<: *doi_properties @@ -669,87 +669,25 @@ ENTITIES: source: true target: true properties: - error_message: - type: string - description: "An open text field that holds the last error message that arose from pipeline validation or analysis." - dataset_type: - before_create_trigger: set_publication_dataset_type - type: string - generated: true - immutable: true - description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API." + <<: *shared_dataset_properties title: type: string description: "The title of the publication." required_on_create: true # Only required for create via POST, not update via PUT - pipeline_message: - #todo: where is this attribute sourced from? Is it stored in the database? <- Not in neo4j - type: string - ingest_metadata: - type: json_string # dict - description: "The metadata returned from the processing at data submission time." - run_id: - type: string - ingest_id: - type: string - # A user who is a member of multiple groups HAS to send in the group_uuid - group_uuid: - type: string - immutable: true - description: "The uuid of globus group which the user who created this entity is a member of. 
This is required on Create/POST if the user creating the Donor is a member of more than one write group. This property cannot be set via PUT (only on Create/POST)." - before_create_trigger: set_group_uuid #method that, if group_uuid is not already set looks for membership in a single "data provider" group and sets to that. Otherwise if not set and no single "provider group" membership throws error - # No like image and metadata files handling for Donor/Sample - # Dataset has only one thumbnail file - thumbnail_file: - generated: true - type: json_string - description: "The dataset thumbnail file detail. Stored in db as a stringfied json, e.g., {'filename': 'thumbnail.jpg', 'file_uuid': 'dadasdasdadda'}" - # The updated_peripherally tag is a temporary measure to correctly handle any attributes - # which are potentially updated by multiple triggers - updated_peripherally: true - thumbnail_file_to_add: - type: json_string - transient: true - exposed: false - description: 'Just a temporary file id. Provide as a json object with an temp_file_id like {"temp_file_id":"dzevgd6xjs4d5grmcp4n"}' - before_create_trigger: commit_thumbnail_file - # This before_update_trigger with the same commit process can be used by ingest-api to update the dataset via PUT call - before_update_trigger: commit_thumbnail_file - # The updated_peripherally tag is a temporary measure to correctly handle any attributes - # which are potentially updated by multiple triggers - updated_peripherally: true - thumbnail_file_to_remove: - # This is only valid on update via a PUT request + creation_action: type: string transient: true - exposed: false - description: 'The thumbnail image file previously uploaded to delete. 
Provide as a string of the file_uuid like: "232934234234234234234270c0ea6c51d604a850558ef2247d0b4"' - before_update_trigger: delete_thumbnail_file - # The updated_peripherally tag is a temporary measure to correctly handle any attributes - # which are potentially updated by multiple triggers - updated_peripherally: true - retraction_reason: - type: string - before_property_update_validators: - - validate_if_retraction_permitted - - validate_sub_status_provided - description: 'Information recorded about why a the dataset was retracted.' - sub_status: - type: string - before_property_update_validators: - - validate_if_retraction_permitted - - validate_retraction_reason_provided - - validate_retracted_dataset_sub_status_value - description: 'A sub-status provided to further define the status. The only current allowable value is "Retracted"' - provider_info: - type: string - description: 'Information recorded about the data provider before an analysis pipeline is run on the data.' - dbgap_sra_experiment_url: - type: string - description: 'A URL linking the dataset to the associated uploaded data at dbGaP.' - dbgap_study_url: + generated: true + immutable: true + on_read_trigger: get_creation_action_activity + on_index_trigger: get_creation_action_activity + description: "The activity that was performed." + dataset_type: + before_create_trigger: set_publication_dataset_type type: string - description: 'A URL linking the dataset to the particular study on dbGap it belongs to' + generated: true + immutable: true + description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API." 
publication_date: type: string description: 'The date of publication' From e86f2fae03ff4938dbf131cf72745aa8cdf1853e Mon Sep 17 00:00:00 2001 From: maxsibilla Date: Wed, 2 Oct 2024 12:41:16 -0400 Subject: [PATCH 2/7] Updating origin_sample to be a list called origin_samples --- src/schema/provenance_schema.yaml | 6 +++--- src/schema/schema_neo4j_queries.py | 6 +++--- src/schema/schema_triggers.py | 21 +++++++++++---------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 41882d1..0dcec46 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -597,10 +597,10 @@ ENTITIES: type: string description: 'The timestamp of when this entity was last modified or published.' on_index_trigger: get_last_touch - origin_sample: + origin_samples: type: json_string - description: 'The Sample ancestor that has the sample_category of "Organ".' - on_index_trigger: get_origin_sample + description: 'The Sample ancestors that has the sample_category of "Organ".' 
+ on_index_trigger: get_origin_samples assigned_to_group_name: type: string description: The group who is responsible for the next step in the ingest process diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 8c60698..bc51987 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -126,12 +126,12 @@ def get_dataset_direct_descendants(neo4j_driver, uuid, property_key=None, match_ """ -def get_origin_sample(neo4j_driver, uuid): +def get_origin_samples(neo4j_driver, uuid): result = {} query = (f"MATCH (e:Entity)-[:WAS_GENERATED_BY|USED*]->(s:Sample) " f"WHERE e.uuid='{uuid}' and s.sample_category='Organ' " - f"return s AS {record_field_name}") + f"return apoc.coll.toSet(COLLECT(s)) AS {record_field_name}") logger.info("======get_origin_sample() query======") logger.info(query) @@ -140,7 +140,7 @@ def get_origin_sample(neo4j_driver, uuid): record = session.read_transaction(_execute_readonly_tx, query) if record and record[record_field_name]: # Convert the entity node to dict - result = _node_to_dict(record[record_field_name]) + result = _nodes_to_dicts(record[record_field_name]) return result diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index ed2baca..a20b25b 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1734,7 +1734,7 @@ def get_last_touch(property_key, normalized_type, user_token, existing_data_dict return property_key, last_touch -def get_origin_sample(property_key, normalized_type, user_token, existing_data_dict, new_data_dict): +def get_origin_samples(property_key, normalized_type, user_token, existing_data_dict, new_data_dict): """Trigger event method to grab the ancestor of this entity where entity type is Sample and the sample_category is Organ. 
Parameters @@ -1763,19 +1763,20 @@ def get_origin_sample(property_key, normalized_type, user_token, existing_data_d # Return the organ if this is an organ return property_key, existing_data_dict - origin_sample = None + origin_samples = None if normalized_type in ["Sample", "Dataset", "Publication"]: - origin_sample = schema_neo4j_queries.get_origin_sample(schema_manager.get_neo4j_driver_instance(), + origin_samples = schema_neo4j_queries.get_origin_samples(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - organ_hierarchy_key, organ_hierarchy_value = get_organ_hierarchy(property_key='organ_hierarchy', - normalized_type=Ontology.ops().entities().SAMPLE, - user_token=user_token, - existing_data_dict=origin_sample, - new_data_dict=new_data_dict) - origin_sample[organ_hierarchy_key] = organ_hierarchy_value + for origin_sample in origin_samples: + organ_hierarchy_key, organ_hierarchy_value = get_organ_hierarchy(property_key='organ_hierarchy', + normalized_type=Ontology.ops().entities().SAMPLE, + user_token=user_token, + existing_data_dict=origin_sample, + new_data_dict=new_data_dict) + origin_sample[organ_hierarchy_key] = organ_hierarchy_value - return property_key, origin_sample + return property_key, origin_samples except Exception: logger.error(f"No origin sample found for {normalized_type} with UUID: {existing_data_dict['uuid']}") return property_key, None From f973a6021969dcfea0a95d6729d4a2a155e9966c Mon Sep 17 00:00:00 2001 From: maxsibilla Date: Wed, 2 Oct 2024 12:54:22 -0400 Subject: [PATCH 3/7] Updating provenance schema for Sample to be origin_samples --- src/schema/provenance_schema.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 0dcec46..78f7caf 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -950,10 +950,10 @@ ENTITIES: type: string description: 'The timestamp of when this entity was last 
modified or published.' on_index_trigger: get_last_touch - origin_sample: + origin_samples: type: json_string - description: 'The Sample ancestor that has the sample_category of "Organ".' - on_index_trigger: get_origin_sample + description: 'The list of Sample ancestors that has the sample_category of "Organ".' + on_index_trigger: get_origin_samples next_identifier: type: string immutable: true From fa08588cd3556e77a86acb8069eb60a8466f0a6c Mon Sep 17 00:00:00 2001 From: maxsibilla Date: Tue, 8 Oct 2024 10:01:37 -0400 Subject: [PATCH 4/7] Adding origin samples to GET response for entities --- src/schema/provenance_schema.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 78f7caf..4cdafdb 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -600,6 +600,7 @@ ENTITIES: origin_samples: type: json_string description: 'The Sample ancestors that has the sample_category of "Organ".' + on_read_trigger: get_origin_samples on_index_trigger: get_origin_samples assigned_to_group_name: type: string @@ -953,6 +954,7 @@ ENTITIES: origin_samples: type: json_string description: 'The list of Sample ancestors that has the sample_category of "Organ".' 
+ on_read_trigger: get_origin_samples on_index_trigger: get_origin_samples next_identifier: type: string From 28769337a0af6ccc0f2390e34671f833ebdb302e Mon Sep 17 00:00:00 2001 From: maxsibilla Date: Tue, 8 Oct 2024 10:50:13 -0400 Subject: [PATCH 5/7] Updating tests --- test/data/get_ancestors_success_dataset.json | 19 ++++++ .../get_entity_by_type_success_sample.json | 64 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/test/data/get_ancestors_success_dataset.json b/test/data/get_ancestors_success_dataset.json index bfe7718..f665958 100644 --- a/test/data/get_ancestors_success_dataset.json +++ b/test/data/get_ancestors_success_dataset.json @@ -89,6 +89,25 @@ "last_modified_user_email": "TESTUSER@example.com", "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", "organ": "BD", + "origin_samples": { + "created_by_user_displayname": "Test User", + "created_by_user_email": "TESTUSER@example.com", + "created_by_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "created_timestamp": 1681828779121, + "data_access_level": "consortium", + "entity_type": "Sample", + "group_name": "CODCC Testing Group", + "group_uuid": "57192604-18e0-11ed-b79b-972795fc9504", + "lab_tissue_sample_id": "Human Blood", + "last_modified_timestamp": 1681828779121, + "last_modified_user_displayname": "Test User", + "last_modified_user_email": "TESTUSER@example.com", + "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "organ": "BD", + "sample_category": "Organ", + "sennet_id": "SNT458.VPHX.635", + "uuid": "cf3d0408de9afd703c8bd71808176b38" + }, "sample_category": "Organ", "sennet_id": "SNT458.VPHX.635", "uuid": "cf3d0408de9afd703c8bd71808176b38", diff --git a/test/data/get_entity_by_type_success_sample.json b/test/data/get_entity_by_type_success_sample.json index 8c27abb..3dee0b5 100644 --- a/test/data/get_entity_by_type_success_sample.json +++ b/test/data/get_entity_by_type_success_sample.json @@ -23,6 +23,38 @@ "last_modified_user_email": 
"TESTUSER@example.com", "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", "organ": "BR", + "origin_samples": { + "created_by_user_displayname": "Test User", + "created_by_user_email": "TESTUSER@example.com", + "created_by_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "created_timestamp": 1681828388360, + "data_access_level": "consortium", + "description": "Sample lab notes", + "entity_type": "Sample", + "group_name": "CODCC Testing Group", + "group_uuid": "57192604-18e0-11ed-b79b-972795fc9504", + "image_files": [ + { + "description": "Test image", + "file_uuid": "ffff1b46e377b91565ed53464cc8d859", + "filename": "a4fc82ba0010139e33c6209b917ac9c487172222.png" + } + ], + "lab_tissue_sample_id": "Human Brain", + "last_modified_timestamp": 1681828388360, + "last_modified_user_displayname": "Test User", + "last_modified_user_email": "TESTUSER@example.com", + "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "organ": "BR", + "protocol_url": "dx.doi.org/10.17504/protocols.io.3byl4j398lo5/v1", + "sample_category": "Organ", + "sennet_id": "SNT834.LVJG.639", + "thumbnail_file": { + "file_uuid": "ffffb2c9be7816087e13580e244855c5", + "filename": "image_handler.jpg" + }, + "uuid": "3c4fc147a08429f58856779fcde96f42" + }, "protocol_url": "dx.doi.org/10.17504/protocols.io.3byl4j398lo5/v1", "sample_category": "Organ", "sennet_id": "SNT834.LVJG.639", @@ -75,6 +107,38 @@ "last_modified_user_email": "TESTUSER@example.com", "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", "organ": "BR", + "origin_samples": { + "created_by_user_displayname": "Test User", + "created_by_user_email": "TESTUSER@example.com", + "created_by_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "created_timestamp": 1681828388360, + "data_access_level": "consortium", + "description": "Sample lab notes", + "entity_type": "Sample", + "group_name": "CODCC Testing Group", + "group_uuid": "57192604-18e0-11ed-b79b-972795fc9504", + "image_files": [ + { + 
"description": "Test image", + "file_uuid": "ffff1b46e377b91565ed53464cc8d859", + "filename": "a4fc82ba0010139e33c6209b917ac9c487172222.png" + } + ], + "lab_tissue_sample_id": "Human Brain", + "last_modified_timestamp": 1681828388360, + "last_modified_user_displayname": "Test User", + "last_modified_user_email": "TESTUSER@example.com", + "last_modified_user_sub": "9e5b670f-228d-433c-bb86-a3228d5ca49d", + "organ": "BR", + "protocol_url": "dx.doi.org/10.17504/protocols.io.3byl4j398lo5/v1", + "sample_category": "Organ", + "sennet_id": "SNT834.LVJG.639", + "thumbnail_file": { + "file_uuid": "ffffb2c9be7816087e13580e244855c5", + "filename": "image_handler.jpg" + }, + "uuid": "4aff569f5d61477abfe5c40364d04a1c" + }, "protocol_url": "dx.doi.org/10.17504/protocols.io.3byl4j398lo5/v1", "sample_category": "Organ", "sennet_id": "SNT834.LVJG.639", From 66c3d049f72a262e27a718fbe37e73347ce86578 Mon Sep 17 00:00:00 2001 From: maxsibilla Date: Wed, 9 Oct 2024 08:44:24 -0400 Subject: [PATCH 6/7] Adding check for entity constraints to not affect publications or collections --- src/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/app.py b/src/app.py index d273b6a..aa17f6f 100644 --- a/src/app.py +++ b/src/app.py @@ -1018,7 +1018,9 @@ def create_entity(entity_type: str, user_token: str, json_data_dict: dict): direct_ancestor_uuids = [] for direct_ancestor_uuid in json_data_dict['direct_ancestor_uuids']: direct_ancestor_dict = query_target_entity(direct_ancestor_uuid) - validate_constraints_by_entities(direct_ancestor_dict, json_data_dict, normalized_entity_type) + # We don't need to check this for Publications or Collections + if normalized_entity_type == 'Dataset': + validate_constraints_by_entities(direct_ancestor_dict, json_data_dict, normalized_entity_type) direct_ancestor_uuids.append(direct_ancestor_dict['uuid']) json_data_dict['direct_ancestor_uuids'] = direct_ancestor_uuids From a328bb4daa785e27f4423e3d2890b6c5d55a1989 Mon Sep 17 00:00:00 2001 
From: maxsibilla Date: Wed, 9 Oct 2024 08:52:00 -0400 Subject: [PATCH 7/7] Reverting change --- src/app.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/app.py b/src/app.py index aa17f6f..d273b6a 100644 --- a/src/app.py +++ b/src/app.py @@ -1018,9 +1018,7 @@ def create_entity(entity_type: str, user_token: str, json_data_dict: dict): direct_ancestor_uuids = [] for direct_ancestor_uuid in json_data_dict['direct_ancestor_uuids']: direct_ancestor_dict = query_target_entity(direct_ancestor_uuid) - # We don't need to check this for Publications or Collections - if normalized_entity_type == 'Dataset': - validate_constraints_by_entities(direct_ancestor_dict, json_data_dict, normalized_entity_type) + validate_constraints_by_entities(direct_ancestor_dict, json_data_dict, normalized_entity_type) direct_ancestor_uuids.append(direct_ancestor_dict['uuid']) json_data_dict['direct_ancestor_uuids'] = direct_ancestor_uuids