Porting over UBKG assaytype/rulechain support #588

Merged · 1 commit · Nov 20, 2024
4 changes: 4 additions & 0 deletions src/instance/app.cfg.example
@@ -107,5 +107,9 @@ UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
 UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
 UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020"}'

+# UBKG Integration Configs for Rule Chain
+UBKG_INTEGRATION_ENDPOINT = 'http://gateway.dev.hubmapconsortium.org:8181/'
+APPLICATION_CONTEXT = 'SENNET'
+
 # URI from which to load the assay classifier rules.
 RULE_CHAIN_URI = 'https://raw.githubusercontent.com/sennetconsortium/ingest-api/main/src/routes/assayclassifier/testing_rule_chain.json'
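
For reference, the two new settings are consumed by get_data_from_ubkg() in src/lib/rule_chain.py, which builds the UBKG assayclasses lookup URL from them. A minimal sketch of the resulting request using only the standard library (the endpoint and context are the example defaults above; the assay class code C012345 is made up for illustration):

    import json
    import urllib.parse
    import urllib.request

    UBKG_INTEGRATION_ENDPOINT = "http://gateway.dev.hubmapconsortium.org:8181/"
    APPLICATION_CONTEXT = "SENNET"

    # GET {endpoint}assayclasses/{code}?application_context=SENNET
    query = urllib.parse.urlencode({"application_context": APPLICATION_CONTEXT})
    url = f"{UBKG_INTEGRATION_ENDPOINT}assayclasses/C012345?{query}"
    with urllib.request.urlopen(url) as response:
        assayclass = json.loads(response.read().decode("utf-8"))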
79 changes: 74 additions & 5 deletions src/lib/rule_chain.py
@@ -15,9 +15,41 @@
 SCHEMA_FILE = "rule_chain_schema.json"
 SCHEMA_BASE_URI = "http://schemata.hubmapconsortium.org/"


 rule_chain = None

+# Have to translate pre-UBKG keys to UBKG keys
+# Format is:
+# "Key before UBKG integration": "UBKG Key"
+pre_integration_to_ubkg_translation = {
+    'vitessce-hints': 'vitessce_hints',
+    'dir-schema': 'dir_schema',
+    'tbl-schema': 'tbl_schema',
+    'contains-pii': 'contains_full_genetic_sequences',
+    'dataset-type': 'dataset_type',
+    'is-multi-assay': 'is_multiassay',
+    'pipeline-shorthand': 'pipeline_shorthand',
+    'must-contain': 'must_contain',
+}
+
+# These are the keys returned by the rule chain before UBKG integration.
+# We will return the UBKG data in this format as well for MVP.
+# This is to avoid too much churn on end-users.
+# We set primary manually so ignore it.
+pre_integration_keys = [
+    'assaytype',
+    'vitessce-hints',
+    'dir-schema',
+    'tbl-schema',
+    'contains-pii',
+    # 'primary',
+    'dataset-type',
+    'description',
+    'is-multi-assay',
+    'pipeline-shorthand',
+    'must-contain',
+    "process_state"
+]


 def initialize_rule_chain():
     """Initialize the rule chain from the source URI.
@@ -79,9 +111,9 @@ def calculate_data_types(entity: Entity) -> list[str]:
     # the data_types field is not empty and not a list of empty strings
     # If it has a value it must be an old derived dataset so use that to match the rules
     if (
-            hasattr(entity, "data_types")
-            and entity.data_types
-            and set(entity.data_types) != {""}
+        hasattr(entity, "data_types")
+        and entity.data_types
+        and set(entity.data_types) != {""}
     ):
         data_types = entity.data_types
     # Moving forward (2024) we are no longer using data_types for derived datasets.
@@ -134,7 +166,7 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
         # The primary publication will always have metadata,
         # so we have to do the association here.
         if entity.entity_type == "Publication":
-                metadata["data_types"] = calculate_data_types(entity)
+            metadata["data_types"] = calculate_data_types(entity)

     # If there is no metadata, then it must be a derived dataset
     else:
@@ -150,6 +182,43 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
     return metadata


+def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
+    # If we get more complicated transformations we should consider refactoring.
+    # For now, this should suffice.
+    if source_type.upper() == "MOUSE":
+        rule_value_set["contains-pii"] = False
+
+    return rule_value_set
+
+
+def get_data_from_ubkg(ubkg_code: str) -> dict:
+    query = urllib.parse.urlencode({"application_context": current_app.config['APPLICATION_CONTEXT']})
+    ubkg_api_url = f"{current_app.config['UBKG_INTEGRATION_ENDPOINT']}assayclasses/{ubkg_code}?{query}"
+    req = urllib.request.Request(ubkg_api_url)
+    try:
+        with urllib.request.urlopen(req) as response:
+            response_data = response.read().decode("utf-8")
+    except urllib.error.URLError as excp:
+        print(f"Error getting extra info from UBKG {excp}")
+        return {}
+
+    return json.loads(response_data)
+
+
+def standardize_results(rule_chain_json: dict, ubkg_json: dict) -> dict:
+    # Initialize this with conditional logic to set 'primary' true or false.
+    ubkg_transformed_json = {
+        "primary": ubkg_json.get("process_state") == "primary"
+    }
+
+    for pre_integration_key in pre_integration_keys:
+        ubkg_key = pre_integration_to_ubkg_translation.get(pre_integration_key, pre_integration_key)
+        ubkg_value = ubkg_json.get(ubkg_key)
+        ubkg_transformed_json[pre_integration_key] = ubkg_value
+
+    return rule_chain_json | ubkg_transformed_json


 class NoMatchException(Exception):
     pass
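
To make the key translation concrete, here is an illustrative call to standardize_results() (both input payloads are invented for the example, not captured from the rule chain or the live UBKG API):

    rules_json = {"assaytype": "bulk-rna", "ubkg_code": "C012345"}  # hypothetical rule chain output
    ubkg_value_json = {
        "process_state": "primary",
        "dataset_type": "RNASeq",
        "contains_full_genetic_sequences": True,
    }

    merged = standardize_results(rules_json, ubkg_value_json)
    assert merged["primary"] is True           # derived from process_state
    assert merged["dataset-type"] == "RNASeq"  # UBKG dataset_type, translated back to the old key
    assert merged["contains-pii"] is True      # UBKG contains_full_genetic_sequences
    assert merged["dir-schema"] is None        # keys UBKG did not return come back as None

Note that the dict union favors the right-hand operand, so the UBKG-derived values (including None for keys missing from the UBKG payload, such as "assaytype" here) override anything the rule chain set under the same key.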
26 changes: 17 additions & 9 deletions src/routes/assayclassifier/__init__.py
@@ -16,6 +16,8 @@
     build_entity_metadata,
     calculate_assay_info,
     initialize_rule_chain,
+    get_data_from_ubkg,
+    standardize_results
 )
 from lib.services import get_entity

@@ -30,7 +32,7 @@ def get_ds_assaytype(ds_uuid: str):
         token = get_token()
         entity = get_entity(ds_uuid, token)
         metadata = build_entity_metadata(entity)
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

         if sources := entity.sources:
             source_type = ""
@@ -39,9 +41,12 @@ def get_ds_assaytype(ds_uuid: str):
                 # If there is a single Human source_type, treat this as a Human case
                 if source_type.upper() == "HUMAN":
                     break
-            apply_source_type_transformations(source_type, rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)

-        return jsonify(rule_value_set)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
     except ValueError as excp:
         logger.error(excp, exc_info=True)
         return Response("Bad parameter: {excp}", 400)
@@ -97,21 +102,21 @@ def get_ds_rule_metadata(ds_uuid: str):
         )


-def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
+def apply_source_type_transformations(source_type: str, rules_json: dict) -> dict:
     # If we get more complicated transformations we should consider refactoring.
     # For now, this should suffice.
     if "MOUSE" in source_type.upper():
-        rule_value_set["contains-pii"] = False
+        rules_json["contains-pii"] = False

-    return rule_value_set
+    return rules_json


 @assayclassifier_blueprint.route("/assaytype", methods=["POST"])
 @require_valid_token()
 @require_json(param="metadata")
 def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
     try:
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

         if parent_sample_ids := metadata.get("parent_sample_id"):
             source_type = ""
@@ -123,8 +128,11 @@ def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
                 if source_type.upper() == "HUMAN":
                     break

-            apply_source_type_transformations(source_type, rule_value_set)
-        return jsonify(rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
     except ResponseException as re:
         logger.error(re, exc_info=True)
         return re.response
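
With both routes now merging the rule chain output and the UBKG assayclass record, callers of GET /assaytype/<ds_uuid> and POST /assaytype receive a payload shaped roughly like the sketch below; the values are invented, and the exact field set is driven by pre_integration_keys in src/lib/rule_chain.py:

    merged_json = {
        "assaytype": "bulk-rna",      # pre-integration key names are preserved
        "primary": True,              # set from the UBKG process_state
        "contains-pii": True,
        "dataset-type": "RNASeq",
        # ...remaining pre_integration_keys...
        "ubkg_json": {"process_state": "primary", "dataset_type": "RNASeq"},  # raw UBKG "value" object
    }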