From 5b5511233251ef4269f9d54748147031eb4e9c29 Mon Sep 17 00:00:00 2001 From: amykglen Date: Thu, 6 May 2021 23:10:44 -0700 Subject: [PATCH 01/15] Choose edge-finding technique based on number of nodes in each result --- code/ARAX/ARAXQuery/ARAX_resultify.py | 73 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/code/ARAX/ARAXQuery/ARAX_resultify.py b/code/ARAX/ARAXQuery/ARAX_resultify.py index e73bfa5c8..116356a73 100644 --- a/code/ARAX/ARAXQuery/ARAX_resultify.py +++ b/code/ARAX/ARAXQuery/ARAX_resultify.py @@ -16,6 +16,7 @@ ''' import collections +import copy import math import os import sys @@ -521,18 +522,23 @@ def _get_results_for_kg_by_qg(kg: KnowledgeGraph, # all nodes *must # Handle case where QG contains multiple qnodes and no qedges (we'll dump everything in one result) if not qg.edges and len(qg.nodes) > 1: - nodes_by_qg_key = _get_kg_node_keys_by_qg_key(kg) result_graph = _create_new_empty_result_graph(qg) - result_graph["nodes"] = nodes_by_qg_key + result_graph["nodes"] = kg_node_keys_by_qg_key final_result_graphs = [result_graph] else: # Build up some indexes for edges in the KG (by their subject/object nodes and qedge keys) - edges_by_qg_id_and_subject_node = collections.defaultdict(lambda: collections.defaultdict(lambda: set())) - edges_by_qg_id_and_object_node = collections.defaultdict(lambda: collections.defaultdict(lambda: set())) + edge_keys_by_subject = collections.defaultdict(lambda: collections.defaultdict(lambda: set())) + edge_keys_by_object = collections.defaultdict(lambda: collections.defaultdict(lambda: set())) + edge_keys_by_node_pair = collections.defaultdict(lambda: collections.defaultdict(lambda: set())) for edge_key, edge in kg.edges.items(): - for qedge_key in edge.qedge_keys: - edges_by_qg_id_and_subject_node[qedge_key][edge.subject].add(edge_key) - edges_by_qg_id_and_object_node[qedge_key][edge.object].add(edge_key) + for qedge_id in edge.qedge_keys: + edge_keys_by_subject[qedge_id][edge.subject].add(edge_key) + edge_keys_by_object[qedge_id][edge.object].add(edge_key) + node_pair_string = f"{edge.subject}--{edge.object}" + edge_keys_by_node_pair[qedge_id][node_pair_string].add(edge_key) + if ignore_edge_direction: + node_pair_other_direction = f"{edge.object}--{edge.subject}" + edge_keys_by_node_pair[qedge_id][node_pair_other_direction].add(edge_key) # Create results off the "required" portion of the QG (excluding any qnodes/qedges belong to an "option group") required_qg = QueryGraph(nodes={qnode_key: qnode for qnode_key, qnode in qg.nodes.items() if not qnode.option_group_id}, @@ -541,8 +547,9 @@ def _get_results_for_kg_by_qg(kg: KnowledgeGraph, # all nodes *must if qg_is_disconnected: raise ValueError(f"Required portion of QG is disconnected. This isn't allowed! 
'Required' qnode IDs are: " f"{[qnode_key for qnode_key in required_qg.nodes]}") - result_graphs_required = _create_result_graphs(kg, required_qg, edges_by_qg_id_and_subject_node, - edges_by_qg_id_and_object_node, ignore_edge_direction) + result_graphs_required = _create_result_graphs(kg, required_qg, kg_node_keys_by_qg_key, + edge_keys_by_subject, edge_keys_by_object, + edge_keys_by_node_pair, ignore_edge_direction) # Then create results for each of the "option groups" in the QG (including the required portion of the QG with each) option_groups_in_qg = {qedge.option_group_id for qedge in qg.edges.values() if qedge.option_group_id} @@ -558,8 +565,9 @@ def _get_results_for_kg_by_qg(kg: KnowledgeGraph, # all nodes *must raise ValueError(f"Required + option group {option_group_id} portion of the QG is disconnected. " f"This isn't allowed! 'Required'/group {option_group_id} qnode IDs are: " f"{[qnode_key for qnode_key in option_group_qg.nodes]}") - result_graphs_for_option_group = _create_result_graphs(kg, option_group_qg, edges_by_qg_id_and_subject_node, - edges_by_qg_id_and_object_node, ignore_edge_direction) + result_graphs_for_option_group = _create_result_graphs(kg, option_group_qg, kg_node_keys_by_qg_key, + edge_keys_by_subject, edge_keys_by_object, + edge_keys_by_node_pair, ignore_edge_direction) option_group_results_dict[option_group_id] = result_graphs_for_option_group # Organize our results for the 'required' portion of the QG by the IDs of their is_set=False nodes @@ -702,8 +710,7 @@ def _create_new_empty_result_graph(query_graph: QueryGraph) -> Dict[str, Dict[st def _copy_result_graph(result_graph: Dict[str, Dict[str, Set[str]]]) -> Dict[str, Dict[str, Set[str]]]: - result_graph_copy = {'nodes': {qnode_key: node_keys for qnode_key, node_keys in result_graph['nodes'].items()}, - 'edges': {qedge_key: edge_keys for qedge_key, edge_keys in result_graph['edges'].items()}} + result_graph_copy = copy.deepcopy(result_graph) return result_graph_copy @@ -790,7 +797,7 @@ def _find_qnode_connected_to_sub_qg(qnode_keys_to_connect_to: Set[str], qnode_ke return "", set() -def _get_qg_adj_map_undirected(qg) -> Dict[str, Set[str]]: +def _get_qg_adj_map_undirected(qg: QueryGraph) -> Dict[str, Set[str]]: """ This function creates a node adjacency map for a given query graph. 
Example: {"n0": {"n1"}, "n1": {"n0"}} """ @@ -850,11 +857,12 @@ def _clean_up_dead_ends(result_graph: Dict[str, Dict[str, Set[str]]], def _create_result_graphs(kg: KnowledgeGraph, qg: QueryGraph, - edges_by_qg_id_and_subject: DefaultDict[str, DefaultDict[str, set]], - edges_by_qg_id_and_object: DefaultDict[str, DefaultDict[str, set]], + kg_node_keys_by_qg_key: Dict[str, Set[str]], + edge_keys_by_subject: DefaultDict[str, DefaultDict[str, set]], + edge_keys_by_object: DefaultDict[str, DefaultDict[str, set]], + edge_keys_by_node_pair: DefaultDict[str, DefaultDict[str, set]], ignore_edge_direction: bool = True) -> List[Result]: result_graphs = [] - kg_node_keys_by_qg_key = _get_kg_node_keys_by_qg_key(kg) kg_node_adj_map_by_qg_key = _get_kg_node_adj_map_by_qg_key(kg_node_keys_by_qg_key, kg, qg) qg_adj_map = _get_qg_adj_map_undirected(qg) @@ -926,17 +934,26 @@ def _create_result_graphs(kg: KnowledgeGraph, qedge = qg.edges[qedge_key] qedge_source_node_ids = result_graph['nodes'][qedge.subject] qedge_target_node_ids = result_graph['nodes'][qedge.object] - edges_with_matching_subject = {edge_key for source_node in qedge_source_node_ids - for edge_key in edges_by_qg_id_and_subject[qedge_key][source_node]} - edges_with_matching_object = {edge_key for target_node in qedge_target_node_ids - for edge_key in edges_by_qg_id_and_object[qedge_key][target_node]} - result_graph['edges'][qedge_key] = edges_with_matching_subject.intersection(edges_with_matching_object) - if ignore_edge_direction: - edges_with_reverse_subject = {edge_key for target_node in qedge_target_node_ids - for edge_key in edges_by_qg_id_and_subject[qedge_key][target_node]} - edges_with_reverse_object = {edge_key for source_node in qedge_source_node_ids - for edge_key in edges_by_qg_id_and_object[qedge_key][source_node]} - result_graph['edges'][qedge_key].update(edges_with_reverse_subject.intersection(edges_with_reverse_object)) + # Pick the more efficient method for edge-finding depending on the number of nodes for this result/qedge + if len(qedge_source_node_ids) < 10 or len(qedge_target_node_ids) < 10: + possible_node_pairs = {f"{node_1}--{node_2}" for node_1 in qedge_source_node_ids + for node_2 in qedge_target_node_ids} + for node_pair in possible_node_pairs: + ids_of_matching_edges = edge_keys_by_node_pair[qedge_key].get(node_pair, set()) + result_graph['edges'][qedge_key].update(ids_of_matching_edges) + else: + # This technique is more efficient when there are large numbers of both subject and object nodes + edges_with_matching_subject = {edge_key for source_node in qedge_source_node_ids + for edge_key in edge_keys_by_subject[qedge_key][source_node]} + edges_with_matching_object = {edge_key for target_node in qedge_target_node_ids + for edge_key in edge_keys_by_object[qedge_key][target_node]} + result_graph['edges'][qedge_key] = edges_with_matching_subject.intersection(edges_with_matching_object) + if ignore_edge_direction: + edges_with_reverse_subject = {edge_key for target_node in qedge_target_node_ids + for edge_key in edge_keys_by_subject[qedge_key][target_node]} + edges_with_reverse_object = {edge_key for source_node in qedge_source_node_ids + for edge_key in edge_keys_by_object[qedge_key][source_node]} + result_graph['edges'][qedge_key].update(edges_with_reverse_subject.intersection(edges_with_reverse_object)) final_result_graphs = [result_graph for result_graph in result_graphs if _result_graph_is_fulfilled(result_graph, qg)] return final_result_graphs From a5ad9b39186cfa5c97686982e60256966013f581 Mon Sep 17 00:00:00 2001 
From: amykglen Date: Fri, 7 May 2021 14:57:09 -0700 Subject: [PATCH 02/15] Improve subprocess calls in KG2c build (stop on error) --- .../NodeSynonymizer/dump_kg2_node_data.py | 57 ++++++++++--------- code/kg2/canonicalized/build_kg2c.py | 16 +++--- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/code/ARAX/NodeSynonymizer/dump_kg2_node_data.py b/code/ARAX/NodeSynonymizer/dump_kg2_node_data.py index 5a2656638..0612e4055 100644 --- a/code/ARAX/NodeSynonymizer/dump_kg2_node_data.py +++ b/code/ARAX/NodeSynonymizer/dump_kg2_node_data.py @@ -33,31 +33,34 @@ def dump_kg2_node_info(file_name: str, write_mode: str, is_test: bool): """ query = f"match (n) return properties(n) as p, labels(n) as l {'limit 20' if is_test else ''}" res = _run_cypher_query(query) - with open(file_name, write_mode, encoding="utf-8") as fid: - for item in res: - prop_dict = item['p'] - labels = item['l'] - try: - label = list(set(labels) - {'Base'}).pop() - except: - label = "" - try: - fid.write('%s\t' % prop_dict['id']) - except: - fid.write('\t') - try: - fid.write('%s\t' % remove_tab_newlines.sub(" ", prop_dict['name'])) # better approach - except: - fid.write('\t') - try: - fid.write('%s\t' % remove_tab_newlines.sub(" ", prop_dict['full_name'])) - except: - fid.write('\t') - try: - fid.write('%s\n' % label) - except: - fid.write('\n') - print(f"Successfully created file '{file_name}'.") + if res: + with open(file_name, write_mode, encoding="utf-8") as fid: + for item in res: + prop_dict = item['p'] + labels = item['l'] + try: + label = list(set(labels) - {'Base'}).pop() + except: + label = "" + try: + fid.write('%s\t' % prop_dict['id']) + except: + fid.write('\t') + try: + fid.write('%s\t' % remove_tab_newlines.sub(" ", prop_dict['name'])) # better approach + except: + fid.write('\t') + try: + fid.write('%s\t' % remove_tab_newlines.sub(" ", prop_dict['full_name'])) + except: + fid.write('\t') + try: + fid.write('%s\n' % label) + except: + fid.write('\n') + print(f"Successfully created file '{file_name}'.") + else: + raise Exception(f"Failed to get results from Neo4j for {file_name}") return @@ -93,7 +96,7 @@ def dump_kg2_equivalencies(output_file_name: str, is_test: bool): csv_writer.writerows(list(distinct_pairs)) print(f"Successfully created file '{output_file_name}'.") else: - print(f"Sorry, couldn't get equivalency data. No file created.") + raise Exception(f"Failed to get results from Neo4j for {output_file_name}") def dump_kg2_synonym_field(output_file_name: str, is_test: bool): @@ -109,7 +112,7 @@ def dump_kg2_synonym_field(output_file_name: str, is_test: bool): json.dump(synonym_map, output_file) print(f"Successfully created file '{output_file_name}'.") else: - print(f"Sorry, couldn't get synonym data. 
No file created.") + raise Exception(f"Failed to get results from Neo4j for {output_file_name}") def main(): diff --git a/code/kg2/canonicalized/build_kg2c.py b/code/kg2/canonicalized/build_kg2c.py index b60df9b06..3f7119f6b 100644 --- a/code/kg2/canonicalized/build_kg2c.py +++ b/code/kg2/canonicalized/build_kg2c.py @@ -47,12 +47,12 @@ def _upload_output_files_to_s3(): tarball_path = f"{KG2C_DIR}/kg2c-tsv.tar.gz" json_file_path = f"{KG2C_DIR}/kg2c.json" json_lite_file_path = f"{KG2C_DIR}/kg2c_lite.json" - subprocess.call(f"tar -czvf {tarball_path} nodes_c.tsv nodes_c_header.tsv edges_c.tsv edges_c_header.tsv", shell=True) - subprocess.call(f"aws s3 cp --no-progress --region us-west-2 {tarball_path} s3://rtx-kg2/", shell=True) - subprocess.call(f"gzip -f {json_file_path}", shell=True) - subprocess.call(f"gzip -f {json_lite_file_path}", shell=True) - subprocess.call(f"aws s3 cp --no-progress --region us-west-2 {json_file_path}.gz s3://rtx-kg2/", shell=True) - subprocess.call(f"aws s3 cp --no-progress --region us-west-2 {json_lite_file_path}.gz s3://rtx-kg2/", shell=True) + subprocess.check_call(["tar", "-czvf", tarball_path, "nodes_c.tsv", "nodes_c_header.tsv", "edges_c.tsv", "edges_c_header.tsv"]) + subprocess.check_call(["aws", "s3", "cp", "--no-progress", "--region", "us-west-2", tarball_path, "s3://rtx-kg2/"]) + subprocess.check_call(["gzip", "-f", json_file_path]) + subprocess.check_call(["gzip", "-f", json_lite_file_path]) + subprocess.check_call(["aws", "s3", "cp", "--no-progress", "--region", "us-west-2", f"{json_file_path}.gz", "s3://rtx-kg2/"]) + subprocess.check_call(["aws", "s3", "cp", "--no-progress", "--region", "us-west-2", f"{json_lite_file_path}.gz", "s3://rtx-kg2/"]) def _print_log_message(message: str): @@ -84,7 +84,7 @@ def main(): # Build a new node synonymizer, if we're supposed to if build_synonymizer and not args.test: _print_log_message("Building node synonymizer off of specified KG2..") - subprocess.call(f"bash -x {KG2C_DIR}/build-synonymizer.sh", shell=True) + subprocess.check_call(["bash", "-x", f"{KG2C_DIR}/build-synonymizer.sh"]) # Actually build KG2c _print_log_message("Creating KG2c files..") @@ -96,7 +96,7 @@ def main(): _upload_output_files_to_s3() # Remove the config_local file we created (otherwise will always be used instead of configv2.json) - subprocess.call(f"rm {CODE_DIR}/config_local.json", shell=True) + subprocess.call(["rm", f"{CODE_DIR}/config_local.json"]) _print_log_message(f"DONE WITH KG2c BUILD! 
Took {round(((time.time() - start) / 60) / 60, 1)} hours") From 9b02cf7bfa9ffadfca22b556c699a649c9418a89 Mon Sep 17 00:00:00 2001 From: amykglen Date: Fri, 7 May 2021 15:40:44 -0700 Subject: [PATCH 03/15] Use logging library for KG2c build log --- code/kg2/canonicalized/build_kg2c.py | 34 ++++---- code/kg2/canonicalized/create_kg2c_files.py | 78 +++++++++---------- .../canonicalized/record_kg2c_meta_info.py | 40 +++++----- 3 files changed, 72 insertions(+), 80 deletions(-) diff --git a/code/kg2/canonicalized/build_kg2c.py b/code/kg2/canonicalized/build_kg2c.py index 3f7119f6b..03a03d965 100644 --- a/code/kg2/canonicalized/build_kg2c.py +++ b/code/kg2/canonicalized/build_kg2c.py @@ -8,6 +8,7 @@ Usage: python3 build_kg2c.py [--test] """ import argparse +import logging from datetime import datetime import json import os @@ -27,7 +28,7 @@ def _setup_rtx_config_local(kg2_neo4j_endpoint: str): # Create a config_local.json file based off of configv2.json, but modified for our needs - _print_log_message("Creating a config_local.json file pointed to the right KG2 Neo4j and synonymizer..") + logging.info("Creating a config_local.json file pointed to the right KG2 Neo4j and synonymizer..") RTXConfiguration() # Ensures we have a reasonably up-to-date configv2.json with open(f"{CODE_DIR}/configv2.json") as configv2_file: rtx_config_dict = json.load(configv2_file) @@ -38,12 +39,12 @@ def _setup_rtx_config_local(kg2_neo4j_endpoint: str): path_info["node_synonymizer"]["path"] = "/something/node_synonymizer.sqlite" with open(f"{CODE_DIR}/config_local.json", "w+") as config_local_file: json.dump(rtx_config_dict, config_local_file) - _print_log_message(f"KG2 neo4j bolt entry in config_local is now: " - f"{rtx_config_dict['Contextual']['KG2']['neo4j']['bolt']}") + logging.info(f"KG2 neo4j bolt entry in config_local is now: " + f"{rtx_config_dict['Contextual']['KG2']['neo4j']['bolt']}") def _upload_output_files_to_s3(): - _print_log_message("Uploading KG2c json and TSV files to S3..") + logging.info("Uploading KG2c json and TSV files to S3..") tarball_path = f"{KG2C_DIR}/kg2c-tsv.tar.gz" json_file_path = f"{KG2C_DIR}/kg2c.json" json_lite_file_path = f"{KG2C_DIR}/kg2c_lite.json" @@ -55,13 +56,12 @@ def _upload_output_files_to_s3(): subprocess.check_call(["aws", "s3", "cp", "--no-progress", "--region", "us-west-2", f"{json_lite_file_path}.gz", "s3://rtx-kg2/"]) -def _print_log_message(message: str): - current_time = datetime.utcfromtimestamp(time.time()).strftime('%H:%M:%S') - print(f"{current_time}: {message}") - - def main(): - _print_log_message("STARTING KG2c BUILD") + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + handlers=[logging.FileHandler("build.log"), + logging.StreamHandler()]) + logging.info("STARTING KG2c BUILD") start = time.time() # Grab any parameters passed to this script arg_parser = argparse.ArgumentParser() @@ -75,30 +75,30 @@ def main(): biolink_model_version = kg2c_config_info["biolink_model_version"] upload_to_s3 = kg2c_config_info["upload_to_s3"] build_synonymizer = kg2c_config_info["build_synonymizer"] - _print_log_message(f"Biolink model version to use is {biolink_model_version}") - _print_log_message(f"KG2 Neo4j to use is {kg2_neo4j_endpoint}") + logging.info(f"Biolink model version to use is {biolink_model_version}") + logging.info(f"KG2 Neo4j to use is {kg2_neo4j_endpoint}") # Set up an RTX config_local.json file that points to the right KG2 and synonymizer _setup_rtx_config_local(kg2_neo4j_endpoint) # Build a new node synonymizer, if 
we're supposed to if build_synonymizer and not args.test: - _print_log_message("Building node synonymizer off of specified KG2..") + logging.info("Building node synonymizer off of specified KG2..") subprocess.check_call(["bash", "-x", f"{KG2C_DIR}/build-synonymizer.sh"]) # Actually build KG2c - _print_log_message("Creating KG2c files..") + logging.info("Creating KG2c files..") create_kg2c_files(args.test) - _print_log_message("Recording meta KG info..") + logging.info("Recording meta KG info..") record_meta_kg_info(biolink_model_version, args.test) if upload_to_s3 and not args.test: - _print_log_message("Uploading KG2c files to S3..") + logging.info("Uploading KG2c files to S3..") _upload_output_files_to_s3() # Remove the config_local file we created (otherwise will always be used instead of configv2.json) subprocess.call(["rm", f"{CODE_DIR}/config_local.json"]) - _print_log_message(f"DONE WITH KG2c BUILD! Took {round(((time.time() - start) / 60) / 60, 1)} hours") + logging.info(f"DONE WITH KG2c BUILD! Took {round(((time.time() - start) / 60) / 60, 1)} hours") if __name__ == "__main__": diff --git a/code/kg2/canonicalized/create_kg2c_files.py b/code/kg2/canonicalized/create_kg2c_files.py index 94d1b977e..200e4f30b 100644 --- a/code/kg2/canonicalized/create_kg2c_files.py +++ b/code/kg2/canonicalized/create_kg2c_files.py @@ -7,6 +7,7 @@ import argparse import csv import json +import logging import os import pickle import random @@ -41,14 +42,14 @@ def _run_kg2_cypher_query(cypher_query: str) -> List[Dict[str, any]]: try: driver = GraphDatabase.driver(rtxc.neo4j_bolt, auth=(rtxc.neo4j_username, rtxc.neo4j_password)) with driver.session() as session: - _print_log_message(f" Sending cypher query to KG2 neo4j ({rtxc.neo4j_bolt})..") + logging.info(f" Sending cypher query to KG2 neo4j ({rtxc.neo4j_bolt})..") query_results = session.run(cypher_query).data() - _print_log_message(f" Got {len(query_results)} results back from neo4j") + logging.info(f" Got {len(query_results)} results back from neo4j") driver.close() except Exception: tb = traceback.format_exc() error_type, error, _ = sys.exc_info() - _print_log_message(f"ERROR: Encountered a problem interacting with {rtxc.neo4j_bolt}. {tb}") + logging.error(f"Encountered a problem interacting with {rtxc.neo4j_bolt}. 
{tb}") return [] else: return query_results @@ -59,7 +60,7 @@ def _convert_list_to_string_encoded_format(input_list_or_str: Union[List[str], s filtered_list = [item for item in input_list_or_str if item] # Get rid of any None items str_items = [item for item in filtered_list if isinstance(item, str)] if len(str_items) < len(filtered_list): - _print_log_message(f" WARNING: List contains non-str items (this is unexpected; I'll exclude them)") + logging.warning(f" List contains non-str items (this is unexpected; I'll exclude them)") return DELIMITER_CHAR.join(str_items) else: return input_list_or_str @@ -73,11 +74,6 @@ def _get_edge_key(subject: str, object: str, predicate: str) -> str: return f"{subject}--{predicate}--{object}" -def _print_log_message(message: str): - current_time = datetime.utcfromtimestamp(time.time()).strftime('%H:%M:%S') - print(f"{current_time}: {message}") - - def _clean_up_description(description: str) -> str: # Removes all of the "UMLS Semantic Type: UMLS_STY:XXXX;" bits from descriptions return re.sub("UMLS Semantic Type: UMLS_STY:[a-zA-Z][0-9]{3}[;]?", "", description).strip().strip(";") @@ -159,13 +155,13 @@ def _create_edge(subject: str, object: str, predicate: str, provided_by: List[st def _write_list_to_neo4j_ready_tsv(input_list: List[Dict[str, any]], file_name_root: str, is_test: bool): # Converts a list into the specific format Neo4j wants (string with delimiter) - _print_log_message(f" Creating {file_name_root} header file..") + logging.info(f" Creating {file_name_root} header file..") column_headers = list(input_list[0].keys()) modified_headers = _modify_column_headers_for_neo4j(column_headers, file_name_root) with open(f"{KG2C_DIR}/{'test_' if is_test else ''}{file_name_root}_header.tsv", "w+") as header_file: dict_writer = csv.DictWriter(header_file, modified_headers, delimiter='\t') dict_writer.writeheader() - _print_log_message(f" Creating {file_name_root} file..") + logging.info(f" Creating {file_name_root} file..") with open(f"{KG2C_DIR}/{'test_' if is_test else ''}{file_name_root}.tsv", "w+") as data_file: dict_writer = csv.DictWriter(data_file, column_headers, delimiter='\t') dict_writer.writerows(input_list) @@ -173,7 +169,7 @@ def _write_list_to_neo4j_ready_tsv(input_list: List[Dict[str, any]], file_name_r def create_kg2c_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any]], canonicalized_edges_dict: Dict[str, Dict[str, any]], is_test: bool): - _print_log_message(f" Creating KG2c JSON file..") + logging.info(f" Creating KG2c JSON file..") kgx_format_json = {"nodes": list(canonicalized_nodes_dict.values()), "edges": list(canonicalized_edges_dict.values())} with open(f"{KG2C_DIR}/kg2c{'_test' if is_test else ''}.json", "w+") as output_file: @@ -182,7 +178,7 @@ def create_kg2c_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any]], def create_kg2c_lite_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any]], canonicalized_edges_dict: Dict[str, Dict[str, any]], is_test: bool): - _print_log_message(f" Creating KG2c lite JSON file..") + logging.info(f" Creating KG2c lite JSON file..") # Filter out all except these properties so we create a lightweight KG node_lite_properties = ["id", "name", "category", "expanded_categories"] edge_lite_properties = ["id", "predicate", "subject", "object", "provided_by", "publications"] @@ -199,13 +195,13 @@ def create_kg2c_lite_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any lite_kg["edges"].append(lite_edge) # Save this lite KG to a JSON file - _print_log_message(f" Saving lite 
json...") + logging.info(f" Saving lite json...") with open(f"{KG2C_DIR}/kg2c_lite{'_test' if is_test else ''}.json", "w+") as output_file: json.dump(lite_kg, output_file) def create_kg2c_sqlite_db(canonicalized_nodes_dict: Dict[str, Dict[str, any]], is_test: bool): - _print_log_message(" Creating KG2c sqlite database..") + logging.info(" Creating KG2c sqlite database..") db_name = f"kg2c{'_test' if is_test else ''}.sqlite" # Remove any preexisting version of this database if os.path.exists(db_name): @@ -218,7 +214,7 @@ def create_kg2c_sqlite_db(canonicalized_nodes_dict: Dict[str, Dict[str, any]], i connection.execute("CREATE UNIQUE INDEX node_id_index ON nodes (id)") connection.commit() cursor = connection.execute(f"SELECT COUNT(*) FROM nodes") - _print_log_message(f" Done creating nodes table; contains {cursor.fetchone()[0]} rows.") + logging.info(f" Done creating nodes table; contains {cursor.fetchone()[0]} rows.") cursor.close() connection.close() @@ -241,7 +237,7 @@ def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]], canonicalized_edge['object_for_conversion'] = canonicalized_edge['object'] # Finally dump all our nodes/edges into TSVs (formatted for neo4j) - _print_log_message(f" Creating TSVs for Neo4j..") + logging.info(f" Creating TSVs for Neo4j..") _write_list_to_neo4j_ready_tsv(list(canonicalized_nodes_dict.values()), "nodes_c", is_test) _write_list_to_neo4j_ready_tsv(list(canonicalized_edges_dict.values()), "edges_c", is_test) @@ -249,16 +245,16 @@ def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]], def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]: synonymizer = NodeSynonymizer() node_ids = [node.get('id') for node in neo4j_nodes if node.get('id')] - _print_log_message(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..") + logging.info(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..") canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True) all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info} - _print_log_message(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..") + logging.info(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..") equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies) recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)} equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies} with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump: # Save these for use by downstream script pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL) - _print_log_message(f" Creating canonicalized nodes..") + logging.info(f" Creating canonicalized nodes..") curie_map = dict() canonicalized_nodes = dict() for neo4j_node in neo4j_nodes: @@ -283,7 +279,7 @@ def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Di name = canonical_info['preferred_name'] if canonical_info else neo4j_node['name'] category = canonical_info['preferred_category'] if canonical_info else neo4j_node['category'] if not category.startswith("biolink:"): - _print_log_message(f" WARNING: Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}") 
+ logging.warning(f" Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}") all_categories = list(canonical_info['all_categories']) if canonical_info else [neo4j_node['category']] expanded_categories = list(canonical_info['expanded_categories']) if canonical_info else [neo4j_node['category']] iri = neo4j_node['iri'] if neo4j_node['id'] == canonicalized_curie else None @@ -291,11 +287,11 @@ def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Di # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format if not all(category.startswith("biolink:") for category in all_categories): - _print_log_message(f" WARNING: all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " - f"items: {all_categories}") + logging.warning(f" all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " + f"items: {all_categories}") if not all(category.startswith("biolink:") for category in expanded_categories): - _print_log_message(f" WARNING: expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " - f"items: {expanded_categories}") + logging.warning(f" expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " + f"items: {expanded_categories}") canonicalized_node = _create_node(preferred_curie=canonicalized_curie, name=name, @@ -350,28 +346,28 @@ def create_kg2c_files(is_test=False): canonicalizes the nodes, merges edges (based on subject, object, predicate), and saves the resulting canonicalized graph in multiple file formats: JSON, sqlite, and TSV (ready for import into Neo4j). """ - _print_log_message(f" Extracting nodes from KG2..") + logging.info(f" Extracting nodes from KG2..") nodes_query = f"match (n) return n.id as id, n.name as name, n.category as category, " \ f"n.publications as publications, n.iri as iri, n.description as description{' limit 5000' if is_test else ''}" neo4j_nodes = _run_kg2_cypher_query(nodes_query) if neo4j_nodes: - _print_log_message(f" Canonicalizing nodes..") + logging.info(f" Canonicalizing nodes..") canonicalized_nodes_dict, curie_map = _canonicalize_nodes(neo4j_nodes) - _print_log_message(f" Number of KG2 nodes was reduced to {len(canonicalized_nodes_dict)} ({round((len(canonicalized_nodes_dict) / len(neo4j_nodes)) * 100)}%)") + logging.info(f" Number of KG2 nodes was reduced to {len(canonicalized_nodes_dict)} ({round((len(canonicalized_nodes_dict) / len(neo4j_nodes)) * 100)}%)") else: - _print_log_message(f"ERROR: Couldn't get node data from KG2 neo4j.") + logging.error(f"Couldn't get node data from KG2 neo4j.") return - _print_log_message(f" Extracting edges from KG2..") + logging.info(f" Extracting edges from KG2..") edges_query = f"match (n)-[e]->(m) return n.id as subject, m.id as object, e.predicate as " \ f"predicate, e.provided_by as provided_by, e.publications as publications, e.id as id" \ f"{' limit 20000' if is_test else ''}" neo4j_edges = _run_kg2_cypher_query(edges_query) if neo4j_edges: - _print_log_message(f" Canonicalizing edges..") + logging.info(f" Canonicalizing edges..") canonicalized_edges_dict = _canonicalize_edges(neo4j_edges, curie_map, is_test) - _print_log_message(f" Number of KG2 edges was reduced to {len(canonicalized_edges_dict)} ({round((len(canonicalized_edges_dict) / len(neo4j_edges)) * 100)}%)") + logging.info(f" Number of KG2 edges was reduced to {len(canonicalized_edges_dict)} ({round((len(canonicalized_edges_dict) / len(neo4j_edges)) * 100)}%)") else: - 
_print_log_message(f"ERROR: Couldn't get edge data from KG2 neo4j.") + logging.error(f"Couldn't get edge data from KG2 neo4j.") return # Create a node containing information about this KG2C build @@ -391,18 +387,18 @@ def create_kg2c_files(is_test=False): descriptions_list=[]) canonicalized_nodes_dict[kg2c_build_node['id']] = kg2c_build_node else: - _print_log_message(f" WARNING: No build node detected in the regular KG2, so I'm not creating a KG2c build node.") + logging.warning(f" No build node detected in the regular KG2, so I'm not creating a KG2c build node.") # Choose best descriptions using Chunyu's NLP-based method node_ids = list(canonicalized_nodes_dict) description_lists = [canonicalized_nodes_dict[node_id]["descriptions_list"] for node_id in node_ids] num_cpus = os.cpu_count() - _print_log_message(f" Detected {num_cpus} cpus; will use all of them to choose best descriptions") + logging.info(f" Detected {num_cpus} cpus; will use all of them to choose best descriptions") pool = Pool(num_cpus) - _print_log_message(f" Starting to use Chunyu's NLP-based method to choose best descriptions (in parallel)..") + logging.info(f" Starting to use Chunyu's NLP-based method to choose best descriptions (in parallel)..") start = time.time() best_descriptions = pool.map(_get_best_description, description_lists) - _print_log_message(f" Choosing best descriptions took {round((time.time() - start) / 60, 2)} minutes") + logging.info(f" Choosing best descriptions took {round((time.time() - start) / 60, 2)} minutes") # Actually decorate nodes with their 'best' description for num in range(len(node_ids)): node_id = node_ids[num] @@ -411,13 +407,13 @@ def create_kg2c_files(is_test=False): del canonicalized_nodes_dict[node_id]["descriptions_list"] # Do some final clean-up/formatting of nodes, now that all merging is done - _print_log_message(f" Doing final clean-up/formatting of nodes") + logging.info(f" Doing final clean-up/formatting of nodes") for node_id, node in canonicalized_nodes_dict.items(): # Sort all of our list properties (nicer for users that way) for array_property_name in ARRAY_NODE_PROPERTIES: node[array_property_name] = sorted([item for item in node[array_property_name] if item]) node["publications"] = node["publications"][:10] # We don't need a ton of publications, so truncate them - _print_log_message(f" Doing final clean-up/formatting of edges") + logging.info(f" Doing final clean-up/formatting of edges") # Convert our edge IDs to integers (to save space downstream) and add them as actual properties on the edges edge_num = 1 for edge_id, edge in sorted(canonicalized_edges_dict.items()): @@ -440,10 +436,10 @@ def main(): arg_parser.add_argument('--test', dest='test', action='store_true', default=False) args = arg_parser.parse_args() - _print_log_message(f"Starting to create KG2canonicalized..") + logging.info(f"Starting to create KG2canonicalized..") start = time.time() create_kg2c_files(args.test) - _print_log_message(f"Done! Took {round(((time.time() - start) / 60) / 60, 2)} hours.") + logging.info(f"Done! 
Took {round(((time.time() - start) / 60) / 60, 2)} hours.") if __name__ == "__main__": diff --git a/code/kg2/canonicalized/record_kg2c_meta_info.py b/code/kg2/canonicalized/record_kg2c_meta_info.py index 1f091317f..5539d8339 100644 --- a/code/kg2/canonicalized/record_kg2c_meta_info.py +++ b/code/kg2/canonicalized/record_kg2c_meta_info.py @@ -7,6 +7,7 @@ """ import argparse import json +import logging import os import pickle import sqlite3 @@ -22,11 +23,6 @@ KG2C_DIR = f"{os.path.dirname(os.path.abspath(__file__))}" -def _print_log_message(message: str): - current_time = datetime.utcfromtimestamp(time.time()).strftime('%H:%M:%S') - print(f"{current_time}: {message}") - - def serialize_with_sets(obj: any) -> any: # Thank you https://stackoverflow.com/a/60544597 if isinstance(obj, set): @@ -60,7 +56,7 @@ def _convert_to_trapi_predicate_format(english_predicate: str) -> str: def _create_expanded_predicates_maps(biolink_version: str) -> Tuple[DefaultDict[str, set], Dict[str, str]]: # Build maps of predicate ancestors and inverses, since KG2c considers these when answering queries - _print_log_message("Generating ancestor/inverse predicates maps..") + logging.info("Generating ancestor/inverse predicates maps..") # First load the biolink model into a tree root_predicate = "biolink:related_to" @@ -86,8 +82,8 @@ def _create_expanded_predicates_maps(biolink_version: str) -> Tuple[DefaultDict[ biolink_tree.create_node(root_predicate, root_predicate) _create_tree_recursive(root_predicate, parent_to_child_dict, biolink_tree) else: - _print_log_message(f"WARNING: Unable to load Biolink yaml file. Will not be able to factor predicate ancestors" - f" or inverses into meta triples.") + logging.warning(f"Unable to load Biolink yaml file. Will not be able to factor predicate ancestors " + f"or inverses into meta triples.") # Then use the biolink tree to build up a more convenient map of predicate ancestors ancestors_map = defaultdict(set) @@ -101,7 +97,7 @@ def _create_expanded_predicates_maps(biolink_version: str) -> Tuple[DefaultDict[ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]], meta_kg_file_name: str, label_property_name: str, biolink_model_version: str, is_test: bool): predicate_ancestors, inverses_map = _create_expanded_predicates_maps(biolink_model_version) - _print_log_message("Gathering all meta triples..") + logging.info("Gathering all meta triples..") meta_triples = set() for edge in edges_by_id.values(): subject_node_id = edge["subject"] @@ -123,9 +119,9 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, for inverse_ancestor in inverse_ancestors: meta_triples.add((object_category, inverse_ancestor, subject_category)) meta_edges = [{"subject": triple[0], "predicate": triple[1], "object": triple[2]} for triple in meta_triples] - _print_log_message(f"Created {len(meta_edges)} meta edges") + logging.info(f"Created {len(meta_edges)} meta edges") - _print_log_message("Gathering all meta nodes..") + logging.info("Gathering all meta nodes..") with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file: equivalent_curies_dict = pickle.load(equiv_curies_file) meta_nodes = defaultdict(lambda: defaultdict(lambda: set())) @@ -135,9 +131,9 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, categories = node[label_property_name] for category in categories: meta_nodes[category]["id_prefixes"].update(prefixes) - _print_log_message(f"Created {len(meta_nodes)} meta nodes") + 
logging.info(f"Created {len(meta_nodes)} meta nodes") - _print_log_message("Saving meta KG to JSON file..") + logging.info("Saving meta KG to JSON file..") meta_kg = {"nodes": meta_nodes, "edges": meta_edges} with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file: json.dump(meta_kg, meta_kg_file, default=serialize_with_sets) @@ -145,7 +141,7 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]], sqlite_file_name: str, label_property_name: str, is_test: bool): - _print_log_message("Counting up node neighbors by category..") + logging.info("Counting up node neighbors by category..") # First gather neighbors of each node by label/category neighbors_by_label = defaultdict(lambda: defaultdict(lambda: set())) for edge in edges_by_id.values(): @@ -166,7 +162,7 @@ def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_ neighbor_counts[node_id][label] = len(neighbor_ids) # Then write these counts to the sqlite file - _print_log_message(f"Saving neighbor counts (for {len(neighbor_counts)} nodes) to sqlite..") + logging.info(f"Saving neighbor counts (for {len(neighbor_counts)} nodes) to sqlite..") connection = sqlite3.connect(sqlite_file_name) connection.execute("DROP TABLE IF EXISTS neighbors") connection.execute("CREATE TABLE neighbors (id TEXT, neighbor_counts TEXT)") @@ -175,14 +171,14 @@ def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_ connection.execute("CREATE UNIQUE INDEX node_neighbor_index ON neighbors (id)") connection.commit() cursor = connection.execute(f"SELECT COUNT(*) FROM neighbors") - _print_log_message(f"Done adding neighbor counts to sqlite; neighbors table contains {cursor.fetchone()[0]} rows") + logging.info(f"Done adding neighbor counts to sqlite; neighbors table contains {cursor.fetchone()[0]} rows") cursor.close() connection.close() def add_category_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], sqlite_file_name: str, label_property_name: str): - _print_log_message("Counting up nodes by category..") + logging.info("Counting up nodes by category..") # Organize node IDs by their categories/labels nodes_by_label = defaultdict(set) for node_id, node in nodes_by_id.items(): @@ -190,7 +186,7 @@ def add_category_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], sqlite nodes_by_label[category].add(node_id) # Then write these counts to the sqlite file - _print_log_message(f"Saving category counts (for {len(nodes_by_label)} categories) to sqlite..") + logging.info(f"Saving category counts (for {len(nodes_by_label)} categories) to sqlite..") connection = sqlite3.connect(sqlite_file_name) connection.execute("DROP TABLE IF EXISTS category_counts") connection.execute("CREATE TABLE category_counts (category TEXT, count INTEGER)") @@ -199,7 +195,7 @@ def add_category_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], sqlite connection.execute("CREATE UNIQUE INDEX category_index ON category_counts (category)") connection.commit() cursor = connection.execute(f"SELECT COUNT(*) FROM category_counts") - _print_log_message(f"Done adding category counts to sqlite; category_counts table contains " + logging.info(f"Done adding category counts to sqlite; category_counts table contains " f"{cursor.fetchone()[0]} rows") cursor.close() connection.close() @@ -213,7 +209,7 @@ def record_meta_kg_info(biolink_version: str, is_test: bool): start = time.time() with 
open(f"{KG2C_DIR}/{input_kg_file_name}", "r") as input_kg_file: - _print_log_message(f"Loading {input_kg_file_name} into memory..") + logging.info(f"Loading {input_kg_file_name} into memory..") kg2c_dict = json.load(input_kg_file) nodes_by_id = {node["id"]: node for node in kg2c_dict["nodes"]} edges_by_id = {edge["id"]: edge for edge in kg2c_dict["edges"]} @@ -223,11 +219,11 @@ def record_meta_kg_info(biolink_version: str, is_test: bool): add_neighbor_counts_to_sqlite(nodes_by_id, edges_by_id, sqlite_file_name, label_property_name, is_test) add_category_counts_to_sqlite(nodes_by_id, sqlite_file_name, label_property_name) - _print_log_message(f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes.") + logging.info(f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes.") def main(): - _print_log_message("Starting to record KG2c meta info..") + logging.info("Starting to record KG2c meta info..") arg_parser = argparse.ArgumentParser() arg_parser.add_argument("biolink_model_version", type=str) arg_parser.add_argument("--test", dest="test", action='store_true', default=False) From 40a328b4d984f7f91afa4a36b857be472f463d43 Mon Sep 17 00:00:00 2001 From: amykglen Date: Fri, 7 May 2021 15:49:29 -0700 Subject: [PATCH 04/15] Configure logging to work when scripts are run standalone --- code/kg2/canonicalized/create_kg2c_files.py | 4 ++++ code/kg2/canonicalized/record_kg2c_meta_info.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/code/kg2/canonicalized/create_kg2c_files.py b/code/kg2/canonicalized/create_kg2c_files.py index 200e4f30b..bc8ad5301 100644 --- a/code/kg2/canonicalized/create_kg2c_files.py +++ b/code/kg2/canonicalized/create_kg2c_files.py @@ -432,6 +432,10 @@ def create_kg2c_files(is_test=False): def main(): + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + handlers=[logging.FileHandler("createkg2cfiles.log"), + logging.StreamHandler()]) arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', action='store_true', default=False) args = arg_parser.parse_args() diff --git a/code/kg2/canonicalized/record_kg2c_meta_info.py b/code/kg2/canonicalized/record_kg2c_meta_info.py index 5539d8339..2de28e733 100644 --- a/code/kg2/canonicalized/record_kg2c_meta_info.py +++ b/code/kg2/canonicalized/record_kg2c_meta_info.py @@ -223,6 +223,10 @@ def record_meta_kg_info(biolink_version: str, is_test: bool): def main(): + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + handlers=[logging.FileHandler("metainfo.log"), + logging.StreamHandler()]) logging.info("Starting to record KG2c meta info..") arg_parser = argparse.ArgumentParser() arg_parser.add_argument("biolink_model_version", type=str) From 7e8adb3ff392298b7adcda39d57b9aeee7a5b67a Mon Sep 17 00:00:00 2001 From: isbluis Date: Fri, 7 May 2021 22:54:09 +0000 Subject: [PATCH 05/15] Add temporary warning that this is TRAPI 1.0 with link to 1.1 interface --- code/UI/interactive/index.html | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code/UI/interactive/index.html b/code/UI/interactive/index.html index cdc9e7f26..ca62bd031 100644 --- a/code/UI/interactive/index.html +++ b/code/UI/interactive/index.html @@ -56,6 +56,12 @@




+        TRAPI 1.0
+        This is the interface based on TRAPI 1.0. Click here to go to the TRAPI 1.1-based interface
 
         Session History (-)
 
         Your query history will be displayed here. It can be edited or re-set.
 
From 61197166ed72819f5f020a1d3711c8ec5758b4c3 Mon Sep 17 00:00:00 2001 From: amykglen Date: Fri, 7 May 2021 16:20:13 -0700 Subject: [PATCH 06/15] Restore any preexisting config_local.json (vs. overwriting) --- code/kg2/canonicalized/README.md | 12 +++++------- code/kg2/canonicalized/build_kg2c.py | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/code/kg2/canonicalized/README.md b/code/kg2/canonicalized/README.md index 077d5ad10..c70f37c44 100644 --- a/code/kg2/canonicalized/README.md +++ b/code/kg2/canonicalized/README.md @@ -67,14 +67,12 @@ In creating KG2c, edges from the regular KG2 are remapped to use only 'preferred 1. Install AWS CLI: `sudo apt-get install -y awscli` 1. And configure it: `aws configure` 1. Locally modify `kg2c_config.json` (in `RTX/code/kg2/canonicalized/`) for your particular needs - - Most importantly, be sure to specify the Neo4j endpoint for the KG2 you want to build this KG2c from under the `"kg2_neo4j"` slot - - The Biolink model version specified should match that used by the KG2 you specify -1. If you do **not** want a new `NodeSynonymizer` to be built (i.e., you already have a synonymizer made from the KG2 this KG2c will be built from): - 1. Ensure your synonymizer file is in `RTX/code/ARAX/NodeSynonymizer/` and is named `node_synonymizer.sqlite` - 1. Make sure to specify `false` for the `"build_synonymizer"` option in `kg2c_config.json` -1. Then build KG2c (should take a few hours and around 80GB of RAM): + - Most importantly, be sure to specify the **Neo4j endpoint** for the KG2 you want to build this KG2c from under the `"kg2_neo4j"` slot + - Make sure the Biolink model version specified matches that used by the KG2 you specified + - Indicate whether or not you want a new NodeSynonymizer to be built + - If you do **not** want a new `NodeSynonymizer` to be built (i.e., you already have a synonymizer made from the KG2 this KG2c will be built from), ensure your synonymizer file is in `RTX/code/ARAX/NodeSynonymizer/` and is named `node_synonymizer.sqlite` +1. Then build KG2c (should take around 5-10 hours and 130GB of RAM): - `python3 RTX/code/kg2/canonicalized/build_kg2c.py` - - *WARNING: If you happen to already have a custom `config_local.json` file, this will override it; make a copy if you don't want to lose it* In the end, KG2c will be created and stored in multiple file formats, including TSVs ready for import into Neo4j. diff --git a/code/kg2/canonicalized/build_kg2c.py b/code/kg2/canonicalized/build_kg2c.py index 03a03d965..bcb75da58 100644 --- a/code/kg2/canonicalized/build_kg2c.py +++ b/code/kg2/canonicalized/build_kg2c.py @@ -3,12 +3,11 @@ This script creates a canonicalized version of KG2 stored in various file formats, including TSVs ready for import into Neo4j. Files are created in the directory this script is in. It relies on the options you specify in kg2c_config.json; in particular, the KG2c will be built off of the KG2 endpoint you specify in that config file. -WARNING: If you happen to already have a custom version of config_local.json on your machine, this script will override -it; make a copy if you don't want to lose it. 
Usage: python3 build_kg2c.py [--test] """ import argparse import logging +import pathlib from datetime import datetime import json import os @@ -32,15 +31,17 @@ def _setup_rtx_config_local(kg2_neo4j_endpoint: str): RTXConfiguration() # Ensures we have a reasonably up-to-date configv2.json with open(f"{CODE_DIR}/configv2.json") as configv2_file: rtx_config_dict = json.load(configv2_file) - # Point to the 'right' KG2 (the one specified in the KG2c config) + # Point to the 'right' KG2 (the one specified in the KG2c config) and synonymizer (we always use simple name) rtx_config_dict["Contextual"]["KG2"]["neo4j"]["bolt"] = f"bolt://{kg2_neo4j_endpoint}:7687" - # Point to the 'right' synonymizer (we'll always use the basic name, and don't need full arax.ncats.io path) for mode, path_info in rtx_config_dict["Contextual"].items(): - path_info["node_synonymizer"]["path"] = "/something/node_synonymizer.sqlite" + path_info["node_synonymizer"]["path"] = "/something/node_synonymizer.sqlite" # Only need name, not full path + # Save a copy of any pre-existing config_local.json so we don't overwrite it + original_config_local_file = pathlib.Path(f"{CODE_DIR}/config_local.json") + if original_config_local_file.exists(): + subprocess.check_call(["cp", f"{CODE_DIR}/config_local.json", f"{CODE_DIR}/config_local.json_KG2CBUILDTEMP"]) + # Save our new config_local.json file with open(f"{CODE_DIR}/config_local.json", "w+") as config_local_file: json.dump(rtx_config_dict, config_local_file) - logging.info(f"KG2 neo4j bolt entry in config_local is now: " - f"{rtx_config_dict['Contextual']['KG2']['neo4j']['bolt']}") def _upload_output_files_to_s3(): @@ -95,8 +96,11 @@ def main(): logging.info("Uploading KG2c files to S3..") _upload_output_files_to_s3() - # Remove the config_local file we created (otherwise will always be used instead of configv2.json) + # Remove the config_local file we created and put original config_local back in place (if there was one) subprocess.call(["rm", f"{CODE_DIR}/config_local.json"]) + original_config_local_file = pathlib.Path(f"{CODE_DIR}/config_local.json_KG2CBUILDTEMP") + if original_config_local_file.exists(): + subprocess.check_call(["mv", f"{CODE_DIR}/config_local.json_KG2CBUILDTEMP", f"{CODE_DIR}/config_local.json"]) logging.info(f"DONE WITH KG2c BUILD! Took {round(((time.time() - start) / 60) / 60, 1)} hours") From 36b20cebf6fb61d19d515a9f43101a094c3aef4a Mon Sep 17 00:00:00 2001 From: amykglen Date: Fri, 7 May 2021 16:58:13 -0700 Subject: [PATCH 07/15] Tweak build set-up steps in KG2c README --- code/kg2/canonicalized/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/code/kg2/canonicalized/README.md b/code/kg2/canonicalized/README.md index c70f37c44..53985a853 100644 --- a/code/kg2/canonicalized/README.md +++ b/code/kg2/canonicalized/README.md @@ -62,10 +62,11 @@ In creating KG2c, edges from the regular KG2 are remapped to use only 'preferred ### Build KG2canonicalized -1. Follow steps 1-3 in [this section](https://github.com/RTXteam/RTX/wiki/Dev-info#setting-up-for-local-dev-work-on-arax) of the ARAX dev wiki, if you haven't already -1. If you wish to upload your eventual output KG2c files to S3: - 1. Install AWS CLI: `sudo apt-get install -y awscli` - 1. And configure it: `aws configure` +1. If the machine you'll be using has never previously built a KG2c: + 1. Follow steps 1-3 in [this section](https://github.com/RTXteam/RTX/wiki/Dev-info#setting-up-for-local-dev-work-on-arax) of the ARAX dev wiki + 1. 
If you wish to upload your eventual output KG2c files to S3: + 1. Install AWS CLI: `sudo apt-get install -y awscli` + 1. And configure it: `aws configure` 1. Locally modify `kg2c_config.json` (in `RTX/code/kg2/canonicalized/`) for your particular needs - Most importantly, be sure to specify the **Neo4j endpoint** for the KG2 you want to build this KG2c from under the `"kg2_neo4j"` slot - Make sure the Biolink model version specified matches that used by the KG2 you specified From a5bdcee0c64c81bbf8e2018907584e3276497e86 Mon Sep 17 00:00:00 2001 From: amykglen Date: Fri, 7 May 2021 23:30:12 -0700 Subject: [PATCH 08/15] Minor tweaks towards using less memory --- code/kg2/canonicalized/create_kg2c_files.py | 35 +++++++-------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/code/kg2/canonicalized/create_kg2c_files.py b/code/kg2/canonicalized/create_kg2c_files.py index bc8ad5301..fda70169d 100644 --- a/code/kg2/canonicalized/create_kg2c_files.py +++ b/code/kg2/canonicalized/create_kg2c_files.py @@ -6,6 +6,7 @@ """ import argparse import csv +import gc import json import logging import os @@ -67,7 +68,8 @@ def _convert_list_to_string_encoded_format(input_list_or_str: Union[List[str], s def _merge_two_lists(list_a: List[any], list_b: List[any]) -> List[any]: - return list(set(list_a + list_b)) + unique_items = list(set(list_a + list_b)) + return [item for item in unique_items if item] def _get_edge_key(subject: str, object: str, predicate: str) -> str: @@ -278,21 +280,10 @@ def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Di # Initiate the canonical node for this synonym group name = canonical_info['preferred_name'] if canonical_info else neo4j_node['name'] category = canonical_info['preferred_category'] if canonical_info else neo4j_node['category'] - if not category.startswith("biolink:"): - logging.warning(f" Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}") all_categories = list(canonical_info['all_categories']) if canonical_info else [neo4j_node['category']] expanded_categories = list(canonical_info['expanded_categories']) if canonical_info else [neo4j_node['category']] iri = neo4j_node['iri'] if neo4j_node['id'] == canonicalized_curie else None all_names = [neo4j_node['name']] - - # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format - if not all(category.startswith("biolink:") for category in all_categories): - logging.warning(f" all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " - f"items: {all_categories}") - if not all(category.startswith("biolink:") for category in expanded_categories): - logging.warning(f" expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' " - f"items: {expanded_categories}") - canonicalized_node = _create_node(preferred_curie=canonicalized_curie, name=name, category=category, @@ -373,6 +364,8 @@ def create_kg2c_files(is_test=False): # Create a node containing information about this KG2C build kg2_build_node = canonicalized_nodes_dict.get('RTX:KG2') if kg2_build_node: + description = f"This KG2c build was created from {kg2_build_node['name']} on " \ + f"{datetime.now().strftime('%Y-%m-%d %H:%M')}." 
kg2c_build_node = _create_node(preferred_curie=f"{kg2_build_node['id']}c", name=f"{kg2_build_node['name']}c", all_categories=kg2_build_node['all_categories'], @@ -382,9 +375,8 @@ def create_kg2c_files(is_test=False): publications=[], iri=f"{kg2_build_node['iri']}c", all_names=[f"{kg2_build_node['name']}c"], - description=f"This KG2c build was created from {kg2_build_node['name']} on " - f"{datetime.now().strftime('%Y-%m-%d %H:%M')}.", - descriptions_list=[]) + description=description, + descriptions_list=[description]) canonicalized_nodes_dict[kg2c_build_node['id']] = kg2c_build_node else: logging.warning(f" No build node detected in the regular KG2, so I'm not creating a KG2c build node.") @@ -398,30 +390,27 @@ def create_kg2c_files(is_test=False): logging.info(f" Starting to use Chunyu's NLP-based method to choose best descriptions (in parallel)..") start = time.time() best_descriptions = pool.map(_get_best_description, description_lists) - logging.info(f" Choosing best descriptions took {round((time.time() - start) / 60, 2)} minutes") + logging.info(f" Choosing best descriptions took {round(((time.time() - start) / 60) / 60, 2)} hours") # Actually decorate nodes with their 'best' description for num in range(len(node_ids)): node_id = node_ids[num] best_description = best_descriptions[num] canonicalized_nodes_dict[node_id]["description"] = best_description del canonicalized_nodes_dict[node_id]["descriptions_list"] + del description_lists + del best_descriptions + gc.collect() # Do some final clean-up/formatting of nodes, now that all merging is done logging.info(f" Doing final clean-up/formatting of nodes") for node_id, node in canonicalized_nodes_dict.items(): - # Sort all of our list properties (nicer for users that way) - for array_property_name in ARRAY_NODE_PROPERTIES: - node[array_property_name] = sorted([item for item in node[array_property_name] if item]) node["publications"] = node["publications"][:10] # We don't need a ton of publications, so truncate them logging.info(f" Doing final clean-up/formatting of edges") # Convert our edge IDs to integers (to save space downstream) and add them as actual properties on the edges edge_num = 1 - for edge_id, edge in sorted(canonicalized_edges_dict.items()): + for edge_id, edge in canonicalized_edges_dict.items(): edge["id"] = edge_num edge_num += 1 - # Sort all of our list properties (nicer for users that way) - for array_property_name in ARRAY_EDGE_PROPERTIES: - edge[array_property_name] = sorted([item for item in edge[array_property_name] if item]) edge["publications"] = edge["publications"][:20] # We don't need a ton of publications, so truncate them # Actually create all of our output files (different formats for storing KG2c) From 073db6479081959acece7192a8d3f518dc500b19 Mon Sep 17 00:00:00 2001 From: amykglen Date: Sat, 8 May 2021 09:13:21 -0700 Subject: [PATCH 09/15] Minor clean-up --- code/kg2/canonicalized/README.md | 2 +- code/kg2/canonicalized/build_kg2c.py | 1 - code/kg2/canonicalized/create_kg2c_files.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/code/kg2/canonicalized/README.md b/code/kg2/canonicalized/README.md index 53985a853..a95d6c4e5 100644 --- a/code/kg2/canonicalized/README.md +++ b/code/kg2/canonicalized/README.md @@ -72,7 +72,7 @@ In creating KG2c, edges from the regular KG2 are remapped to use only 'preferred - Make sure the Biolink model version specified matches that used by the KG2 you specified - Indicate whether or not you want a new NodeSynonymizer to be built - If you do **not** want 
a new `NodeSynonymizer` to be built (i.e., you already have a synonymizer made from the KG2 this KG2c will be built from), ensure your synonymizer file is in `RTX/code/ARAX/NodeSynonymizer/` and is named `node_synonymizer.sqlite` -1. Then build KG2c (should take around 5-10 hours and 130GB of RAM): +1. Then build KG2c (should take around 5-10 hours and 200GB of RAM): - `python3 RTX/code/kg2/canonicalized/build_kg2c.py` In the end, KG2c will be created and stored in multiple file formats, including TSVs ready for import into Neo4j. diff --git a/code/kg2/canonicalized/build_kg2c.py b/code/kg2/canonicalized/build_kg2c.py index bcb75da58..d532b2cdc 100644 --- a/code/kg2/canonicalized/build_kg2c.py +++ b/code/kg2/canonicalized/build_kg2c.py @@ -93,7 +93,6 @@ def main(): logging.info("Recording meta KG info..") record_meta_kg_info(biolink_model_version, args.test) if upload_to_s3 and not args.test: - logging.info("Uploading KG2c files to S3..") _upload_output_files_to_s3() # Remove the config_local file we created and put original config_local back in place (if there was one) diff --git a/code/kg2/canonicalized/create_kg2c_files.py b/code/kg2/canonicalized/create_kg2c_files.py index fda70169d..05c9d0176 100644 --- a/code/kg2/canonicalized/create_kg2c_files.py +++ b/code/kg2/canonicalized/create_kg2c_files.py @@ -11,7 +11,6 @@ import logging import os import pickle -import random import re import sqlite3 import sys From 73f556a63835124cc7cfba0a4f27b18cc9eb7c11 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Sat, 8 May 2021 14:40:10 -0400 Subject: [PATCH 10/15] add 'biolink:Metabolite' as drug --- .../GraphSage_train/py_scripts/pull_canonicalized_KG2C.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/pull_canonicalized_KG2C.py b/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/pull_canonicalized_KG2C.py index 5364ea724..1b0b97ffe 100644 --- a/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/pull_canonicalized_KG2C.py +++ b/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/pull_canonicalized_KG2C.py @@ -21,15 +21,15 @@ ######### Please ignore this part until Eric finds a better way to categorize these nodes with ambiguous node type ########### # !Note: Before running the below code, please first check this DSL query, if there is returned value > 0, report error on github. 
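The sanity check described in the note above (the DSL query itself appears in the diff just below) could be scripted roughly as follows. This is a sketch only, not part of the patch series: it assumes the standard `neo4j` Python driver, and the bolt URI, credentials, and variable names are placeholders rather than values from this repository.

```
# Hypothetical helper: run the ambiguous-category sanity check against a KG2c Neo4j instance.
# The URI and credentials below are placeholders, not values from this repository.
from neo4j import GraphDatabase

sanity_check_query = (
    "match (z) where (('biolink:Disease' in z.all_categories "
    "or 'biolink:PhenotypicFeature' in z.all_categories "
    "or 'biolink:DiseaseOrPhenotypicFeature' in z.all_categories) "
    "and ('biolink:Drug' in z.all_categories "
    "or 'biolink:ChemicalSubstance' in z.all_categories "
    "or 'biolink:Metabolite' in z.all_categories)) "
    "return count(distinct z.id) as num_ambiguous_nodes"
)

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    num_ambiguous_nodes = session.run(sanity_check_query).single()["num_ambiguous_nodes"]
driver.close()

if num_ambiguous_nodes > 0:
    # Per the note above, a non-zero count should be reported on the RTX GitHub issue tracker.
    print(f"WARNING: {num_ambiguous_nodes} nodes look like both a drug and a disease/phenotype")
else:
    print("Sanity check passed: no ambiguously categorized nodes found")
```

If the count is non-zero, report it on GitHub before running the rest of the script, as the note instructs.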
-# !DSL query: match (z) where (('biolink:Disease' in z.all_categories or 'biolink:PhenotypicFeature' in z.all_categories or 'biolink:DiseaseOrPhenotypicFeature' in z.all_categories) and ('biolink:Drug' in z.all_categories or 'biolink:ChemicalSubstance' in z.all_categories)) return count(distinct z.id)
+# !DSL query: match (z) where (('biolink:Disease' in z.all_categories or 'biolink:PhenotypicFeature' in z.all_categories or 'biolink:DiseaseOrPhenotypicFeature' in z.all_categories) and ('biolink:Drug' in z.all_categories or 'biolink:ChemicalSubstance' in z.all_categories or 'biolink:Metabolite' in z.all_categories)) return count(distinct z.id)
##############################################################################################################################

## Pull a dataframe of all of the graph edges excluding:
# the edges with one end node with all_categories including 'drug' and another end node with all_categories including 'disease'
-# 'drug' here represents all nodes with category that is either 'biolink:Drug' or 'biolink:ChemicalSubstance'
+# 'drug' here represents all nodes with category that is either 'biolink:Drug' or 'biolink:ChemicalSubstance' or 'biolink:Metabolite'
# 'disease' here represents all nodes with category that is either 'biolink:Disease', 'biolink:PhenotypicFeature' or 'biolink:DiseaseOrPhenotypicFeature'
-query = "match (disease) where (disease.category='biolink:Disease' or disease.category='biolink:PhenotypicFeature' or disease.category='biolink:DiseaseOrPhenotypicFeature') with collect(distinct disease.id) as disease_ids match (drug) where (drug.category='biolink:Drug' or drug.category='biolink:ChemicalSubstance') with collect(distinct drug.id) as drug_ids, disease_ids as disease_ids match (m1)-[]-(m2) where m1<>m2 and not (m1.id in drug_ids and m2.id in disease_ids) and not (m1.id in disease_ids and m2.id in drug_ids) with distinct m1 as node1, m2 as node2 return node1.id as source, node2.id as target"
+query = "match (disease) where (disease.category='biolink:Disease' or disease.category='biolink:PhenotypicFeature' or disease.category='biolink:DiseaseOrPhenotypicFeature') with collect(distinct disease.id) as disease_ids match (drug) where (drug.category='biolink:Drug' or drug.category='biolink:ChemicalSubstance' or drug.category='biolink:Metabolite') with collect(distinct drug.id) as drug_ids, disease_ids as disease_ids match (m1)-[]-(m2) where m1<>m2 and not (m1.id in drug_ids and m2.id in disease_ids) and not (m1.id in disease_ids and m2.id in drug_ids) with distinct m1 as node1, m2 as node2 return node1.id as source, node2.id as target"
res = session.run(query)
KG2_alledges = pd.DataFrame(res.data())
KG2_alledges.to_csv(output_path + 'graph_edges.txt', sep='\t', index=None)
@@ -42,7 +42,7 @@ KG2_allnodes_label.to_csv(output_path + 'graph_nodes_label_remove_name.txt', sep='\t', index=None)
## Pulls a dataframe of all of the graph drug-associated nodes
-query = f"match (n) where (n.category='biolink:Drug') or (n.category='biolink:ChemicalSubstance') with distinct n.id as id, n.name as name return id, name"
+query = f"match (n) where (n.category='biolink:Drug') or (n.category='biolink:ChemicalSubstance') or (n.category='biolink:Metabolite') with distinct n.id as id, n.name as name return id, name"
res = session.run(query)
drugs = pd.DataFrame(res.data())
drugs.to_csv(output_path + 'drugs.txt', sep='\t', index=None)

From 6a8f5e4a1de4ba77f7b5aa09baafdfef0b2f9547 Mon Sep 17 00:00:00 2001
From: Chunyu Ma
Date: Sat, 8 May 2021 14:45:17 -0400
Subject:
[PATCH 11/15] add 'biolink:Metabolite' as drug

---
 .../GraphSage_train/py_scripts/generate_training_data.py | 8 ++++----
 1 file changed, 4
insertions(+), 4 deletions(-)

diff --git a/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/generate_training_data.py b/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/generate_training_data.py
index 352e17ada..771c3851e 100644
--- a/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/generate_training_data.py
+++ b/code/ARAX/ARAXQuery/Overlay/GraphSage_train/py_scripts/generate_training_data.py
@@ -29,7 +29,7 @@ def __init__(self):
 def get_drug_curies_from_graph(self):
 ## Pulls a dataframe of all of the graph drug-associated nodes
- query = "match (n {category:'biolink:ChemicalSubstance'}) with distinct n.id as id, n.name as name, n.equivalent_curies as equivalent_curies return id, name, equivalent_curies union match (n {category:'biolink:Drug'}) with distinct n.id as id, n.name as name, n.equivalent_curies as equivalent_curies return id, name, equivalent_curies"
+ query = "match (n) where n.category in ['biolink:ChemicalSubstance', 'biolink:Drug', 'biolink:Metabolite'] with distinct n.id as id, n.name as name, n.equivalent_curies as equivalent_curies return id, name, equivalent_curies"
 session = self.driver.session()
 res = session.run(query)
 drugs = pd.DataFrame(res.data())
@@ -359,8 +359,8 @@ def generate_SemmedData(self, mysqldump_path, output_path=os.getcwd()):
 if __name__ == "__main__":
 dataGenerator = DataGeneration()
 drugs = dataGenerator.get_drug_curies_from_graph()
- drugs.to_csv('/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/kg2_5_1/raw_training_data/drugs.txt',sep='\t',index=False)
+ drugs.to_csv('/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/kg2_6_3/raw_training_data/drugs.txt',sep='\t',index=False)
 dataGenerator.generate_MyChemData(drugs=drugs, output_path='/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/kg2_5_1/raw_training_data/',dist=2)
- ## For semmedVER43_2020_R_PREDICATION.sql.gz, you might download from /data/orangeboard/databases/KG2.3.4/semmedVER43_2020_R_PREDICATION.sql.gz on arax.ncats.io server or directly download the latest one from semmedb website
- # dataGenerator.generate_SemmedData(mysqldump_path='/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/semmedVER43_2020_R_PREDICATION.sql.gz', output_path='/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/kg2_5_1/raw_training_data/')
+# For semmedVER43_2020_R_PREDICATION.sql.gz, you might download from /data/orangeboard/databases/KG2.3.4/semmedVER43_2020_R_PREDICATION.sql.gz on arax.ncats.io server or directly download the latest one from semmedb website
+# dataGenerator.generate_SemmedData(mysqldump_path='/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/semmedVER43_2020_R_PREDICATION.sql.gz', output_path='/home/cqm5886/work/RTX/code/reasoningtool/MLDrugRepurposing/Test_graphsage/kg2_5_1/raw_training_data/')

From 49d4deb73969af42a3a94941e977a8fe55e105ca Mon Sep 17 00:00:00 2001
From: Lindsey Kvarfordt <39020520+kvarforl@users.noreply.github.com>
Date: Mon, 10 May 2021 10:34:28 -0700
Subject: [PATCH 12/15] Updated to only provide TSV files

Per Will Byrd's email on 5-10-2021, mediKanren no longer needs us to provide indexes, just reformatted TSV files, so I've removed the create-indexes step from the instructions

---
 code/kg2/mediKanren/README.md | 43 ++++-------------------------------
 1 file changed, 4 insertions(+), 39 deletions(-)

diff --git a/code/kg2/mediKanren/README.md b/code/kg2/mediKanren/README.md
index e75f6c6cc..12ec3a8c0 100644
--- a/code/kg2/mediKanren/README.md
+++ b/code/kg2/mediKanren/README.md
@@ -54,33 +54,8 @@ python3.7 kg2_tsv_to_medikanren_tsv.py /path/to/kg2/tsv/files mediKanren/biolink
-### Generate index files
-From the `RTX/code/kg2/mediKanren` subdirectory run the following:
-
-run `bash -x ./create-index.sh > create-index.log 2>&1` (This could take a few days and require between 64 and 128 GB of ram)
-
-### Testing the Indexes
-
-From `~/kg2-code/mediKanren/mediKanren/biolink`, run `racket` and run the following commands:
-```
-(require "mk-db.rkt")
-(define rtx2 (make-db "data/rtx_kg2"))
-(run* (c) (db:categoryo rtx2 c))
-(run* (p) (db:predicateo rtx2 p))
-(run 10 (c) (db:concepto rtx2 c))
-(run 10 (e) (db:edgeo rtx2 e))
-```
-
-The the above should return:
-1) All of the node labels
-2) All of the predicates
-3) A sample of 10 nodes
-4) A sample of 10 edges
-
-Verify that the above information returned looks correct.
-
-### Upload the indexes and TSV files
+### Upload the TSV files

Navigate to the `RTX/code/kg2/mediKanren/mediKanren/biolink/data/rtx_kg2` subdirectory.
@@ -88,29 +63,19 @@ Compress the TSV files into one tar.gz file:
```
tar -zcvf kg2-medikanren-tsvs.tar.gz *.tsv
```
-And compress the index files into another:
-
-```
-tar --exclude='*.tsv' --exclude='*.tar.gz' -zcvf kg2-medikanren-indexes.tar.gz .
-```
-
-Upload both tarballs to the public s3 bucket
+Upload the tarball to the public s3 bucket
```
aws s3 cp kg2-medikanren-tsvs.tar.gz s3://rtx-kg2-public/
-aws s3 cp kg2-medikanren-indexes.tar.gz s3://rtx-kg2-public/
-
```
-The tarballs should look something like this, in terms of S3 URLs:
+The tarball should look something like this, in terms of S3 URL:
- `s3://rtx-kg2-public/kg2-medikanren-tsvs.tar.gz`
-- `s3://rtx-kg2-public/kg2-medikanren-indexes.tar.gz`
-But make sure to provide downloadable HTTP links to the mediKanren team, like this:
+But make sure to provide a downloadable HTTP link to the mediKanren team, like this:
- `http://rtx-kg2-public.s3-website-us-west-2.amazonaws.com/kg2-medikanren-tsvs.tar.gz`
-- `http://rtx-kg2-public.s3-website-us-west-2.amazonaws.com/kg2-medikanren-indexes.tar.gz`
---

From 23ee69c5cec650529315d0846111f6392eb1f6de Mon Sep 17 00:00:00 2001
From: Stephen Ramsey
Date: Mon, 10 May 2021 12:44:06 -0700
Subject: [PATCH 13/15] #1417

---
 code/ARAX/Examples/kg2_api_example.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 code/ARAX/Examples/kg2_api_example.py

diff --git a/code/ARAX/Examples/kg2_api_example.py b/code/ARAX/Examples/kg2_api_example.py
new file mode 100644
index 000000000..2860204c9
--- /dev/null
+++ b/code/ARAX/Examples/kg2_api_example.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+import json
+import pprint
+import requests
+
+with open("kg2_api_example.json", "r") as input_file:
+ trapi_message = json.load(input_file)
+ result = requests.post("https://arax.ncats.io/api/rtxkg2/v1.0/query?bypass_cache=false",
+ json=trapi_message)
+ pprint.pprint(result.json())

From ea1e7947a82b35abca06985a554a67520665fd1a Mon Sep 17 00:00:00 2001
From: Stephen Ramsey
Date: Mon, 10 May 2021 12:44:12 -0700
Subject: [PATCH 14/15] #1417

---
 code/ARAX/Examples/kg2_api_example.json | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 code/ARAX/Examples/kg2_api_example.json

diff --git a/code/ARAX/Examples/kg2_api_example.json b/code/ARAX/Examples/kg2_api_example.json
new file
mode 100644 index 000000000..439039988
--- /dev/null
+++ b/code/ARAX/Examples/kg2_api_example.json
@@ -0,0 +1,32 @@
+{
+ "message":{
+ "query_graph":{
+ "nodes":{
+ "n00":{
+ "id":"CHEMBL.COMPOUND:CHEMBL112",
+ "category":[
+ "biolink:Drug"
+ ],
+ "is_set":false
+ },
+ "n01":{
+ "category":[
+ "biolink:Gene",
+ "biolink:Protein"
+ ],
+ "is_set":false
+ }
+ },
+ "edges":{
+ "e00":{
+ "predicate":[
+ "biolink:interacts_with"
+ ],
+ "subject":"n00",
+ "object":"n01",
+ "exclude":false
+ }
+ }
+ }
+}

From f269c86084061fe901238f0aa50a51265872fcb2 Mon Sep 17 00:00:00 2001
From: Lindsey Kvarfordt <39020520+kvarforl@users.noreply.github.com>
Date: Mon, 10 May 2021 16:13:24 -0700
Subject: [PATCH 15/15] Update kg2-versions.md

---
 code/kg2/kg2-versions.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/code/kg2/kg2-versions.md b/code/kg2/kg2-versions.md
index 73c2571b2..636d1353b 100644
--- a/code/kg2/kg2-versions.md
+++ b/code/kg2/kg2-versions.md
@@ -1,3 +1,22 @@
+# 2.6.3
+
+**Date: 2021.5.7**
+
+Biolink Model Version: 1.8.1
+
+Nodes: 10,694,772
+
+Edges: 51,687,002
+
+Notes:
+ - Built by modifying edges.tsv to quickly address issue #1432; the changes were then incorporated into the whole build process.
+
+Issues:
+
+ - Issue [#1432](https://github.com/RTXteam/RTX/issues/1432)
+
+Build host: `kg2lindsey.rtx.ai` `~/kg2-build/`
+
# 2.6.2

**Date: 2021.5.1**