From 655979b69b912f3c75e531e1dee513e01317917d Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 18 Sep 2024 15:42:00 -0400 Subject: [PATCH 1/9] Create script to generate ncbitaxon obo graph and add it to mira docker image --- docker/Dockerfile | 15 +++++++++++++++ docker/generate_graph.py | 26 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 docker/generate_graph.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 6bdfa9d6f..03b54bfa0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,3 +1,16 @@ +# Create an initial docker image to generate the graph and transfer it to the +# second docker image +# We do this to avoid involving additional imports in the second docker image +FROM python:3.10-slim AS graph-builder + +WORKDIR /graphs +RUN apt-get update && apt-get install -y git +RUN pip install pyobo networkx obonet + +# Copy and run the script to generate the pickled graph +COPY generate_graph.py /graphs/generate_graph.py +RUN python generate_graph.py + FROM ubuntu:focal WORKDIR /sw @@ -42,5 +55,7 @@ RUN python -m pip install --upgrade pip && \ # Copy the example json for reconstructing the ode semantics RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/gyorilab/mira/main/tests/sir_flux_span.json +RUN mkdir -p /graphs +COPY --from=graph-builder /graphs/relabeled_obo_graph.pkl /graphs/relabeled_obo_graph.pkl COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] diff --git a/docker/generate_graph.py b/docker/generate_graph.py new file mode 100644 index 000000000..231caab00 --- /dev/null +++ b/docker/generate_graph.py @@ -0,0 +1,26 @@ +from pyobo import get_version +from pyobo.getters import _ensure_ontology_path +from pathlib import Path +from obonet import read_obo +import networkx +import pickle + +def download_convert_ncbitaxon_obo_to_graph(): + resource_prefix = "ncbitaxon" + version = get_version(resource_prefix) + + # Checks to see if the pickled ncbitaxon obo graph exists in the container + cached_relabeled_obo_graph_path = Path("/graphs/relabeled_obo_graph.pkl") + if not cached_relabeled_obo_graph_path.exists(): + _, obo_path = _ensure_ontology_path(resource_prefix, force=False, + version=version) + obo_graph = read_obo(obo_path) + relabeled_graph = networkx.relabel_nodes(obo_graph, + lambda node_index: + node_index.lower()) + with open(cached_relabeled_obo_graph_path, + 'wb') as relabeled_graph_file: + pickle.dump(relabeled_graph, relabeled_graph_file) + +if __name__ == "__main__": + download_convert_ncbitaxon_obo_to_graph() From 0cd6d2002d0066d0cccb24930b43f9b753702db3 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 09:50:45 -0400 Subject: [PATCH 2/9] Add docker volume to create directory to store pickled obo files. Adjust startup script to generate obo graph if not found in mounted host directory. Add shell script to build image, make host directory, and run the container. --- docker/Dockerfile | 15 +-------- docker/README.md | 7 +++++ docker/build_run_docker.sh | 5 +++ ...nerate_graph.py => generate_obo_graphs.py} | 4 ++- docker/startup.sh | 10 ++++++ mira/dkg/construct.py | 31 ++++++------------- 6 files changed, 36 insertions(+), 36 deletions(-) create mode 100755 docker/build_run_docker.sh rename docker/{generate_graph.py => generate_obo_graphs.py} (91%) diff --git a/docker/Dockerfile b/docker/Dockerfile index 03b54bfa0..181e710d1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,16 +1,3 @@ -# Create an initial docker image to generate the graph and transfer it to the -# second docker image -# We do this to avoid involving additional imports in the second docker image -FROM python:3.10-slim AS graph-builder - -WORKDIR /graphs -RUN apt-get update && apt-get install -y git -RUN pip install pyobo networkx obonet - -# Copy and run the script to generate the pickled graph -COPY generate_graph.py /graphs/generate_graph.py -RUN python generate_graph.py - FROM ubuntu:focal WORKDIR /sw @@ -42,6 +29,7 @@ RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/bui sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz +COPY generate_obo_graphs.py /sw/generate_obo_graphs.py # Python packages RUN python -m pip install --upgrade pip && \ python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client,dkg-construct] && \ @@ -56,6 +44,5 @@ RUN python -m pip install --upgrade pip && \ RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/gyorilab/mira/main/tests/sir_flux_span.json RUN mkdir -p /graphs -COPY --from=graph-builder /graphs/relabeled_obo_graph.pkl /graphs/relabeled_obo_graph.pkl COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] diff --git a/docker/README.md b/docker/README.md index d009ec30d..787fe4c5f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -40,6 +40,13 @@ docker run -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 mira: This exposes a REST API at `http://localhost:8771`. This also exposes Neo4j's bolt port at port 7687. + +Running the `build_run_docker.sh` script builds the docker image, +create directory `docker/mounted_graph_storage` to store the pickled obo +graphs, and start the container. When you first run the script and +start the container, it will take a few minutes to generate and store the +pickled graphs. + ## MIRA Metaregistry The MIRA metaregistry contains the prefixes and their associated metadata for all use cases. diff --git a/docker/build_run_docker.sh b/docker/build_run_docker.sh new file mode 100755 index 000000000..0e4dc4e8b --- /dev/null +++ b/docker/build_run_docker.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +docker build --tag mira_epi_dkg:latest . +mkdir -p mounted_graph_storage +docker run --detach -v ./mounted_graph_storage:/graphs -p 7474:7474 -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 --name mira mira_epi_dkg:latest \ No newline at end of file diff --git a/docker/generate_graph.py b/docker/generate_obo_graphs.py similarity index 91% rename from docker/generate_graph.py rename to docker/generate_obo_graphs.py index 231caab00..3dcb28cf3 100644 --- a/docker/generate_graph.py +++ b/docker/generate_obo_graphs.py @@ -10,11 +10,13 @@ def download_convert_ncbitaxon_obo_to_graph(): version = get_version(resource_prefix) # Checks to see if the pickled ncbitaxon obo graph exists in the container - cached_relabeled_obo_graph_path = Path("/graphs/relabeled_obo_graph.pkl") + cached_relabeled_obo_graph_path = Path("/graphs/ncbitaxon_obo_graph.pkl") if not cached_relabeled_obo_graph_path.exists(): _, obo_path = _ensure_ontology_path(resource_prefix, force=False, version=version) obo_graph = read_obo(obo_path) + + # Normalize node indices relabeled_graph = networkx.relabel_nodes(obo_graph, lambda node_index: node_index.lower()) diff --git a/docker/startup.sh b/docker/startup.sh index 3f412a327..9ee790f36 100755 --- a/docker/startup.sh +++ b/docker/startup.sh @@ -1,4 +1,14 @@ #!/bin/bash + +# Check if the ncbitaxon pickled graph file exists +if [ ! -f /graphs/ncbitaxon_obo_graph.pkl ]; then + echo "Pickled ncbitaxon obo graph file not found. Generating it" + python /sw/generate_obo_graphs.py +else + echo "Pickled ncbitaxon obo graph file already exists in the container in + /graphs/" +fi + neo4j start sleep 100 neo4j status diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 44eb37b13..595a1db6a 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -434,14 +434,11 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): under the corresponding entry's subtree in its respective ontology. Relation information is also extracted with this option. - Running this method for the first time for each specific resource will - take a long time (minutes) as the obo resource file has to be downloaded, - converted to a networkx graph, have their node indices normalized, and - pickled. - - Subsequent runs of this method will take a few seconds as the pickled + Execution of this method will take a few seconds as the pickled graph object has to be loaded. + Currently we only support the addition of ncbitaxon terms. + Parameters ---------- curie : @@ -463,21 +460,13 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): resource_prefix = curie.split(":")[0] if resource_prefix == "ncbitaxon": type = "class" - version = get_version(resource_prefix) - cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix, - name="relabeled_obo_graph.pkl", - version=version) - if not cached_relabeled_obo_graph_path.exists(): - _, obo_path = _ensure_ontology_path(resource_prefix, force=False, - version=version) - obo_graph = read_obo(obo_path) - relabeled_graph = networkx.relabel_nodes(obo_graph, - lambda node_index: node_index.lower()) - with open(cached_relabeled_obo_graph_path,'wb') as relabeled_graph_file: - pickle.dump(relabeled_graph, relabeled_graph_file) - else: - with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: - relabeled_graph = pickle.load(relabeled_graph_file) + cached_relabeled_obo_graph_path = (Path(__file__).resolve().parents[2] + / "docker" / + "mounted_graph_storage" / + "ncbitaxon_obo_graph.pkl") + + with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: + relabeled_graph = pickle.load(relabeled_graph_file) else: return nodes, edges From 993b9050779d399697cc734d426ee1d17894a7f9 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 10:21:37 -0400 Subject: [PATCH 3/9] Format file --- docker/generate_obo_graphs.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docker/generate_obo_graphs.py b/docker/generate_obo_graphs.py index 3dcb28cf3..1d51a2642 100644 --- a/docker/generate_obo_graphs.py +++ b/docker/generate_obo_graphs.py @@ -5,6 +5,7 @@ import networkx import pickle + def download_convert_ncbitaxon_obo_to_graph(): resource_prefix = "ncbitaxon" version = get_version(resource_prefix) @@ -12,17 +13,20 @@ def download_convert_ncbitaxon_obo_to_graph(): # Checks to see if the pickled ncbitaxon obo graph exists in the container cached_relabeled_obo_graph_path = Path("/graphs/ncbitaxon_obo_graph.pkl") if not cached_relabeled_obo_graph_path.exists(): - _, obo_path = _ensure_ontology_path(resource_prefix, force=False, - version=version) + _, obo_path = _ensure_ontology_path( + resource_prefix, force=False, version=version + ) obo_graph = read_obo(obo_path) # Normalize node indices - relabeled_graph = networkx.relabel_nodes(obo_graph, - lambda node_index: - node_index.lower()) - with open(cached_relabeled_obo_graph_path, - 'wb') as relabeled_graph_file: + relabeled_graph = networkx.relabel_nodes( + obo_graph, lambda node_index: node_index.lower() + ) + with open( + cached_relabeled_obo_graph_path, "wb" + ) as relabeled_graph_file: pickle.dump(relabeled_graph, relabeled_graph_file) + if __name__ == "__main__": download_convert_ncbitaxon_obo_to_graph() From 4f29d502ecbd7af790979e8a28f44bd4ec2225ca Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 11:39:20 -0400 Subject: [PATCH 4/9] Make adding the graph to docker simpler --- docker/Dockerfile | 4 ++-- docker/README.md | 7 ------- docker/build_run_docker.sh | 5 ----- mira/dkg/construct.py | 8 ++++---- mira/dkg/generate_obo_graphs.py | 34 +++++++++++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 18 deletions(-) delete mode 100755 docker/build_run_docker.sh create mode 100644 mira/dkg/generate_obo_graphs.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 181e710d1..838c1f734 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,7 +29,6 @@ RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/bui sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz -COPY generate_obo_graphs.py /sw/generate_obo_graphs.py # Python packages RUN python -m pip install --upgrade pip && \ python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client,dkg-construct] && \ @@ -40,9 +39,10 @@ RUN python -m pip install --upgrade pip && \ python -m pip install --no-dependencies "lxml>=4.6.4" && \ python -m pip install --no-dependencies --ignore-requires-python sbmlmath +RUN python -m mira.dkg.generate_obo_graphs + # Copy the example json for reconstructing the ode semantics RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/gyorilab/mira/main/tests/sir_flux_span.json -RUN mkdir -p /graphs COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] diff --git a/docker/README.md b/docker/README.md index 787fe4c5f..d009ec30d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -40,13 +40,6 @@ docker run -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 mira: This exposes a REST API at `http://localhost:8771`. This also exposes Neo4j's bolt port at port 7687. - -Running the `build_run_docker.sh` script builds the docker image, -create directory `docker/mounted_graph_storage` to store the pickled obo -graphs, and start the container. When you first run the script and -start the container, it will take a few minutes to generate and store the -pickled graphs. - ## MIRA Metaregistry The MIRA metaregistry contains the prefixes and their associated metadata for all use cases. diff --git a/docker/build_run_docker.sh b/docker/build_run_docker.sh deleted file mode 100755 index 0e4dc4e8b..000000000 --- a/docker/build_run_docker.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -docker build --tag mira_epi_dkg:latest . -mkdir -p mounted_graph_storage -docker run --detach -v ./mounted_graph_storage:/graphs -p 7474:7474 -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 --name mira mira_epi_dkg:latest \ No newline at end of file diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 595a1db6a..1a07efd0a 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -460,10 +460,10 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): resource_prefix = curie.split(":")[0] if resource_prefix == "ncbitaxon": type = "class" - cached_relabeled_obo_graph_path = (Path(__file__).resolve().parents[2] - / "docker" / - "mounted_graph_storage" / - "ncbitaxon_obo_graph.pkl") + version = get_version(resource_prefix) + cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix, + name="relabeled_obo_graph.pkl", + version=version) with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: relabeled_graph = pickle.load(relabeled_graph_file) diff --git a/mira/dkg/generate_obo_graphs.py b/mira/dkg/generate_obo_graphs.py new file mode 100644 index 000000000..0371768be --- /dev/null +++ b/mira/dkg/generate_obo_graphs.py @@ -0,0 +1,34 @@ +from pyobo import get_version +from pyobo.getters import _ensure_ontology_path +from pyobo.utils.path import prefix_directory_join +from obonet import read_obo +import networkx +import pickle + + +def download_convert_ncbitaxon_obo_to_graph(): + resource_prefix = "ncbitaxon" + version = get_version(resource_prefix) + + # Checks to see if the pickled ncbitaxon obo graph exists in the container + cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix, + name="relabeled_obo_graph.pkl", + version=version) + if not cached_relabeled_obo_graph_path.exists(): + _, obo_path = _ensure_ontology_path( + resource_prefix, force=False, version=version + ) + obo_graph = read_obo(obo_path) + + # Normalize node indices + relabeled_graph = networkx.relabel_nodes( + obo_graph, lambda node_index: node_index.lower() + ) + with open( + cached_relabeled_obo_graph_path, "wb" + ) as relabeled_graph_file: + pickle.dump(relabeled_graph, relabeled_graph_file) + + +if __name__ == "__main__": + download_convert_ncbitaxon_obo_to_graph() From c2acfe88b897915f213e0fd695627132288b45b6 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 11:40:26 -0400 Subject: [PATCH 5/9] Revert startup.sh script --- docker/startup.sh | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docker/startup.sh b/docker/startup.sh index 9ee790f36..3f412a327 100755 --- a/docker/startup.sh +++ b/docker/startup.sh @@ -1,14 +1,4 @@ #!/bin/bash - -# Check if the ncbitaxon pickled graph file exists -if [ ! -f /graphs/ncbitaxon_obo_graph.pkl ]; then - echo "Pickled ncbitaxon obo graph file not found. Generating it" - python /sw/generate_obo_graphs.py -else - echo "Pickled ncbitaxon obo graph file already exists in the container in - /graphs/" -fi - neo4j start sleep 100 neo4j status From 1a6d875e3b3b67e821d526e142b70c257939934d Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 11:42:05 -0400 Subject: [PATCH 6/9] Remove duplicate script --- docker/generate_obo_graphs.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 docker/generate_obo_graphs.py diff --git a/docker/generate_obo_graphs.py b/docker/generate_obo_graphs.py deleted file mode 100644 index 1d51a2642..000000000 --- a/docker/generate_obo_graphs.py +++ /dev/null @@ -1,32 +0,0 @@ -from pyobo import get_version -from pyobo.getters import _ensure_ontology_path -from pathlib import Path -from obonet import read_obo -import networkx -import pickle - - -def download_convert_ncbitaxon_obo_to_graph(): - resource_prefix = "ncbitaxon" - version = get_version(resource_prefix) - - # Checks to see if the pickled ncbitaxon obo graph exists in the container - cached_relabeled_obo_graph_path = Path("/graphs/ncbitaxon_obo_graph.pkl") - if not cached_relabeled_obo_graph_path.exists(): - _, obo_path = _ensure_ontology_path( - resource_prefix, force=False, version=version - ) - obo_graph = read_obo(obo_path) - - # Normalize node indices - relabeled_graph = networkx.relabel_nodes( - obo_graph, lambda node_index: node_index.lower() - ) - with open( - cached_relabeled_obo_graph_path, "wb" - ) as relabeled_graph_file: - pickle.dump(relabeled_graph, relabeled_graph_file) - - -if __name__ == "__main__": - download_convert_ncbitaxon_obo_to_graph() From 614bf193aa9d19c615cd89895e1f5b35ad4dafe9 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 13:24:10 -0400 Subject: [PATCH 7/9] Use correct pyobo get_version --- mira/dkg/generate_obo_graphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mira/dkg/generate_obo_graphs.py b/mira/dkg/generate_obo_graphs.py index 0371768be..92266ec83 100644 --- a/mira/dkg/generate_obo_graphs.py +++ b/mira/dkg/generate_obo_graphs.py @@ -1,4 +1,4 @@ -from pyobo import get_version +from pyobo.api.utils import get_version from pyobo.getters import _ensure_ontology_path from pyobo.utils.path import prefix_directory_join from obonet import read_obo From efe1ff6538dd5d2d1069724abdfa99b7125f4125 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 19 Sep 2024 13:43:31 -0400 Subject: [PATCH 8/9] Removed unused imports --- mira/dkg/construct.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 1a07efd0a..4b97ea919 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -41,10 +41,8 @@ from pydantic import BaseModel, Field from pyobo.struct import part_of, is_a from pyobo.sources import ontology_resolver -from pyobo.getters import _ensure_ontology_path from pyobo.api.utils import get_version from pyobo.utils.path import prefix_directory_join -from obonet import read_obo from tabulate import tabulate from tqdm.auto import tqdm from typing_extensions import Literal From 93868946971c9a1e6caac6dfae5b09fbc109be70 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Fri, 20 Sep 2024 09:49:39 -0400 Subject: [PATCH 9/9] Add fallback option to download and convert ncbitaxon obo graph --- mira/dkg/construct.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 4b97ea919..322f6c534 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -61,6 +61,7 @@ from mira.dkg.resources.geonames import get_geonames_terms from mira.dkg.resources.extract_eiffel_ontology import get_eiffel_ontology_terms from mira.dkg.resources.uat import get_uat +from mira.dkg.generate_obo_graphs import download_convert_ncbitaxon_obo_to_graph MODULE = pystow.module("mira") DEMO_MODULE = MODULE.module("demo", "import") @@ -462,7 +463,8 @@ def extract_ontology_subtree(curie: str, add_subtree: bool = False): cached_relabeled_obo_graph_path = prefix_directory_join(resource_prefix, name="relabeled_obo_graph.pkl", version=version) - + if not cached_relabeled_obo_graph_path.exists(): + download_convert_ncbitaxon_obo_to_graph() with open(cached_relabeled_obo_graph_path,'rb') as relabeled_graph_file: relabeled_graph = pickle.load(relabeled_graph_file) else: