Skip to content

Commit

Permalink
Merge pull request #411 from RTXteam/midjuly24work
Browse files Browse the repository at this point in the history
Merging KG2.10.1 build code
  • Loading branch information
ecwood authored Sep 8, 2024
2 parents 9075635 + b158cc5 commit e6fbe9f
Show file tree
Hide file tree
Showing 41 changed files with 2,423 additions and 1,234 deletions.
39 changes: 21 additions & 18 deletions build/Snakefile-conversion
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,18 @@ rule UMLS_Conversion:

# Ontologies conversion rule: runs the ontologies conversion script and writes a nodes file and an edges file.
# NOTE(review): this span looks like a rendered diff without +/- markers. Each ONT_* line (and the
# "bash -x" shell line) appears to be the pre-change version of the ONTOLOGIES_* line (and the
# PYTHON_COMMAND shell line) that follows it -- confirm against the repository before reusing this text.
rule Ontologies_Conversion:
input:
code = config['ONT_CONVERSION_SCRIPT'],
code = config['ONTOLOGIES_CONVERSION_SCRIPT'],
real = config['ONTOLOGIES_EXTRACT_FILE'],
curies_to_categories_map = config['CURIES_TO_CATEGORIES_FILE'],
curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
# 'validation' is declared as an input but is not passed to either shell command below;
# presumably it only orders this rule after a validation step -- TODO confirm.
validation = config['VALIDATION_PLACEHOLDER']
output:
nodes = config['ONT_OUTPUT_NODES_FILE'],
edges = config['ONT_OUTPUT_EDGES_FILE']
nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'],
edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE']
log:
config['ONT_CONVERSION_LOG']
config['ONTOLOGIES_CONVERSION_LOG']
shell:
# Bash form: script, nodes file, edges file, TEST_FLAG; stdout and stderr both go to the log.
"bash -x {input.code} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"
# Python form: script, extract file, the two CURIE map files, BIOLINK_MODEL_VERSION, nodes file,
# edges file, TEST_FLAG; stdout and stderr both go to the log.
config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} " + config['BIOLINK_MODEL_VERSION'] + " {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"

rule SemMedDB_Conversion:
input:
Expand Down Expand Up @@ -120,19 +123,6 @@ rule DGIdb_Conversion:
shell:
config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"

# RepoDB conversion rule: runs the RepoDB conversion script on the RepoDB input file
# and writes the RepoDB nodes and edges output files.
rule RepoDB_Conversion:
    input:
        code = config['REPODB_CONVERSION_SCRIPT'],
        real = config['REPODB_INPUT_FILE'],
        # Declared as an input but not passed to the command; presumably a pure
        # ordering dependency -- TODO confirm.
        validation = config['VALIDATION_PLACEHOLDER']
    output:
        nodes = config['REPODB_OUTPUT_NODES_FILE'],
        edges = config['REPODB_OUTPUT_EDGES_FILE']
    log:
        config['REPODB_CONVERSION_LOG']
    shell:
        # <python> <script> <input> <nodes> <edges> <test arg>, with stdout+stderr sent to the log.
        config['PYTHON_COMMAND'] + \
        " {input.code} {input.real}" + \
        " {output.nodes} {output.edges} " + \
        config['TEST_ARG'] + \
        " > {log} 2>&1"

rule DrugBank_Conversion:
input:
code = config['DRUGBANK_CONVERSION_SCRIPT'],
Expand Down Expand Up @@ -275,3 +265,16 @@ rule KEGG_Conversion:
config['KEGG_CONVERSION_LOG']
shell:
config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"

# ClinicalTrialsKG conversion rule: runs the ClinicalTrialsKG conversion script on the
# ClinicalTrialsKG input file and writes the corresponding nodes and edges output files.
rule ClinicalTrialsKG_Conversion:
    input:
        code = config['CLINICALTRIALSKG_CONVERSION_SCRIPT'],
        real = config['CLINICALTRIALSKG_INPUT_FILE'],
        # Declared as an input but not passed to the command; presumably a pure
        # ordering dependency -- TODO confirm.
        validation = config['VALIDATION_PLACEHOLDER']
    output:
        nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
        edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
    log:
        config['CLINICALTRIALSKG_CONVERSION_LOG']
    shell:
        # <python> <script> <input> <nodes> <edges> <test arg>, with stdout+stderr sent to the log.
        config['PYTHON_COMMAND'] + \
        " {input.code} {input.real}" + \
        " {output.nodes} {output.edges} " + \
        config['TEST_ARG'] + \
        " > {log} 2>&1"
37 changes: 25 additions & 12 deletions build/Snakefile-extraction
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@ rule UMLS:
shell:
"bash -x {input.code} {output} > {log} 2>&1"

# Ontologies extraction rule: runs the ontologies extraction script with the parser and the
# ontologies load inventory file as arguments, producing the ontologies extract file.
rule Ontologies:
    input:
        code = config['ONTOLOGIES_EXTRACTION_SCRIPT'],
        parser = config['ONTOLOGIES_EXTRACTION_PARSER'],
        ontologies_load_inventory = config['ONTOLOGIES_LOAD_INVENTORY_FILE'],
        # Declared as an input but not passed to the command; presumably a pure
        # ordering dependency -- TODO confirm.
        validation = config['VALIDATION_PLACEHOLDER']
    output:
        config['ONTOLOGIES_EXTRACT_FILE']
    log:
        config['ONTOLOGIES_EXTRACTION_LOG']
    shell:
        # bash -x <script> <parser> <inventory> <output>, with stdout+stderr sent to the log.
        "bash -x {input.code}" + \
        " {input.parser} {input.ontologies_load_inventory}" + \
        " {output}" + \
        " > {log} 2>&1"

rule SemMedDB:
input:
code = config['SEMMEDDB_EXTRACTION_SCRIPT'],
Expand Down Expand Up @@ -88,17 +101,6 @@ rule DGIdb:
shell:
"bash -x {input.code} " + config['DGIDB_DIR'] + " > {log} 2>&1"

# RepoDB extraction rule: runs the RepoDB extraction script, passing it the RepoDB
# directory from the config; the declared output is the RepoDB input file.
rule RepoDB:
    input:
        code = config['REPODB_EXTRACTION_SCRIPT'],
        # Declared as an input but not passed to the command; presumably a pure
        # ordering dependency -- TODO confirm.
        validation = config['VALIDATION_PLACEHOLDER']
    output:
        config['REPODB_INPUT_FILE']
    log:
        config['REPODB_EXTRACTION_LOG']
    shell:
        # bash -x <script> <RepoDB dir>, with stdout+stderr sent to the log.
        "bash -x {input.code} " + \
        config['REPODB_DIR'] + \
        " > {log} 2>&1"

rule DrugBank:
input:
code = config['DRUGBANK_EXTRACTION_SCRIPT'],
Expand Down Expand Up @@ -218,4 +220,15 @@ rule KEGG:
log:
config['KEGG_EXTRACTION_LOG']
shell:
"bash -x {input.code} {output} > {log} 2>&1"
"bash -x {input.code} {output} > {log} 2>&1"

# ClinicalTrialsKG extraction rule: runs the ClinicalTrialsKG extraction script with the
# ClinicalTrialsKG input file path as its only argument.
rule ClinicalTrialsKG:
    input:
        code = config['CLINICALTRIALSKG_EXTRACTION_SCRIPT'],
        # Declared as an input but not passed to the command; presumably a pure
        # ordering dependency -- TODO confirm.
        validation = config['VALIDATION_PLACEHOLDER']
    output:
        config['CLINICALTRIALSKG_INPUT_FILE']
    log:
        config['CLINICALTRIALSKG_EXTRACTION_LOG']
    shell:
        # bash -x <script> <output>, with stdout+stderr sent to the log.
        "bash -x {input.code}" + \
        " {output}" + \
        " > {log} 2>&1"
22 changes: 11 additions & 11 deletions build/Snakefile-post-etl
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ rule Merge:
code = config['MERGE_SCRIPT'],
umls_nodes = config['UMLS_OUTPUT_NODES_FILE'],
umls_edges = config['UMLS_OUTPUT_EDGES_FILE'],
ont_nodes = config['ONT_OUTPUT_NODES_FILE'],
ont_edges = config['ONT_OUTPUT_EDGES_FILE'],
ontologies_nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'],
ontologies_edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE'],
uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'],
uniprot_edges = config['UNIPROTKB_OUTPUT_EDGES_FILE'],
semmeddb_nodes = config['SEMMEDDB_OUTPUT_NODES_FILE'],
Expand All @@ -19,8 +19,6 @@ rule Merge:
ncbigene_edges = config['NCBIGENE_OUTPUT_EDGES_FILE'],
dgidb_nodes = config['DGIDB_OUTPUT_NODES_FILE'],
dgidb_edges = config['DGIDB_OUTPUT_EDGES_FILE'],
repodb_nodes = config['REPODB_OUTPUT_NODES_FILE'],
repodb_edges = config['REPODB_OUTPUT_EDGES_FILE'],
drugbank_nodes = config['DRUGBANK_OUTPUT_NODES_FILE'],
drugbank_edges = config['DRUGBANK_OUTPUT_EDGES_FILE'],
smpdb_nodes = config['SMPDB_OUTPUT_NODES_FILE'],
Expand All @@ -42,7 +40,9 @@ rule Merge:
disgenet_nodes = config['DISGENET_OUTPUT_NODES_FILE'],
disgenet_edges = config['DISGENET_OUTPUT_EDGES_FILE'],
kegg_nodes = config['KEGG_OUTPUT_NODES_FILE'],
kegg_edges = config['KEGG_OUTPUT_EDGES_FILE']
kegg_edges = config['KEGG_OUTPUT_EDGES_FILE'],
clinicaltrialskg_nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
clinicaltrialskg_edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
output:
nodes = config['MERGED_OUTPUT_NODES_FILE'],
edges = config['MERGED_OUTPUT_EDGES_FILE'],
Expand All @@ -56,15 +56,14 @@ rule Merge:
" --outputEdgesFile {output.edges} " + \
" --kgNodesFiles " + \
"{input.umls_nodes} " + \
"{input.ont_nodes} " + \
"{input.ontologies_nodes} " + \
"{input.semmeddb_nodes} " + \
"{input.uniprot_nodes} " + \
"{input.ensembl_nodes} " + \
"{input.unichem_nodes} " + \
"{input.chembl_nodes} " + \
"{input.ncbigene_nodes} " + \
"{input.dgidb_nodes} " + \
"{input.repodb_nodes} " + \
"{input.smpdb_nodes} " + \
"{input.drugbank_nodes} " + \
"{input.hmdb_nodes} " + \
Expand All @@ -76,17 +75,17 @@ rule Merge:
"{input.intact_nodes} " + \
"{input.disgenet_nodes} " + \
"{input.kegg_nodes} " + \
"{input.clinicaltrialskg_nodes} " + \
" --kgEdgesFiles " + \
"{input.umls_edges} " + \
"{input.ont_edges} " + \
"{input.ontologies_edges} " + \
"{input.semmeddb_edges} " + \
"{input.uniprot_edges} " + \
"{input.ensembl_edges} " + \
"{input.unichem_edges} " + \
"{input.chembl_edges} " + \
"{input.ncbigene_edges} " + \
"{input.dgidb_edges} " + \
"{input.repodb_edges} " + \
"{input.smpdb_edges} " + \
"{input.drugbank_edges} " + \
"{input.hmdb_edges} " + \
Expand All @@ -97,7 +96,8 @@ rule Merge:
"{input.drugcentral_edges} " + \
"{input.intact_edges} " + \
"{input.disgenet_edges} " + \
"{input.kegg_edges} > {log} 2>&1"
"{input.kegg_edges} " + \
"{input.clinicaltrialskg_edges} > {log} 2>&1"

rule Stats:
input:
Expand All @@ -122,7 +122,7 @@ rule Simplify:
log:
config['SIMPLIFY_LOG']
shell:
"bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['VERSION_FILE'] + " " + config['TEST_FLAG'] + " > {log} 2>&1"
"bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"

rule Slim:
input:
Expand Down
57 changes: 54 additions & 3 deletions build/build-kg2-snakemake.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,43 @@ then
run_flag="-F"
fi

build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake${dryrun}${test_suffix}.log
# Marker files in the build directory that request a version bump for this build.
trigger_file_is_major_release=${BUILD_DIR}/major-release
trigger_file_is_minor_release=${BUILD_DIR}/minor-release

# Decide which flag (if any) to hand to update_version.py.
# Test builds and dry runs never increment; otherwise the major trigger file
# takes precedence over the minor one, and no trigger file means no increment.
increment_flag=''
if [[ "${test_flag}" != "test" && "${dryrun}" != "-n" ]]
then
    if [ -e ${trigger_file_is_major_release} ]
    then
        increment_flag='--increment_major'
    elif [ -e ${trigger_file_is_minor_release} ]
    then
        increment_flag='--increment_minor'
    fi
fi

# Stamp the KG2 version into master-config.shinc by replacing its "kg2_version=" line.
#   - CI builds: use the fixed placeholder version KG2.CI and leave the S3 version file alone.
#   - Other builds: download the version file from the public S3 bucket, optionally bump it
#     with update_version.py, then write whatever version the local file now holds.
if [[ "${ci_flag}" == "ci" ]]
then
    # sed: "\@...@" is an address with a custom delimiter; "c" replaces the whole matching line.
    sed -i "\@^kg2_version=@ckg2_version=KG2.CI" ${CODE_DIR}/master-config.shinc
else
    ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${kg2_version_file_local}
    if [[ "${increment_flag}" != '' ]]
    then
        # NOTE(review): update_version.py presumably rewrites the local version file in place -- confirm.
        ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${kg2_version_file_local}
    else
        # increment_flag is empty for test builds, for dry runs, and for regular builds with
        # no major/minor trigger file, so the message must not claim "test mode" only.
        echo "*** NO VERSION INCREMENT (test mode, dry run, or no release trigger file) ***"
    fi
    curr_kg2_version=$(cat ${kg2_version_file_local})
    sed -i "\@^kg2_version=@ckg2_version=${curr_kg2_version}" ${CODE_DIR}/master-config.shinc
fi

# Re-read the master config so that the kg2_version line rewritten by the sed commands above
# takes effect in this shell.
# NOTE(review): the sed commands edit ${CODE_DIR}/master-config.shinc while this sources
# ${config_dir}/master-config.shinc -- presumably the same file; confirm.
source ${config_dir}/master-config.shinc

# The log file name embeds the KG2 version plus the dry-run and test suffixes.
build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake-${kg2_version}${dryrun}${test_suffix}.log
# Make sure the log file exists before anything tries to use it.
touch ${build_kg2_log_file}
if [[ "${ci_flag}" == "ci" ]]
then
Expand All @@ -75,6 +111,8 @@ function build_kg2 () {
echo "================= starting build-kg2-snakemake.sh =================="
date

export PATH=$PATH:${BUILD_DIR}

snakemake_config_file=${BUILD_CODE_DIR}/snakemake-config.yaml
snakefile=${BUILD_CODE_DIR}/Snakefile

Expand All @@ -91,8 +129,6 @@ ${python_command} ${BUILD_CODE_DIR}/generate_snakemake_config_file.py ${test_arg
# --dag | dot -Tpng > ~/kg2-build/snakemake_diagram.png: Creates Snakemake workflow diagram (when combined with -F and -j)
# -n: dry run REMOVE THIS BEFORE BUILDING

export PATH=$PATH:${BUILD_DIR}

graphic=""
if [[ "${build_flag}" == "graphic" || "${secondary_build_flag}" == "graphic" || "${tertiary_build_flag}" == "graphic" ]]
then
Expand All @@ -116,6 +152,21 @@ fi

cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish -j 16 ${dryrun} ${graphic}

# After the Snakemake run: publish the local version file back to the public S3 bucket
# (non-CI builds only; CI builds use the fixed KG2.CI version and never download the file).
if [[ "${ci_flag}" != "ci" ]]
then
    ${s3_cp_cmd} ${kg2_version_file_local} s3://${s3_bucket_public}/${kg2_version_file}
fi

# Consume the release trigger files only when this run actually applied a version increment.
# increment_flag is empty for test builds and dry runs, and CI builds never run
# update_version.py; deleting the triggers in those cases would silently discard a
# pending major/minor release request before the real build sees it.
if [[ "${ci_flag}" != "ci" && "${increment_flag}" != '' ]]
then
    if [[ -f ${trigger_file_is_major_release} ]]
    then
        rm -f ${trigger_file_is_major_release}
    fi

    if [[ -f ${trigger_file_is_minor_release} ]]
    then
        rm -f ${trigger_file_is_minor_release}
    fi
fi

date
echo "================ script finished ============================"
}
Expand Down
Loading

0 comments on commit e6fbe9f

Please sign in to comment.