CORES=5
JAVA_ENV=JAVA_OPTS="-Xmx96G -XX:+UseParallelGC"
BLAZEGRAPH-RUNNER=$(JAVA_ENV) blazegraph-runner
ROBOT_ENV=ROBOT_JAVA_ARGS="-Xmx96G -XX:+UseParallelGC"
ROBOT=$(ROBOT_ENV) robot
JVM_ARGS=JVM_ARGS="-Xmx96G -XX:+UseParallelGC"
ARQ=$(JVM_ARGS) arq
SCALA_RUN=$(JAVA_ENV) COURSIER_CACHE=/data/coursier-cache scala-cli run --server=false
BIOLINK=v4.2.1
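# All of the above are simple `=` assignments, so they can be overridden per run from the
# command line, e.g. (illustrative):
#   make all CORES=16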
# Phony targets
.PHONY: all
all: kg.tsv
	echo All done.
owlrl-datalog:
	git clone https://github.com/balhoff/owlrl-datalog.git
owlrl-datalog/bin/owl_rl_abox_quads: owlrl-datalog owlrl-datalog/src/datalog/swrl.dl
	cd owlrl-datalog &&\
	mkdir -p bin &&\
	souffle -c src/datalog/owl_rl_abox_quads.dl -o bin/owl_rl_abox_quads
owlrl-datalog/src/datalog/swrl.dl: ontologies-merged.ttl owlrl-datalog
	$(SCALA_RUN) owlrl-datalog/src/scala/swrl-to-souffle.sc -- ontologies-merged.ttl $@
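# (swrl.dl is generated, not hand-written: the Scala script translates the SWRL rules in
# the merged ontology into Souffle Datalog, presumably for inclusion by
# owl_rl_abox_quads.dl above.)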
owlrl-datalog/bin/owl_from_rdf: owlrl-datalog
	cd owlrl-datalog &&\
	mkdir -p bin &&\
	souffle -c src/datalog/owl_from_rdf.dl -o bin/owl_from_rdf
scripts/kg_edges: scripts/kg_edges.dl
	souffle -j ${CORES} -c $< -o $@
# Step 3. Convert ontologies-merged.ttl into a format that can be read by Souffle.
# RIOT is a Jena tool that streams Turtle out as N-Triples; the sed commands then
# turn each triple into a TSV row with three columns (?s ?p ?o).
ontology.facts: ontologies-merged.ttl
	riot --nocheck --output=ntriples $< | sed 's/ /\t/' | sed 's/ /\t/' | sed 's/ \.$$//' >$@
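# A worked sketch of the pipeline above on a hypothetical triple (\t marks a tab):
#   riot emits:    <http://ex.org/s> <http://ex.org/p> <http://ex.org/o> .
#   after sed #1:  <http://ex.org/s>\t<http://ex.org/p> <http://ex.org/o> .
#   after sed #2:  <http://ex.org/s>\t<http://ex.org/p>\t<http://ex.org/o> .
#   after sed #3:  <http://ex.org/s>\t<http://ex.org/p>\t<http://ex.org/o>
# Each `sed 's/ /\t/'` replaces only the first space on each line; the final sed strips
# the trailing " ." terminator.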
# Step 4. owl_from_rdf converts RDF triples (in TSV) into OWL data structures.
ontology.dir: owlrl-datalog/bin/owl_from_rdf ontology.facts
	mkdir -p ontology && ./owlrl-datalog/bin/owl_from_rdf -D ontology && touch $@
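# (owl_from_rdf takes no input argument: a compiled Souffle program reads its .facts
# inputs from the working directory by default, and -D directs its output relations
# into ontology/. The `touch $@` gives make a timestamp file to track, since the real
# output is a directory.)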
# Step 1. Import all the ontologies in ontologies.ofn into the `mirror/` directory.
# Also writes out a catalog file which is used by the later ROBOT invocations.
# TODO: replace with the ontology.dir pattern of creating a directory and then touching mirror.dir
mirror: ontologies.ofn
	rm -rf $@ &&\
	$(ROBOT) mirror -i $< -d $@ -o $@/catalog-v001.xml
# Step 2. Create ontologies-merged.ttl by merging the ontologies in mirror with the Ubergraph axioms.
# Unsatisfiable classes are dumped into the debug file (debug.ofn), which is not created if there are
# no problems.
#FIXME stop disabling disjoint checks
ontologies-merged.ttl: ontologies.ofn ubergraph-axioms.ofn mirror
	$(ROBOT) merge --catalog mirror/catalog-v001.xml --include-annotations true \
	-i $< -i ubergraph-axioms.ofn \
	remove --axioms 'disjoint' --trim true --preserve-structure false \
	remove --term 'owl:Nothing' --trim true --preserve-structure false \
	remove --term 'http://purl.obolibrary.org/obo/caro#part_of' --term 'http://purl.obolibrary.org/obo/caro#develops_from' --trim true --preserve-structure false \
	reason -r ELK -D debug.ofn -o $@
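# (robot chains these commands left to right over a single in-memory ontology: merge the
# inputs, apply the three remove operations, then reason with ELK and write the result out.)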
# Step 5. Download all the Noctua models.
noctua-models.dir:
	git clone --depth 1 https://github.com/geneontology/noctua-models && touch $@
aop-models.dir:
	git clone --depth 1 https://github.com/ExposuresProvider/noctua-models.git aop-models && touch $@
# Step 6. Merge all the Noctua models. For each model, the script:
# - loads the model
# - identifies the ontology IRI
# - filters out non-production/non-Reactome models
# - outputs n-quads where the graph is the ontology IRI
noctua-models.nq: noctua-models.dir scripts/merge_noctua_models.sc
	$(SCALA_RUN) scripts/merge_noctua_models.sc -- noctua-models/models $@
aop-models.nq: aop-models.dir scripts/merge_noctua_models.sc
	$(SCALA_RUN) scripts/merge_noctua_models.sc -- aop-models/models $@
# Step 7. Prepare the Signor models.
# TODO: replace with a Scala script similar to the one for noctua-models.nq.
signor-models.nq: signor-models
	rm -f $@.jnl &&\
	$(BLAZEGRAPH-RUNNER) load --journal=$@.jnl --properties=blazegraph.properties --informat=turtle --use-ontology-graph=true signor-models &&\
	$(BLAZEGRAPH-RUNNER) update --journal=$@.jnl --properties=blazegraph.properties sparql/set-provenance-to-signor.ru &&\
	$(BLAZEGRAPH-RUNNER) dump --journal=$@.jnl --properties=blazegraph.properties --outformat=n-quads $@ && rm $@.jnl
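# (The three blazegraph-runner calls above share a temporary journal, $@.jnl: the models
# are loaded into it, their provenance is rewritten by the SPARQL update, and the whole
# store is dumped as n-quads before the journal is deleted.)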
# Step 8. Download CTD file.
CTD_chem_gene_ixns_structured.xml:
	curl -L -O 'http://ctdbase.org/reports/CTD_chem_gene_ixns_structured.xml.gz' &&\
	gunzip CTD_chem_gene_ixns_structured.xml.gz
# Step 9. Generate an n-quads file from CTD.
# (ctd-to-owl is in the Docker container that we use).
ctd-models.nq: CTD_chem_gene_ixns_structured.xml
	$(JAVA_ENV) ctd-to-owl CTD_chem_gene_ixns_structured.xml $@ chebi_mesh.tsv
# Step 10. Concatenate all RDF files into quad.facts using a single riot invocation, so
# that blank nodes from different files don't collide. Each quad carries a graph IRI that
# tells you where the quad came from.
quad.facts: noctua-models.nq aop-models.nq ctd-models.nq
	riot -q --output=N-Quads $^ | sed 's/ /\t/' | sed 's/ /\t/' | sed -E 's/\t(.+) (.+) \.$$/\t\1\t\2/' >$@
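# A worked sketch on a hypothetical quad (\t marks a tab):
#   riot emits:          <s> <p> <o> <g> .
#   after the two seds:  <s>\t<p>\t<o> <g> .
#   after the final sed: <s>\t<p>\t<o>\t<g>
# The extended-regex sed rewrites the remaining "<o> <g> ." tail into two tab-separated
# columns, yielding the four-column TSV that the Souffle programs read.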
# Step 11. Reason over quad.facts. This:
# - 1. Loads the ontology from ontology.dir.
# - 2. Reads asserted triples from quad.facts.
# - 3. Creates an inferred graph for every asserted graph, adding the newly inferred triples.
#
# This uses a bunch of rules:
# - All the OWL RL rules (but with graphs to prevent inferencing between graphs).
#   (Note that ontology-related rules use triples but data-related rules use quads.)
#
# Note that the output file -- inferred.csv -- is actually a TSV file, since Souffle
# writes tab-separated output by default.
inferred.csv: quad.facts ontology.dir owlrl-datalog/bin/owl_rl_abox_quads
	./owlrl-datalog/bin/owl_rl_abox_quads
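# (owl_rl_abox_quads is likewise run with no arguments: which inputs it reads -- quad.facts
# and the ontology/ relations -- and where inferred.csv lands are fixed by the Datalog
# program's IO directives and Souffle's working-directory defaults.)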
# Step 12. Download the Biolink model.
biolink-model.owl.ttl:
	curl -L 'https://raw.githubusercontent.com/biolink/biolink-model/$(BIOLINK)/project/owl/biolink_model.owl.ttl' -o $@
# Step 13. Convert Biolink model into an n-triples file.
biolink.facts: biolink-model.owl.ttl
	riot -q --syntax=turtle --output=ntriples $< | sed 's/ /\t/' | sed 's/ /\t/' | sed 's/ \.$$//' >$@
# Step 14. Download the Biolink Model prefix map.
biolink-model-prefix-map.json:
	curl -L 'https://raw.githubusercontent.com/biolink/biolink-model/$(BIOLINK)/project/prefixmap/biolink_model_prefix_map.json' -o $@
# Step 15. Load all the data and ontologies.
# - ./scripts/kg_edges: compiled from ./scripts/kg_edges.dl with Souffle (see above).
# - inferred.csv: All inferred quads.
# - quad.facts: All asserted quads.
# - biolink.facts: Biolink model.
# - ontology.facts: only used to convert REACTOME identifiers into UniProtKB identifiers.
# - Also uses: ro-to-biolink-local-mappings.tsv to map from RO to Biolink.
# - TODO: add as a prereq
# Creates a TSV file named kg_edge.csv with five columns:
# - subj: direct type of subject
# - pred: Biolink predicate
# - obj: direct type of object
# - ps: primary_source
# - prov: graph that this is coming from (without brackets -- if it had brackets, it would
# be ignored by scripts/compact_iris.sc)
kg_edge.csv: scripts/kg_edges inferred.csv quad.facts biolink.facts ontology.facts
	./scripts/kg_edges -j ${CORES}
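# An illustrative, entirely made-up kg_edge.csv row (columns separated by tabs):
#   http://purl.obolibrary.org/obo/CHEBI_15377  biolink:affects  http://identifiers.org/uniprot/P12345  infores:ctd  http://model.example.org/123
# Real values depend on the loaded models; see scripts/kg_edges.dl for the actual schema.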
# Step 16. Compact IRIs in the kg_edge.csv file using the specified prefixes.
kg.tsv: kg_edge.csv scripts/compact_iris.sc biolink-model-prefix-map.json supplemental-namespaces.json
	$(SCALA_RUN) scripts/compact_iris.sc -- biolink-model-prefix-map.json supplemental-namespaces.json kg_edge.csv $@
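# An illustrative compaction, assuming the OBO prefixes present in the Biolink prefix map:
#   http://purl.obolibrary.org/obo/CHEBI_15377  ->  CHEBI:15377
# supplemental-namespaces.json supplies any prefixes the Biolink map lacks; see
# scripts/compact_iris.sc for how unmatched IRIs are handled.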