-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile
70 lines (54 loc) · 3.37 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
# `final_files` only aggregates targets and runs no command, so it is declared
# a local rule (executed by the main Snakemake process, never submitted as a
# cluster job).
localrules: final_files

# MODE selects the corpus and working directory:
#   full - read BioC XML from the directory named by $BIOTEXT, work in 'working'
#   test - read the bundled 'test_data' directory, work in 'test_working'
assert os.getenv('MODE') in ['full','test'], "Must set environmental variable MODE to full or test"

if os.getenv('MODE') == 'full':
    source_dir = os.getenv('BIOTEXT')
    assert source_dir and os.path.isdir(source_dir), "For full run, must set environmental variable BIOTEXT to directory with BIOTEXT BioC XML files"
    # Drop trailing slashes so paths built as f"{source_dir}/..." never contain '//'.
    source_dir = source_dir.rstrip('/')
    work_dir = 'working'
elif os.getenv('MODE') == 'test':
    source_dir = 'test_data'
    work_dir = 'test_working'

# One knowledge-base TSV target per BioC XML input file.
# Only entries whose name ends in '.bioc.xml' are used: any other entry in
# source_dir (stray files, sub-directories) would otherwise become a kb target
# that no rule can build, because parse_and_find_entities requires
# "{source_dir}/{f}.bioc.xml". The suffix is stripped from the end of the name
# only, and the listing is sorted because os.listdir() returns entries in
# arbitrary order.
bioc_suffix = '.bioc.xml'
kb_files = [
    '%s/kb/%s.tsv' % (work_dir, name[:-len(bioc_suffix)])
    for name in sorted(os.listdir(source_dir))
    if name.endswith(bioc_suffix)
]

# Compressed deliverables requested by the `final_files` rule.
final_files = [ f"{work_dir}/{f}" for f in ['civicmine_unfiltered.tsv.gz','civicmine_collated.tsv.gz','civicmine_sentences.tsv.gz'] ]
# Aggregate target: requests every path in the module-level `final_files`
# list (the three gzipped civicmine outputs) and runs no command of its own.
rule final_files:
    input:
        final_files
# Runs buildModels.sh and, only if it succeeds, creates the flag file
# "models.flag" that downstream rules depend on.
rule build_models:
    output:
        "models.flag"
    shell:
        "sh buildModels.sh"
        " && touch {output}"
# Downloads the Zenodo record 10.5281/zenodo.1286661 into
# {work_dir}/biowordlists with zenodo_get, then touches a flag file so other
# rules can depend on the download as a whole.
rule get_biowordlists:
    output:
        f"{work_dir}/biowordlists.flag"
    shell:
        f"mkdir -p {work_dir}/biowordlists"
        f" && zenodo_get -o {work_dir}/biowordlists https://doi.org/10.5281/zenodo.1286661"
        f" && touch {{output}}"
# Runs pediatric/modifyCancerList.py on the downloaded terms_cancers.tsv
# together with pediatric/syndromes.tsv and pediatric/ph_cancers.tsv, writing
# {work_dir}/terms_cancers_pediatric.tsv. Depends on the biowordlists flag
# because the --old file lives inside the downloaded directory.
rule prep_pediatric_cancerlist:
    input:
        f"{work_dir}/biowordlists.flag"
    output:
        f"{work_dir}/terms_cancers_pediatric.tsv"
    shell:
        f"python pediatric/modifyCancerList.py"
        f" --old {work_dir}/biowordlists/terms_cancers.tsv"
        f" --syndromes pediatric/syndromes.tsv"
        f" --extra pediatric/ph_cancers.tsv"
        f" --outFile {{output}}"
# Runs wordlistLoader.py over the gene, cancer (pediatric-modified), drug,
# conflicting and variant term lists and writes the result to
# {work_dir}/civicmine_terms.pickle.
rule prepare_wordlist:
    input:
        f"{work_dir}/biowordlists.flag",
        f"{work_dir}/terms_cancers_pediatric.tsv"
    output:
        f"{work_dir}/civicmine_terms.pickle"
    shell:
        f"python wordlistLoader.py"
        f" --genes {work_dir}/biowordlists/terms_genes.tsv"
        f" --cancers {work_dir}/terms_cancers_pediatric.tsv"
        f" --drugs {work_dir}/biowordlists/terms_drugs.tsv"
        f" --conflicting {work_dir}/biowordlists/terms_conflicting.tsv"
        f" --variants {work_dir}/biowordlists/terms_variants.tsv"
        f" --wordlistPickle {{output}}"
# Per-input-file rule (wildcard `f` is the BioC file name without its
# '.bioc.xml' suffix): runs findSentences.py on one BioC XML file with the
# pickled word list and writes {work_dir}/sentenceData/{f}.json.
rule parse_and_find_entities:
    input:
        biocxml=f"{source_dir}/{{f}}.bioc.xml",
        wordlist=f"{work_dir}/civicmine_terms.pickle"
    output:
        f"{work_dir}/sentenceData/{{f}}.json"
    shell:
        f"python findSentences.py"
        f" --biocFile {{input.biocxml}}"
        f" --filterTerms filterTerms.txt"
        f" --wordlistPickle {{input.wordlist}}"
        f" --variantStopwords stopwords_variants.txt"
        f" --outSentencesFilename {{output}}"
# Per-input-file rule: runs applyModelsToSentences.py with the five relation
# models on one sentence JSON file and writes {work_dir}/kb/{f}.tsv.
# NOTE(review): the command also reads term lists under
# {work_dir}/biowordlists/ and {work_dir}/terms_cancers_pediatric.tsv that are
# not declared as inputs here; they are presumably guaranteed to exist because
# the word-list pickle is built from them - confirm before restructuring.
rule apply_models_to_sentences:
    input:
        sentences=f"{work_dir}/sentenceData/{{f}}.json",
        wordlist=f"{work_dir}/civicmine_terms.pickle",
        models="models.flag"
    output:
        f"{work_dir}/kb/{{f}}.tsv"
    shell:
        f"python applyModelsToSentences.py"
        f" --models models/Diagnostic.model,models/Predictive.model,models/Prognostic.model,models/Predisposing.model,models/AssociatedVariant.model"
        f" --filterTerms filterTerms.txt"
        f" --wordlistPickle {{input.wordlist}}"
        f" --genes {work_dir}/biowordlists/terms_genes.tsv"
        f" --cancerTypes {work_dir}/terms_cancers_pediatric.tsv"
        f" --drugs {work_dir}/biowordlists/terms_drugs.tsv"
        f" --variants {work_dir}/biowordlists/terms_variants.tsv"
        f" --variantStopwords stopwords_variants.txt"
        f" --sentenceFile {{input.sentences}}"
        f" --outData {{output}}"
# Waits for every per-file kb TSV listed in `kb_files`, then runs
# filterAndCollate.py to produce the unfiltered, collated and sentence-level
# tables. The script is pointed at the whole {work_dir}/kb/ directory rather
# than at the individual input paths.
rule filter_and_collated:
    input:
        kb_files
    output:
        unfiltered=f"{work_dir}/civicmine_unfiltered.tsv",
        collated=f"{work_dir}/civicmine_collated.tsv",
        sentences=f"{work_dir}/civicmine_sentences.tsv",
    shell:
        f"python filterAndCollate.py"
        f" --inData {work_dir}/kb/"
        f" --outUnfiltered {{output.unfiltered}}"
        f" --outCollated {{output.collated}}"
        f" --outSentences {{output.sentences}}"
# Generic compression rule: any requested "<path>.gz" is produced from
# "<path>" by streaming it through gzip; `-c` writes to stdout so the
# uncompressed input file is left in place.
rule gzip:
    input:
        "{f}"
    output:
        "{f}.gz"
    shell:
        "gzip -c {input}"
        " > {output}"