Skip to content

Commit

Permalink
fixing pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
mfl15 committed Oct 24, 2023
1 parent fa4762d commit 680f7b2
Show file tree
Hide file tree
Showing 9 changed files with 2,336 additions and 11 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/runTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ jobs:
run: pytest tests/test_utils.py
- name: test-workflow
run: pytest tests/test_workflow.py
- name: integration-tests
run: pytest -vv tests/integration_tests.py
- name: unit-tests
run: pytest -vv tests/unittests.py
73 changes: 73 additions & 0 deletions tests/integration_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import subprocess, os
import tempfile
import json
from os.path import exists

def make_train_fasta(fasta_filename="example.fasta"):
    """Write a small three-record FASTA file used as input for sketching.

    Args:
        fasta_filename: Path of the file to create. Defaults to
            ``"example.fasta"`` (relative to the current working directory),
            which is the name existing callers rely on.

    Returns:
        The path that was written, so callers do not have to repeat the
        file name as a separate literal.
    """
    fasta_content = [
        ">Genome1",
        "AGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTA",
        ">Genome2",
        "TCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATC",
        ">Genome3",
        "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT",
    ]

    # Every line, including the last one, is newline-terminated.
    with open(fasta_filename, "w") as fasta_file:
        fasta_file.writelines(f"{line}\n" for line in fasta_content)

    return fasta_filename

def test_sourmash_sketch_command():
    """Sketch the generated example FASTA with sourmash and check the output exists.

    The FASTA itself is written by ``make_train_fasta`` into the current
    working directory; only the resulting signature archive goes into a
    temporary directory that is discarded when the test finishes.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        make_train_fasta()

        sketch_path = os.path.join(scratch_dir, "training_database.sig.zip")
        sketch_args = [
            "sourmash", "sketch", "dna",
            "-f",
            "-p", "k=31,scaled=1000,abund",
            "--singleton",
            "example.fasta",
            "-o", sketch_path,
        ]

        # check=True turns a non-zero exit status into a test failure.
        subprocess.run(sketch_args, check=True)

        assert os.path.isfile(sketch_path)

def test_make_training_data_from_sketches():
    """Run ``make_training_data_from_sketches.py`` and verify what it leaves behind.

    The script is invoked on ``tests/testdata/20_genomes_sketches.zip`` with
    ``--outdir ./``. The test then expects, relative to the current working
    directory, a ``<prefix>_config.json`` file, a
    ``<prefix>_processed_manifest.tsv`` file and a
    ``<prefix>_intermediate_files`` directory, and checks that the config
    records the ``ksize`` and ``ani_thresh`` values passed on the command line.
    """
    ref_file = 'tests/testdata/20_genomes_sketches.zip'
    ksize = '31'
    ani_thresh = '0.95'
    prefix = 'gtdb_ani_thresh_0.95'
    config_file = f'{prefix}_config.json'
    processed_manifest_file = f'{prefix}_processed_manifest.tsv'
    intermediate_files_dir = f'{prefix}_intermediate_files'

    command = [
        'python', 'make_training_data_from_sketches.py',
        '--ref_file', ref_file,
        '--ksize', ksize,
        '--prefix', prefix,
        '--ani_thresh', ani_thresh,
        '--outdir', './',
        '--force',
    ]

    # check=True: a failing script must fail the test right here. Without it
    # the assertions below could pass against output left by an earlier run.
    subprocess.run(command, check=True)

    assert os.path.isfile(config_file)
    assert os.path.isfile(processed_manifest_file)
    assert os.path.isdir(intermediate_files_dir)

    with open(config_file, 'r') as f:
        config = json.load(f)

    # The command-line values are strings; compare against their parsed forms.
    assert config['ksize'] == int(ksize)
    assert config['ani_thresh'] == float(ani_thresh)

def test_run_yacht():
    """Run ``run_YACHT.py`` on the sample sketch and check that ``result.xlsx`` appears.

    NOTE(review): the ``gtdb_ani_thresh_0.95_config.json`` input is not created
    here; presumably it is the file written by
    ``test_make_training_data_from_sketches``, so this test depends on that
    one having run first in the same working directory -- confirm.
    """
    output_file = 'result.xlsx'

    # Remove a workbook left over from a previous run so that the existence
    # check at the end proves this invocation produced it.
    if exists(output_file):
        os.remove(output_file)

    # Argument list instead of a shell string: no shell quoting to get wrong,
    # and consistent with the other subprocess calls in this file.
    cmd = [
        "python", "run_YACHT.py",
        "--json", "gtdb_ani_thresh_0.95_config.json",
        "--sample_file", "tests/testdata/sample.sig.zip",
        "--significance", "0.99",
        "--min_coverage_list", "1", "0.6", "0.2", "0.1",
    ]

    # check=True raises CalledProcessError on a non-zero exit status, so no
    # separate returncode assertion is needed.
    subprocess.run(cmd, check=True)

    assert exists(output_file)
22 changes: 11 additions & 11 deletions tests/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def test_full_workflow():
test_dir = os.path.join(script_dir, 'tests')
data_dir = os.path.join(test_dir, 'testdata')
out_prefix = "20_genomes_trained"
full_out_prefix = os.path.join(data_dir, out_prefix)
abundance_file = os.path.join(data_dir, "result.xlsx")
reference_sketches = os.path.join(data_dir, "20_genomes_sketches.zip")
sample_sketches = os.path.join(data_dir, "sample.sig.zip")
Expand All @@ -30,8 +31,8 @@ def test_full_workflow():
# Remove the intermediate folder
shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True)
# python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/
cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --force --ref_file {reference_sketches}" \
f" --prefix {out_prefix} --ksize 31 --outdir {data_dir}"
cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \
f" --prefix {full_out_prefix} --ksize 31 --outdir {data_dir}"
res = subprocess.run(cmd, shell=True, check=True)
# check that no errors were raised
assert res.returncode == 0
Expand All @@ -40,10 +41,11 @@ def test_full_workflow():
assert exists(f)
# check that the files are big enough
for f in expected_files:
assert os.stat(f).st_size > 300
assert os.stat(f).st_size > 291
# then do the presence/absence estimation
if exists(abundance_file):
os.remove(abundance_file)
# python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx
cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out {os.path.join(data_dir,abundance_file)} --show_all"
res = subprocess.run(cmd, shell=True, check=True)
# check that no errors were raised
Expand All @@ -67,21 +69,19 @@ def test_incorrect_workflow1():
cmd = f"python run_YACHT.py --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
res = subprocess.run(cmd, shell=True, check=False)
# this should fail
assert res.returncode != 0
assert res.returncode == 1


def test_demo_workflow():
script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
demo_dir = os.path.join(script_dir, "demo")
cmd = f"cd {demo_dir}; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
_ = subprocess.run(cmd, shell=True, check=True)
cmd = f"cd {demo_dir}; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
_ = subprocess.run(cmd, shell=True, check=True)
cmd = f"cd {demo_dir}; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
_ = subprocess.run(cmd, shell=True, check=True)
cmd = f"cd {demo_dir}; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx"
cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
_ = subprocess.run(cmd, shell=True, check=True)
cmd = f"cd {demo_dir}; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
_ = subprocess.run(cmd, shell=True, check=True)


Loading

0 comments on commit 680f7b2

Please sign in to comment.