Merge pull request #41 from KoslickiLab/dmk
Fixing issues
dkoslicki authored Oct 19, 2023
2 parents d0f82c8 + 4127d95 commit edc6ba5
Showing 5 changed files with 55 additions and 76 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/runTest.yml
@@ -13,9 +13,9 @@ jobs:
 activate-environment: yacht_env
 environment-file: env/yacht_env.yml
 - name: make training data
-run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./
+run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
 - name: run YACHT
-run: python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1 --outdir ./
+run: python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
 - name: test-unit
 run: pytest tests/test_unit.py
 - name: test-utils
6 changes: 3 additions & 3 deletions README.md
@@ -18,13 +18,13 @@ cd demo
 
 # build k-mer sketches for the query sample and ref genomes
 sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq
-sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip
+sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists
 
 # preprocess the reference genomes (training step)
-python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./
+python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
 
 # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step)
-python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --outdir ./
+python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx
 
 # convert result to CAMI profile format (Optional)
 python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
6 changes: 4 additions & 2 deletions make_training_data_from_sketches.py
@@ -25,6 +25,7 @@
 required=False, default=0.95)
 parser.add_argument('--prefix', help='Prefix for this experiment.', required=False, default='yacht')
 parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd())
+parser.add_argument('--force', action='store_true', help='Overwrite the output directory if it exists')
 args = parser.parse_args()
 
 # get the arguments
@@ -34,6 +35,7 @@
 ani_thresh = args.ani_thresh
 prefix = args.prefix
 outdir = str(Path(args.outdir).absolute())
+force = args.force
 
 # make sure reference database file exist and valid
 logger.info("Checking reference database file")
@@ -44,9 +46,9 @@
 # Create a temporary directory with time info as label
 logger.info("Creating a temporary directory")
 path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files')
-if os.path.exists(path_to_temp_dir):
+if os.path.exists(path_to_temp_dir) and not force:
 raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.")
-os.makedirs(path_to_temp_dir)
+os.makedirs(path_to_temp_dir, exist_ok=True)
 
 # unzip the sourmash signature file to the temporary directory
 logger.info("Unzipping the sourmash signature file to the temporary directory")
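The practical effect of the new --force flag: a re-run of the training step no longer aborts when the intermediate directory from an earlier run is still on disk. A rough illustration, reusing the demo file names from the README diff above (a sketch for orientation, not commands from this commit):

# Second run without --force: aborts because demo_ani_thresh_0.95_intermediate_files/ already exists
python make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./

# Second run with --force: the existing intermediate directory is reused/overwritten and training proceeds
python make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force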
26 changes: 20 additions & 6 deletions run_YACHT.py
@@ -9,6 +9,7 @@
 import srcs.utils as utils
 import json
 import warnings
+import zipfile
 warnings.filterwarnings("ignore")
 from tqdm import tqdm
 from loguru import logger
@@ -31,8 +32,7 @@
 'Each value should be between 0 and 1, with 0 being the most sensitive (and least '
 'precise) and 1 being the most precise (and least sensitive).',
 required=False, default=[1, 0.5, 0.1, 0.05, 0.01])
-parser.add_argument('--out_filename', help='output filename', required=False, default='result.xlsx')
-parser.add_argument('--outdir', help='path to output directory', required=True)
+parser.add_argument('--out_filename', help='Full path of output filename', required=False, default='result.xlsx')
 
 # parse the arguments
 args = parser.parse_args()
@@ -44,7 +44,6 @@
 show_all = args.show_all # Show all organisms (no matter if present) in output file.
 min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample.
 out_filename = args.out_filename # output filename
-outdir = args.outdir # csv destination for results
 
 # check if the json file exists
 utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. '
@@ -57,6 +56,10 @@
 ksize = config['ksize']
 ani_thresh = config['ani_thresh']
 
+# Make sure the output can be written to
+if not os.access(os.path.abspath(os.path.dirname(out_filename)), os.W_OK):
+raise FileNotFoundError(f"Cannot write to the location: {os.path.abspath(os.path.dirname(out_filename))}.")
+
 # check if min_coverage is between 0 and 1
 for x in min_coverage_list:
 if not (0 <= x <= 1):
@@ -70,9 +73,20 @@
 logger.info('Loading the manifest file generated from the training data.')
 manifest = pd.read_csv(manifest_file_path, sep='\t', header=0)
 
+# check if there is a manifest in the sample sig file
+with zipfile.ZipFile(sample_file, 'r') as zip_file:
+if 'SOURMASH-MANIFEST.csv' not in zip_file.namelist():
+raise FileNotFoundError(f'The input file {sample_file} appears to be missing a manifest associated with it. '
+f'Try running: sourmash sig merge {sample_file} -o <new signature with the manifest present>. '
+f'And then run YACHT using the output of that command.')
+
 # load sample signature and its signature info
 logger.info('Loading sample signature and its signature info.')
-sample_sig = utils.load_signature_with_ksize(sample_file, ksize)
+try:
+sample_sig = utils.load_signature_with_ksize(sample_file, ksize)
+except ValueError:
+raise ValueError(f'Expected exactly one signature with ksize {ksize} in {sample_file}, found {len(sample_file)}. '
+f'Likely you will need to do something like: sourmash sig merge {sample_file} -o <new signature with just one sketch in it>.')
 sample_sig_info = utils.get_info_from_single_sig(sample_file, ksize)
 
 # add sample signature info to the manifest
@@ -112,9 +126,9 @@
 manifest_list = temp_manifest_list
 
 # save the results into Excel file
-logger.info(f'Saving results to {outdir}.')
+logger.info(f'Saving results to {os.path.dirname(out_filename)}.')
 # save the results with different min_coverage
-with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer:
+with pd.ExcelWriter(out_filename, engine='openpyxl', mode='w') as writer:
 # save the raw results (i.e., min_coverage=1.0)
 if keep_raw:
 temp_mainifest = manifest_list[0].copy()
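Taken together, run_YACHT.py now writes its Excel report to the path given by --out_filename (the --outdir option is removed), rejects a sample zip that lacks a SOURMASH-MANIFEST.csv, and gives a clearer error when the zip does not contain exactly one sketch at the requested ksize. A rough remediation sequence along the lines of the new error messages (file names here are placeholders, not from this commit):

# Re-export the sample so the zip gains a manifest and a single merged sketch
sourmash sig merge sample.sig.zip -o sample_fixed.sig.zip

# Re-run YACHT, pointing --out_filename at the report path instead of passing an output directory
python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file sample_fixed.sig.zip --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx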
89 changes: 26 additions & 63 deletions tests/test_workflow.py
@@ -24,36 +24,6 @@ def test_full_workflow():
 "training_sig_files.txt"])))
 # one of the signature files
 expected_files.extend(list(map(lambda x: os.path.join(data_dir, intermediate_dir, "signatures", x), ["04212e93c2172d4df49dc5d8c2973d8b.sig.gz"])))
-# In testdata/
-# 20_genomes_trained_config.json
-# 20_genomes_trained_processed_manifest.tsv
-# 20_genomes_trained_rep_to_corr_orgas_mapping.tsv
-# /testdata/20_genomes_trained_intermediate_files$ tree .
-# .
-# ├── signatures
-# │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz
-# │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz
-# │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz
-# │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz
-# │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz
-# │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz
-# │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz
-# │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz
-# │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz
-# │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz
-# │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz
-# │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz
-# │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz
-# │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz
-# │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz
-# │   ├── b691deddf179ead0a006527330d86dde.sig.gz
-# │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz
-# │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz
-# │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz
-# │   └── ce54d962851b0fdeefc624300036a133.sig.gz
-# ├── SOURMASH-MANIFEST.csv
-# ├── training_multisearch_result.csv
-# └── training_sig_files.txt
 # remove the files if they exist
 for f in expected_files:
 if exists(f):
@@ -75,39 +45,8 @@
 # then do the presence/absence estimation
 if exists(abundance_file):
 os.remove(abundance_file)
-# python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/
-cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_file {abundance_file} --show_all"
-print(cmd)
-# ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/
-# 20_genomes_trained_intermediate_files/
-# ├── organism_sig_file.txt # <-- new
-# ├── sample_multisearch_result.csv
-# ├── sample_sig_file.txt
-# ├── signatures
-# │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz
-# │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz
-# │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz
-# │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz
-# │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz
-# │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz
-# │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz
-# │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz
-# │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz
-# │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz
-# │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz
-# │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz
-# │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz
-# │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz
-# │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz
-# │   ├── b691deddf179ead0a006527330d86dde.sig.gz
-# │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz
-# │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz
-# │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz
-# │   └── ce54d962851b0fdeefc624300036a133.sig.gz
-# ├── SOURMASH-MANIFEST.csv
-# ├── training_multisearch_result.csv
-# └── training_sig_files.txt
-# print(cmd)
+# python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx
+cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all"
 res = subprocess.run(cmd, shell=True, check=True)
 # check that no errors were raised
 assert res.returncode == 0
@@ -122,3 +61,27 @@
 assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
 # and the threshold was 706
 assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 706
+
+
+def test_incorrect_workflow1():
+script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+demo_dir = os.path.join(script_dir, "demo")
+cmd = f"python run_YACHT.py --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
+res = subprocess.run(cmd, shell=True, check=False)
+# this should fail
+assert res.returncode == 1
+
+
+def test_demo_workflow():
+cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
+_ = subprocess.run(cmd, shell=True, check=True)
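The two new tests above cover the failure path (feeding the reference sketch zip in as the sample should make run_YACHT.py exit with return code 1) and the full demo pipeline end to end. To exercise them locally alongside the existing unit tests, something like the following should work from the repository root (mirroring the CI steps above):

pytest tests/test_unit.py
pytest tests/test_workflow.py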

