Commit: Merge branch 'main' into YAC-46-tests

Showing 35 changed files with 285,633 additions and 509 deletions.
@@ -0,0 +1,5 @@
### A toy sample for YACHT demonstration
The reference genomes are 15 genomes randomly selected from the GTDB r207 representative set.
The query FASTQ file is simulated from 5 of these genomes with BBMap's `randomreads.sh`, using `coverage=0.5 len=100 metagenome`.
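For orientation, a minimal sketch of how such a simulation could be launched from Python, assuming BBMap's `randomreads.sh` is on `PATH`; the genome and output paths below are placeholders rather than the exact files used to build this toy dataset, and only the `coverage=0.5 len=100 metagenome` parameters come from the description above.

```python
import subprocess
from pathlib import Path

# Placeholder input/output paths; the toy dataset was built from 5 of the reference genomes.
ref_fasta = Path("ref_genomes/GCF_018918045.1_genomic.fna.gz")
out_fastq = Path("simulated_reads.fastq.gz")

# Mirror the parameters stated above: 0.5x coverage, 100 bp reads, metagenome mode.
subprocess.run(
    ["randomreads.sh",          # BBMap must be installed and on PATH
     f"ref={ref_fasta}",
     f"out={out_fastq}",
     "coverage=0.5",
     "len=100",
     "metagenome"],
    check=True,
)
```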
Binary files (15) not shown.
@@ -0,0 +1,16 @@
name,genome_filename,protein_filename
GCF_018918045.1,ref_genomes/GCF_018918045.1_genomic.fna.gz,
GCF_018918095.1,ref_genomes/GCF_018918095.1_genomic.fna.gz,
GCF_018918125.1,ref_genomes/GCF_018918125.1_genomic.fna.gz,
GCF_018918185.1,ref_genomes/GCF_018918185.1_genomic.fna.gz,
GCF_018918235.1,ref_genomes/GCF_018918235.1_genomic.fna.gz,
GCF_018918285.1,ref_genomes/GCF_018918285.1_genomic.fna.gz,
GCF_018918345.1,ref_genomes/GCF_018918345.1_genomic.fna.gz,
GCF_907163105.1,ref_genomes/GCF_907163105.1_genomic.fna.gz,
GCF_907163115.1,ref_genomes/GCF_907163115.1_genomic.fna.gz,
GCF_907163125.1,ref_genomes/GCF_907163125.1_genomic.fna.gz,
GCF_907163135.1,ref_genomes/GCF_907163135.1_genomic.fna.gz,
GCF_907164845.1,ref_genomes/GCF_907164845.1_genomic.fna.gz,
GCF_907164905.1,ref_genomes/GCF_907164905.1_genomic.fna.gz,
GCF_907165045.1,ref_genomes/GCF_907165045.1_genomic.fna.gz,
GCF_907165215.1,ref_genomes/GCF_907165215.1_genomic.fna.gz,
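This CSV follows the `name,genome_filename,protein_filename` layout accepted by `sourmash sketch fromfile`, with the protein column left empty. A small sketch for sanity-checking such a file before sketching, assuming it is saved as `ref_paths.csv` (the actual filename in the repository may differ):

```python
import pandas as pd
from pathlib import Path

# Load the genome-path CSV shown above (the filename here is an assumption).
ref_paths = pd.read_csv("ref_paths.csv")

# Expect the three sourmash-style columns and existing genome files.
assert list(ref_paths.columns) == ["name", "genome_filename", "protein_filename"]
missing = [p for p in ref_paths["genome_filename"] if not Path(p).exists()]
if missing:
    raise FileNotFoundError(f"{len(missing)} genome files are missing, e.g. {missing[0]}")
print(f"{len(ref_paths)} reference genomes listed")
```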
@@ -0,0 +1,16 @@
genome_id taxid
GCF_018918045.1 2841525
GCF_018918095.1 2841523
GCF_018918125.1 2841517
GCF_018918185.1 2841519
GCF_018918235.1 2841513
GCF_018918285.1 2841512
GCF_018918345.1 2841509
GCF_907163105.1 2004710
GCF_907163115.1 316
GCF_907163125.1 2004710
GCF_907163135.1 28220
GCF_907164845.1 2822368
GCF_907164905.1 2822344
GCF_907165045.1 2823330
GCF_907165215.1 76633
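A short sketch of how this mapping might be loaded and queried, assuming the table is whitespace- or tab-delimited and saved as `toy_genome_to_taxid.tsv` (the filename is hypothetical):

```python
import pandas as pd

# Load the genome-to-taxid mapping shown above; filename and delimiter are assumptions.
taxid_map = pd.read_csv("toy_genome_to_taxid.tsv", sep=r"\s+")

# Several genomes can share a taxid (e.g. 2004710 appears twice above).
print(taxid_map["taxid"].nunique(), "distinct taxids across", len(taxid_map), "genomes")

genome_to_taxid = dict(zip(taxid_map["genome_id"], taxid_map["taxid"]))
print(genome_to_taxid["GCF_018918045.1"])  # -> 2841525
```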
@@ -1,21 +1,19 @@
Output Column,Description
organism_name,Name of organism
original_index,Index in original reference
processed_index,Index in reference after preprocessing
num_unique_kmers_in_genome_sketch,Count of distinct k-mers in sketched genome
num_total_kmers_in_genome_sketch,Total number of k-mers in genome sketch
genome_scale_factor,Scale factor for genome sketch
num_total_kmers_in_sample_sketch,Total number of k-mers in sample sketch
num_unique_kmers_in_sample_sketch,Count of distinct k-mers in sketched sample
sample_scale_factor,Scale factor for sample sketch
scale_factor,Scale factor
num_exclusive_kmers_in_sample_sketch,Count of distinct k-mers in sketched sample
num_total_kmers_in_sample_sketch,Total number of k-mers in sample sketch
min_coverage,Minimum coverage parameter
nontrivial_overlap,Boolean indicating whether genome shares at least one k-mer with sample
in_sample_est,Main output: Boolean indicating whether genome is present in sample
num_exclusive_kmers,Number of k-mers that are exclusive to genome
num_exclusive_kmers_with_coverage,"Number of k-mers exclusive to genome, multiplied by min_coverage parameter"
in_sample_est,Boolean indicating whether genome is present in sample
p_vals,Probability of observing this or more extreme result at ANI threshold
num_exclusive_kmers_to_genome,Number of k-mers that are exclusive to genome
num_exclusive_kmers_to_genome_coverage,"Number of k-mers exclusive to genome, multiplied by min_coverage parameter"
num_matches,Size of intersection between exclusive k-mers and sample
acceptance_threshold_wo_coverage,Acceptance threshold without adjusting for coverage
acceptance_threshold_with_coverage,Acceptance threshold adjusted for coverage
p_vals,Probability of observing this or more extreme result at ANI threshold
alt_confidence_mut_rate,"Mutation rate such that at this mutation rate, false positive rate = p_val. Does not account for min_coverage parameter."
alt_confidence_mut_rate_coverage,"Mutation rate such that at this mutation rate, false positive rate = p_val, accounting for min_coverage parameter."
actual_confidence_wo_coverage,Actual confidence without adjusting for coverage
actual_confidence_with_coverage,Actual confidence adjusted for coverage
alt_confidence_mut_rate_wo_coverage,"Mutation rate such that at this mutation rate, false positive rate = p_val. Does not account for min_coverage parameter."
alt_confidence_mut_rate_with_coverage,"Mutation rate such that at this mutation rate, false positive rate = p_val, accounting for min_coverage parameter."
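The column names above are the practical interface to YACHT's detection results. As an illustration, a minimal sketch of filtering a result table for detected organisms, assuming the results have been exported to a CSV named `yacht_results.csv` (both the filename and the CSV export are assumptions; only the column names come from the table above):

```python
import pandas as pd

# Hypothetical export of a YACHT run; column names are taken from the table above.
results = pd.read_csv("yacht_results.csv")

# Keep only organisms YACHT estimates to be present, with their supporting evidence.
detected = results[results["in_sample_est"] == True][
    ["organism_name", "num_matches", "acceptance_threshold_with_coverage", "p_vals"]
]
print(detected.sort_values("num_matches", ascending=False).to_string(index=False))
```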
This file was deleted.
@@ -0,0 +1,22 @@
name: yacht_env
channels:
  - bioconda
  - conda-forge
  - defaults
dependencies:
  - python>3.6
  - sourmash>=4.8.3,<5
  - rust
  - scipy
  - numpy
  - pandas
  - scikit-learn
  - pytest
  - loguru
  - maturin>=1,<2
  - tqdm
  - biom-format
  - pytaxonkit
  - pip:
    - openpyxl
    - git+https://github.com/sourmash-bio/[email protected]
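Once the environment is created and activated, a quick import smoke test (a sketch, not part of the repository) can confirm that the core dependencies listed above resolved correctly:

```python
# Minimal smoke test for the environment above: import the main dependencies
# and report their versions. Run inside the activated yacht_env.
import importlib

for pkg in ["sourmash", "scipy", "numpy", "pandas", "sklearn", "loguru", "tqdm"]:
    module = importlib.import_module(pkg)
    print(f"{pkg:10s} {getattr(module, '__version__', 'unknown')}")
```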
@@ -1,66 +1,106 @@
#!/usr/bin/env python
import sys
import os, sys
import sourmash
import argparse
import zipfile
from pathlib import Path
from scipy.sparse import save_npz
import pandas as pd
import srcs.utils as utils
from loguru import logger
import json
import shutil
logger.remove()
logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="This script converts a collection of signature files into a reference database matrix.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--ref_file', help='Location of the Sourmash signature file. '
                        'This is expected to be in Zipfile format (eg. *.sig.zip)', required=True)
    parser.add_argument('--ksize', type=int, help='Size of kmers in sketch since Zipfiles '
                        'can contain multiple k-mer sizes', required=True)
    parser.add_argument('--ref_file', help='Location of the Sourmash signature database file. '
                        'This is expected to be in Zipfile format (eg. *.zip) '
                        'that contains a manifest "SOURMASH-MANIFEST.csv" and a folder "signatures" '
                        'with all Gzip-format signature files (eg. *.sig.gz)', required=True)
    parser.add_argument('--ksize', type=int, help='Size of kmers in sketch, since Zipfiles can contain multiple k-mer sizes', required=True)
    parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False, default=16)
    parser.add_argument('--ani_thresh', type=float, help='Mutation cutoff for species equivalence.',
                        required=False, default=0.95)
    parser.add_argument('--out_prefix', help='Location and prefix for output files.', required=True)
    parser.add_argument('--prefix', help='Prefix for this experiment.', required=False, default='yacht')
    parser.add_argument('--outdir', type=str, help='Path to output directory.', required=False, default=os.getcwd())
    parser.add_argument('--force', action='store_true', help='Overwrite the output directory if it exists.')
    args = parser.parse_args()

    # get the arguments
    ref_file = args.ref_file
    ref_file = str(Path(args.ref_file).absolute())
    ksize = args.ksize
    num_threads = args.num_threads
    ani_thresh = args.ani_thresh
    out_prefix = args.out_prefix
    prefix = args.prefix
    outdir = str(Path(args.outdir).absolute())
    force = args.force

    # load the signatures
    logger.info(f"Loading signatures from {ref_file}")
    signatures = sourmash.load_file_as_signatures(ref_file)
    signature_count = utils.count_files_in_zip(ref_file) - 1
    # make sure the reference database file exists and is valid
    logger.info("Checking reference database file")
    if os.path.splitext(ref_file)[1] != '.zip':
        raise ValueError(f"Reference database file {ref_file} is not a zip file. Please provide a Sourmash signature database file in Zipfile format.")
    utils.check_file_existence(str(Path(ref_file).absolute()), f'Reference database zip file {ref_file} does not exist.')

    # DONE: do signature size checking, converting to sourmash list and generating the reference matrix at the same time
    # check that all signatures have the same ksize as the one provided
    # signatures_mismatch_ksize returns False (if all signatures have the same kmer size)
    # or True (if a signature with a different kmer size is found)
    # convert signatures to reference matrix (rows are hashes/kmers, columns are organisms)
    logger.info("Converting signatures to reference matrix")
    signatures, ref_matrix, hashes, is_mismatch = utils.signatures_to_ref_matrix(signatures, ksize, signature_count)
    if is_mismatch:
        raise ValueError(f"Not all signatures from sourmash signature file {ref_file} have the given ksize {ksize}")
    # Create a temporary directory for intermediate files
    logger.info("Creating a temporary directory")
    path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files')
    if os.path.exists(path_to_temp_dir) and not force:
        raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or give a new prefix name using the '--prefix' parameter.")
    else:
        # remove the temporary directory if it exists
        if os.path.exists(path_to_temp_dir):
            logger.warning(f"Temporary directory {path_to_temp_dir} already exists. Removing it.")
            shutil.rmtree(path_to_temp_dir)
        os.makedirs(path_to_temp_dir, exist_ok=True)

    # unzip the sourmash signature file to the temporary directory
    logger.info("Unzipping the sourmash signature file to the temporary directory")
    with zipfile.ZipFile(ref_file, 'r') as sourmash_db:
        sourmash_db.extractall(path_to_temp_dir)

    # remove 'same' organisms: any organisms with ANI > ani_thresh are considered the same organism
    logger.info("Removing 'same' organisms with ANI > ani_thresh")
    processed_ref_matrix, uncorr_org_idx = utils.get_uncorr_ref(ref_matrix, ksize, ani_thresh)
    save_npz(f'{out_prefix}_ref_matrix_processed.npz', processed_ref_matrix)
    # Extract signature information
    logger.info("Extracting signature information")
    sig_info_dict = utils.collect_signature_info(num_threads, ksize, path_to_temp_dir)
    # check if all signatures have the same ksize and scaled value
    logger.info("Checking if all signatures have the same scaled value")
    scale_set = set([value[-1] for value in sig_info_dict.values()])
    if len(scale_set) != 1:
        raise ValueError("Not all signatures have the same scaled value. Please check your input.")
    scale = scale_set.pop()

    # write out hash-to-row-indices file
    logger.info("Writing out hash-to-row-indices file")
    utils.write_hashes(f'{out_prefix}_hash_to_col_idx.pkl', hashes)
    # Find the closely related genomes with ANI > ani_thresh in the reference database
    logger.info("Finding the closely related genomes with ANI > ani_thresh in the reference database")
    multisearch_result = utils.run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir)

    # write out organism manifest (original index, processed index, num unique kmers, num total kmers, scale factor)
    logger.info("Writing out organism manifest")
    utils.write_processed_indices(f'{out_prefix}_processed_org_idx.csv', signatures, uncorr_org_idx)
    # remove the closely related organisms: any organisms with ANI > ani_thresh
    # keep only the one with the largest number of unique k-mers among each group of closely related organisms
    logger.info("Removing the closely related organisms with ANI > ani_thresh")
    remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result)

    # save the k-mer size and ani threshold to a json file
    logger.info("Saving k-mer size and ani threshold to json file")
    json.dump({'reference_matrix_path': str(Path(f'{out_prefix}_ref_matrix_processed.npz').resolve()),
               'hash_to_idx_path': str(Path(f'{out_prefix}_hash_to_col_idx.pkl').resolve()),
               'processed_org_file_path': str(Path(f'{out_prefix}_processed_org_idx.csv').resolve()),
    # write out the manifest file
    logger.info("Writing out the manifest file")
    manifest_file_path = os.path.join(outdir, f'{prefix}_processed_manifest.tsv')
    manifest_df.to_csv(manifest_file_path, sep='\t', index=None)

    # write out a mapping dataframe from representative organism to the closely related organisms
    logger.info("Writing out a mapping dataframe from representative organism to the closely related organisms")
    if len(remove_corr_df) == 0:
        logger.warning("No closely related organisms found.")
        remove_corr_df_indicator = ""
    else:
        remove_corr_df_path = os.path.join(outdir, f'{prefix}_removed_orgs_to_corr_orgas_mapping.tsv')
        remove_corr_df.to_csv(remove_corr_df_path, sep='\t', index=None)
        remove_corr_df_indicator = remove_corr_df_path

    # save the config file
    logger.info("Saving the config file")
    json_file_path = os.path.join(outdir, f'{prefix}_config.json')
    json.dump({'manifest_file_path': manifest_file_path,
               'remove_cor_df_path': remove_corr_df_indicator,
               'intermediate_files_dir': path_to_temp_dir,
               'scale': scale,
               'ksize': ksize,
               'ani_thresh': ani_thresh}, open(f'{out_prefix}_config.json', 'w'), indent=4)
               'ani_thresh': ani_thresh}, open(json_file_path, 'w'), indent=4)
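Downstream steps read the config file written at the end of the new code path. A minimal sketch of consuming it, assuming the default `--prefix yacht` so the file is `<outdir>/yacht_config.json` (the exact paths depend on how the script was invoked):

```python
import json
import pandas as pd

# Load the config written by the script above (path is an assumption based on the default prefix).
with open("yacht_config.json") as f:
    config = json.load(f)

print("ksize:", config["ksize"], "scale:", config["scale"], "ani_thresh:", config["ani_thresh"])

# The processed manifest is a TSV listing the reference organisms retained after deduplication.
manifest = pd.read_csv(config["manifest_file_path"], sep="\t")
print(manifest.head())
```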