Commit eb1c27a
make genome and sample separately have their intermediate folders
chunyuma committed Oct 19, 2023
1 parent 773107f commit eb1c27a
Showing 3 changed files with 28 additions and 21 deletions.
2 changes: 1 addition & 1 deletion make_training_data_from_sketches.py
@@ -95,7 +95,7 @@
 json_file_path = os.path.join(outdir, f'{prefix}_config.json')
 json.dump({'manifest_file_path': manifest_file_path,
 'rep_remove_df_path': rep_remove_df_path,
-'pathogen_detection_intermediate_files_dir': path_to_temp_dir,
+'intermediate_files_dir': path_to_temp_dir,
 'scale': scale,
 'ksize': ksize,
 'ani_thresh': ani_thresh}, open(json_file_path, 'w'), indent=4)
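
For context, only the key name changes here; the value still points at the intermediate directory produced by the training step. A minimal sketch (not part of the commit) of what a generated <prefix>_config.json would then contain; every path and parameter value below is made up for illustration:

    # Sketch of the config written by make_training_data_from_sketches.py
    # after this change; all values are hypothetical.
    import json

    config = {
        "manifest_file_path": "outdir/demo_manifest.tsv",                   # hypothetical
        "rep_remove_df_path": "outdir/demo_rep_to_corr_orgas_mapping.tsv",  # hypothetical
        "intermediate_files_dir": "outdir/demo_intermediate_files",         # renamed key
        "scale": 1000,
        "ksize": 31,
        "ani_thresh": 0.95,
    }
    print(json.dumps(config, indent=4))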
4 changes: 2 additions & 2 deletions run_YACHT.py
@@ -52,7 +52,7 @@
 # load the config file, ksize, and ani_thresh
 config = json.load(open(json_file_path, 'r'))
 manifest_file_path = config['manifest_file_path']
-path_to_temp_dir = config['pathogen_detection_intermediate_files_dir']
+path_to_genome_temp_dir = config['intermediate_files_dir']
 scale = config['scale']
 ksize = config['ksize']
 ani_thresh = config['ani_thresh']
@@ -100,7 +100,7 @@
 else:
     has_raw = True

-manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)
+manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_genome_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)

 # remove unnecessary columns
 remove_cols = ['md5sum', 'sample_scale_factor']
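A compatibility note, sketched below: run_YACHT.py reads the renamed key directly, so a config file written by a training run from before this commit (which still carries 'pathogen_detection_intermediate_files_dir') would fail with a KeyError and needs to be regenerated. The config path in the sketch is hypothetical.

    # Consuming side after this change (simplified; hypothetical path).
    import json

    config = json.load(open("outdir/demo_config.json", "r"))
    # KeyError for configs written before this commit, which used the key
    # 'pathogen_detection_intermediate_files_dir':
    path_to_genome_temp_dir = config["intermediate_files_dir"]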
43 changes: 25 additions & 18 deletions srcs/hypothesis_recovery_src.py
@@ -16,7 +16,7 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

-def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_temp_dir: str) -> List[str]:
+def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_genome_temp_dir: str, path_to_sample_temp_dir: str) -> List[str]:
 """
 This function runs the sourmash multisearch to find the organisms that have non-zero overlap with the sample.
 :param manifest: a dataframe with the following columns:
@@ -33,39 +33,39 @@ def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str,
 :param scale: int (scale factor)
 :param ksize: string (size of kmer)
 :param num_threads: int (number of threads to use for parallelization)
-:param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+:param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
+:param path_to_sample_temp_dir: string (path to the sample temporary directory)
 :return: a list of organism names that have non-zero overlap with the sample
 """
 # run the sourmash multisearch
 # prepare the input files for the sourmash multisearch
 # unzip the sourmash signature file to the temporary directory
 logger.info("Unzipping the sample signature zip file")
-sample_dir = os.path.dirname(sample_file)
 with zipfile.ZipFile(sample_file, 'r') as sample_zip_file:
-    sample_zip_file.extractall(sample_dir)
+    sample_zip_file.extractall(path_to_sample_temp_dir)

-sample_sig_file = pd.DataFrame([os.path.join(sample_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(sample_dir, 'signatures'))])
-sample_sig_file_path = os.path.join(path_to_temp_dir, 'sample_sig_file.txt')
+sample_sig_file = pd.DataFrame([os.path.join(path_to_sample_temp_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(path_to_sample_temp_dir, 'signatures'))])
+sample_sig_file_path = os.path.join(path_to_sample_temp_dir, 'sample_sig_file.txt')
 sample_sig_file.to_csv(sample_sig_file_path, header=False, index=False)

-organism_sig_file = pd.DataFrame([os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
-organism_sig_file_path = os.path.join(path_to_temp_dir, 'organism_sig_file.txt')
+organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
+organism_sig_file_path = os.path.join(path_to_sample_temp_dir, 'organism_sig_file.txt')
 organism_sig_file.to_csv(organism_sig_file_path, header=False, index=False)

 # run the sourmash multisearch
-cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv')}"
+cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv')}"
 logger.info(f"Running sourmash multisearch with command: {cmd}")
 exit_code = os.system(cmd)
 if exit_code != 0:
     raise ValueError(f"Error running sourmash multisearch with command: {cmd}")

 # read the multisearch result
-multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
+multisearch_result = pd.read_csv(os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
 multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True)

 return multisearch_result['match_name'].to_list()
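
To make the new division of labor concrete, here is roughly how this function's inputs and outputs land on disk for a hypothetical sample: only the genome signature list points into the training directory, while everything the search itself reads from a list or writes goes to the per-sample directory. All paths and parameter values below are illustrative, and the command is just the f-string above rendered with those values.

    # Illustrative layout only; every path below is made up.
    #
    # genome side (created once by the training step):
    #   outdir/demo_intermediate_files/signatures/<md5sum>.sig.gz
    #
    # sample side (created per run_YACHT.py invocation):
    #   data/sample_sample1_intermediate_files/sample_sig_file.txt
    #   data/sample_sample1_intermediate_files/organism_sig_file.txt
    #   data/sample_sample1_intermediate_files/sample_multisearch_result.csv
    #
    # with scale=1000, ksize=31, num_threads=32 the rendered command would be roughly:
    #   sourmash scripts multisearch \
    #       data/sample_sample1_intermediate_files/sample_sig_file.txt \
    #       data/sample_sample1_intermediate_files/organism_sig_file.txt \
    #       -s 1000 -k 31 -c 32 -t 0 \
    #       -o data/sample_sample1_intermediate_files/sample_multisearch_result.csv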

-def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
+def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_genome_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
 """
 This function gets the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample, and
 then finds how many are in the sample.
@@ -83,7 +83,7 @@ def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List
 :param sample_sig: the sample signature
 :param ksize: int (size of kmer)
 :param num_threads: int (number of threads to use for parallelization)
-:param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+:param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
 :return:
     a list of tuples, each tuple contains the following information:
     1. the number of unique hashes exclusive to the organism under consideration
@@ -103,7 +103,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
 single_occurrence_hashes = set()
 multiple_occurrence_hashes = set()
 for md5sum in tqdm(organism_md5sum_list):
-    sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
+    sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
     for hash in sig.minhash.hashes:
         if hash in multiple_occurrence_hashes:
             continue
@@ -118,7 +118,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
 logger.info("Finding hashes that are unique to each organism")
 exclusive_hashes_org = []
 for md5sum in tqdm(organism_md5sum_list, desc='Finding exclusive hashes'):
-    exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_hashes))
+    exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_genome_temp_dir, ksize, single_occurrence_hashes))
 del single_occurrence_hashes  # free up memory

 # Get sample hashes
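
The single/multiple-occurrence bookkeeping in this hunk is the heart of the exclusivity computation, and only its source directory changes here. A self-contained sketch of the same pattern on toy hash sets (the real code loads each organism's hashes from <genome temp dir>/signatures/<md5sum>.sig.gz, so the dictionary below is made up):

    # Toy reproduction of the exclusive-hash bookkeeping; org_hashes is fake data.
    org_hashes = {
        "org_a": {1, 2, 3},
        "org_b": {3, 4},
        "org_c": {5},
    }

    single, multiple = set(), set()
    for hashes in org_hashes.values():
        for h in hashes:
            if h in multiple:
                continue          # already known to be shared
            if h in single:
                single.remove(h)  # second sighting: demote to shared
                multiple.add(h)
            else:
                single.add(h)     # first sighting

    # hashes unique to exactly one organism
    exclusive = {name: hashes & single for name, hashes in org_hashes.items()}
    # exclusive == {'org_a': {1, 2}, 'org_b': {4}, 'org_c': {5}}

A hash survives in single only if exactly one organism contains it, so intersecting each organism's hash set with single yields that organism's exclusive hashes.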
@@ -214,7 +214,7 @@ def single_hyp_test(
 def hypothesis_recovery(
     manifest: pd.DataFrame,
     sample_info_set: Tuple[str, sourmash.SourmashSignature],
-    path_to_temp_dir: str,
+    path_to_genome_temp_dir: str,
     min_coverage_list: List[float],
     scale: int,
     ksize: int,
@@ -237,7 +237,7 @@ def hypothesis_recovery(
     'sample_scale_factor',
     'min_coverage'
 :param sample_info_set: a set of information about the sample, including the sample signature location and the sample signature object
-:param path_to_temp_dir: path to the temporary directory generated by the training step
+:param path_to_genome_temp_dir: path to the genome temporary directory generated by the training step
 :param min_coverage_list: a list of minimum coverage values
 :param scale: scale factor
 :param ksize: k-mer size
@@ -250,11 +250,18 @@ def hypothesis_recovery(
 # unpack the sample info set
 sample_file, sample_sig = sample_info_set

+# create a temporary directory for the sample
+sample_dir = os.path.dirname(sample_file)
+sample_name = os.path.basename(sample_file).replace('.sig.zip', '')
+path_to_sample_temp_dir = os.path.join(sample_dir, f'sample_{sample_name}_intermediate_files')
+if not os.path.exists(path_to_sample_temp_dir):
+    os.makedirs(path_to_sample_temp_dir)
+
 # Find the organisms that have non-zero overlap with the sample
-nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_temp_dir)
+nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_genome_temp_dir, path_to_sample_temp_dir)

 # Get the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample
-exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_temp_dir)
+exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_genome_temp_dir)

 # Set up the results dataframe columns
 given_columns = [
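The per-sample directory is derived from the sample signature's own location, so runs against the same training data no longer write their multisearch scratch files into one shared genome directory. A sketch of the naming scheme introduced in hypothesis_recovery(), using a hypothetical sample path:

    # Sketch of the per-sample directory naming; the sample path is made up.
    import os

    sample_file = "data/runs/patient42.sig.zip"  # hypothetical
    sample_dir = os.path.dirname(sample_file)
    sample_name = os.path.basename(sample_file).replace(".sig.zip", "")
    path_to_sample_temp_dir = os.path.join(sample_dir, f"sample_{sample_name}_intermediate_files")
    print(path_to_sample_temp_dir)  # -> data/runs/sample_patient42_intermediate_files
    os.makedirs(path_to_sample_temp_dir, exist_ok=True)  # exist_ok mirrors the os.path.exists guard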
