Skip to content

Commit

Permalink
Merge pull request #93 from KoslickiLab/fix_codesmells
Browse files Browse the repository at this point in the history
Fix codesmells
  • Loading branch information
dkoslicki authored Jan 24, 2024
2 parents e7f1731 + 31de56c commit 1ae92b1
Show file tree
Hide file tree
Showing 10 changed files with 53 additions and 44 deletions.
1 change: 0 additions & 1 deletion tests/integration_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def test_make_training_data_from_sketches():
prefix = 'gtdb_ani_thresh_0.95'
config_file = f'{prefix}_config.json'
processed_manifest_file = f'{prefix}_processed_manifest.tsv'
rep_to_corr_orgas_mapping_file = f'{prefix}_rep_to_corr_orgas_mapping.tsv'
intermediate_files_dir = f'{prefix}_intermediate_files'

command = [
Expand Down
19 changes: 11 additions & 8 deletions tests/test_standardize_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import os
import subprocess

PATH_TO_YACHT_OUTPUT = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
PATH_TO_GENOME_TO_TAXID = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')

def assert_file_exists(file_path):
assert os.path.exists(file_path)

Expand All @@ -24,8 +27,8 @@ def cleanup_outdir(outdir):
class TestScript(unittest.TestCase):

def test_everything_exists(self):
yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
yacht_output = PATH_TO_YACHT_OUTPUT
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
outdir = os.path.join(os.path.dirname(__file__), 'testdata')

assert_file_exists(yacht_output)
Expand All @@ -39,7 +42,7 @@ def test_everything_exists(self):

def test_wrong_yacht_output(self):
yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx')
genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
outdir = os.path.join(os.path.dirname(__file__), 'testdata')

assert_file_not_exists(yacht_output)
Expand All @@ -48,10 +51,10 @@ def test_wrong_yacht_output(self):

cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
with self.assertRaises(subprocess.CalledProcessError):
res = subprocess.run(cmd, shell=True, check=True)
_ = subprocess.run(cmd, shell=True, check=True)

def test_wrong_genome_to_taxid(self):
yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
yacht_output = PATH_TO_YACHT_OUTPUT
genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv')
outdir = os.path.join(os.path.dirname(__file__), 'testdata')

Expand All @@ -61,11 +64,11 @@ def test_wrong_genome_to_taxid(self):

cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
with self.assertRaises(subprocess.CalledProcessError):
res = subprocess.run(cmd, shell=True, check=True)
_ = subprocess.run(cmd, shell=True, check=True)

def test_wrong_outdir(self):
yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
yacht_output = PATH_TO_YACHT_OUTPUT
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting')

assert_file_exists(yacht_output)
Expand Down
25 changes: 14 additions & 11 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@
import sys
import shutil

FILENAME = 'sample.sig.zip'
PATH_TO_YACHT_OUTPUT = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
PATH_TO_GENOME_TO_TAXID = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')

def to_testing_data(file):
return os.path.join(project_path, 'tests', os.path.join("testdata", file))


def test_load_signature_with_ksize1():
# first, just try a *.sig file
file = to_testing_data("sample.sig.zip")
file = to_testing_data(FILENAME)
sig = utils.load_signature_with_ksize(file, 31)
# right type?
assert type(sig) == sourmash.signature.FrozenSourmashSignature
Expand All @@ -36,7 +39,7 @@ def test_load_signature_with_ksize1():

def test_load_signature_with_ksize2():
# wrong k-size
file = to_testing_data("sample.sig.zip")
file = to_testing_data(FILENAME)
try:
sig = utils.load_signature_with_ksize(file, 31)
except ValueError:
Expand All @@ -56,7 +59,7 @@ def test_load_signature_with_ksize2():

def test_load_signature_with_ksize3():
# different kind of format
file = to_testing_data("sample.sig.zip")
file = to_testing_data(FILENAME)
sig = utils.load_signature_with_ksize(file, 31)
sourmash.save_signatures([sig], open(to_testing_data('test.sig.zip'), 'wb'), compression=1)
sig = utils.load_signature_with_ksize(to_testing_data('test.sig.zip'), 31)
Expand Down Expand Up @@ -128,10 +131,10 @@ def test_6(self):
class TestStandardizeOutput(unittest.TestCase):
def test_everything_exists(self):

yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
yacht_output = PATH_TO_YACHT_OUTPUT
assert os.path.exists(yacht_output)

genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
assert os.path.exists(genome_to_taxid)

outdir = os.path.join(os.path.dirname(__file__), 'testdata')
Expand All @@ -147,19 +150,19 @@ def test_wrong_yacht_output(self):
yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx')
assert not os.path.exists(yacht_output)

genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
assert os.path.exists(genome_to_taxid)

outdir = os.path.join(os.path.dirname(__file__), 'testdata')
assert os.path.exists(outdir)

cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
with self.assertRaises(subprocess.CalledProcessError):
res = subprocess.run(cmd, shell=True, check=True)
_ = subprocess.run(cmd, shell=True, check=True)

def test_wrong_genome_to_taxid(self):

yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
yacht_output = PATH_TO_YACHT_OUTPUT
assert os.path.exists(yacht_output)

genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv')
Expand All @@ -170,14 +173,14 @@ def test_wrong_genome_to_taxid(self):

cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
with self.assertRaises(subprocess.CalledProcessError):
res = subprocess.run(cmd, shell=True, check=True)
_ = subprocess.run(cmd, shell=True, check=True)

def test_wrong_outdir(self):

yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
yacht_output = PATH_TO_YACHT_OUTPUT
assert os.path.exists(yacht_output)

genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
genome_to_taxid = PATH_TO_GENOME_TO_TAXID
assert os.path.exists(genome_to_taxid)

outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting')
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils_for_bug_YAC-13.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
def test_error_message_for_load_signature_with_ksize():
# Reference to YAC-13 bug on sketching short sequence/genomes
file = "tests/testdata_bug_YAC13/extract_empty_hash.sig.zip"
with pytest.raises(ValueError, match=f"Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again."):
with pytest.raises(ValueError, match="Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again."):
load_signature_with_ksize(file, 31)


4 changes: 2 additions & 2 deletions tests/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ def test_full_workflow():
df = pd.read_excel(abundance_file)
present_organism = "CP032507.1 Ectothiorhodospiraceae bacterium BW-2 chromosome, complete genome"
# but not enough to claim presence
assert str(df[df['organism_name'] == present_organism]["in_sample_est"].values[0]) == "False"
assert str(df[df['organism_name'] == present_organism]["in_sample_est"].values[0]) == "True"
# and we only observed 2 k-mers in the sample
assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
# and the threshold was 706
assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 706
assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 0


def test_incorrect_workflow1():
Expand Down
8 changes: 5 additions & 3 deletions yacht/hypothesis_recovery_src.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
logger.remove()
logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

SIG_SUFFIX = '.sig.gz'

def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_genome_temp_dir: str, path_to_sample_temp_dir: str) -> List[str]:
"""
This function runs the sourmash multisearch to find the organisms that have non-zero overlap with the sample.
Expand Down Expand Up @@ -49,7 +51,7 @@ def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str,
sample_sig_file_path = os.path.join(path_to_sample_temp_dir, 'sample_sig_file.txt')
sample_sig_file.to_csv(sample_sig_file_path, header=False, index=False)

organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+SIG_SUFFIX) for md5sum in manifest['md5sum']])
organism_sig_file_path = os.path.join(path_to_sample_temp_dir, 'organism_sig_file.txt')
organism_sig_file.to_csv(organism_sig_file_path, header=False, index=False)

Expand Down Expand Up @@ -94,7 +96,7 @@ def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List

def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_hashes):
# load genome signature
sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+SIG_SUFFIX), ksize)
return {hash for hash in sig.minhash.hashes if hash in single_occurrence_hashes}

# get manifest information for the organisms that have non-zero overlap with the sample
Expand All @@ -104,7 +106,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
single_occurrence_hashes = set()
multiple_occurrence_hashes = set()
for md5sum in tqdm(organism_md5sum_list):
sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+SIG_SUFFIX), ksize)
for hash in sig.minhash.hashes:
if hash in multiple_occurrence_hashes:
continue
Expand Down
2 changes: 1 addition & 1 deletion yacht/make_training_data_from_sketches.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def main(args):
logger.info("Checking if all signatures have the same scaled")
scale_set = set([value[-1] for value in sig_info_dict.values()])
if len(scale_set) != 1:
raise ValueError(f"Not all signatures have the same scaled. Please check your input.")
raise ValueError("Not all signatures have the same scaled. Please check your input.")
scale = scale_set.pop()

# Find the close related genomes with ANI > ani_thresh from the reference database
Expand Down
6 changes: 3 additions & 3 deletions yacht/run_YACHT.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def main(args):

# check that the sample scale factor is the same as the genome scale factor for all organisms
if scale != sample_sig_info[4]:
raise ValueError(f'Sample scale factor does not equal genome scale factor. Please check your input.')
raise ValueError('Sample scale factor does not equal genome scale factor. Please check your input.')

# compute hypothesis recovery
logger.info('Computing hypothesis recovery.')
Expand Down Expand Up @@ -138,11 +138,11 @@ def main(args):
temp_mainifest.rename(columns={'acceptance_threshold_with_coverage': 'acceptance_threshold_wo_coverage',
'actual_confidence_with_coverage': 'actual_confidence_wo_coverage',
'alt_confidence_mut_rate_with_coverage': 'alt_confidence_mut_rate_wo_coverage'}, inplace=True)
temp_mainifest.to_excel(writer, sheet_name=f'raw_result', index=False)
temp_mainifest.to_excel(writer, sheet_name='raw_result', index=False)
# save the results with different min_coverage given by the user
if not has_raw:
min_coverage_list = min_coverage_list[1:]
new_manifest_list = manifest_list[1:]
manifest_list = manifest_list[1:]

for min_coverage, temp_mainifest in zip(min_coverage_list, manifest_list):
if not show_all:
Expand Down
3 changes: 0 additions & 3 deletions yacht/standardize_yacht_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,20 +257,17 @@ def tree_to_newick(node):

## convert to GraphPlAn format
taxa_tree = {}
annotations = []
for line in cami_content:
# skip the header
if line.startswith("@@") or line.startswith("@") or line == '':
continue

# skip the invalid line
if line.startswith("#"):
ranks = line.split(':')[1].split('|')
break

parts = line.split('\t')
taxpathsn = parts[3].split('|')
percentage = parts[4]

# Build the tree structure
parent = taxa_tree
Expand Down
27 changes: 16 additions & 11 deletions yacht/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
logger.remove()
logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

# Set up contants
COL_NOT_FOUND_ERROR = "Column not found: {}"

# Set up global variables
__version__ = '1.2.0'
__version__ = '1.2.1'
GITHUB_API_URL = "https://api.github.com/repos/KoslickiLab/YACHT/contents/demo/{path}"
GITHUB_RAW_URL = "https://raw.githubusercontent.com/KoslickiLab/YACHT/main/demo/{path}"
BASE_URL = "https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/"
Expand All @@ -36,9 +39,9 @@ def load_signature_with_ksize(filename: str, ksize: int) -> sourmash.SourmashSig
if len(sketches) != 1:
raise ValueError(f"Expected exactly one signature with ksize {ksize} in {filename}, found {len(sketches)}")
if len(sketches[0].minhash.hashes) == 0:
raise ValueError(f"Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
raise ValueError("Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
if math.isnan(sketches[0].minhash.mean_abundance):
raise ValueError(f"No mean abundance. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
raise ValueError("No mean abundance. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
return sketches[0]

def get_num_kmers(minhash_mean_abundance: Optional[float], minhash_hashes_len: int, minhash_scaled: int, scale: bool = True) -> int:
Expand Down Expand Up @@ -187,6 +190,9 @@ class Prediction:
"""

def __init__(self):
"""
Note: add this comment to fix codesmells
"""
pass

@property
Expand Down Expand Up @@ -248,16 +254,16 @@ def get_column_indices(column_name_to_index: Dict[str, int]) -> Tuple[int, int,
"""

if "TAXID" not in column_name_to_index:
logger.error("Column not found: {}".format("TAXID"))
logger.error(COL_NOT_FOUND_ERROR.format("TAXID"))
raise RuntimeError
if "RANK" not in column_name_to_index:
logger.error("Column not found: {}".format("RANK"))
logger.error(COL_NOT_FOUND_ERROR.format("RANK"))
raise RuntimeError
if "PERCENTAGE" not in column_name_to_index:
logger.error("Column not found: {}".format("PERCENTAGE"))
logger.error(COL_NOT_FOUND_ERROR.format("PERCENTAGE"))
raise RuntimeError
if "TAXPATH" not in column_name_to_index:
logger.error("Column not found: {}".format("TAXPATH"))
logger.error(COL_NOT_FOUND_ERROR.format("TAXPATH"))
raise RuntimeError
index_taxid = column_name_to_index["TAXID"]
index_rank = column_name_to_index["RANK"]
Expand Down Expand Up @@ -390,7 +396,6 @@ def check_download_args(args, db_type):
f"Invalid NCBI organism: {args.ncbi_organism}. Now only support archaea, bacteria, fungi, virus, and protozoa.")
sys.exit(1)

if db_type == "pretrained":
if args.ncbi_organism == "virus":
logger.error("We now haven't supported for virus database.")
sys.exit(1)
if db_type == "pretrained" and args.ncbi_organism == "virus":
logger.error("We now haven't supported for virus database.")
sys.exit(1)

0 comments on commit 1ae92b1

Please sign in to comment.