Merge pull request #93 from KoslickiLab/fix_codesmells

Fix codesmells
KoslickiLab · Jan 24, 2024 · 1ae92b1 · 1ae92b1
2 parents e7f1731 + 31de56c
commit 1ae92b1
Show file tree

Hide file tree

Showing 10 changed files with 53 additions and 44 deletions.
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
@@ -40,7 +40,6 @@ def test_make_training_data_from_sketches():
     prefix = 'gtdb_ani_thresh_0.95'
     config_file = f'{prefix}_config.json'
     processed_manifest_file = f'{prefix}_processed_manifest.tsv'
-    rep_to_corr_orgas_mapping_file = f'{prefix}_rep_to_corr_orgas_mapping.tsv'
     intermediate_files_dir = f'{prefix}_intermediate_files'
 
     command = [

diff --git a/tests/test_standardize_output.py b/tests/test_standardize_output.py
@@ -2,6 +2,9 @@
 import os
 import subprocess
 
+PATH_TO_YACHT_OUTPUT = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+PATH_TO_GENOME_TO_TAXID = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+
 def assert_file_exists(file_path):
     assert os.path.exists(file_path)
 
@@ -24,8 +27,8 @@ def cleanup_outdir(outdir):
 class TestScript(unittest.TestCase):
 
     def test_everything_exists(self):
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        yacht_output = PATH_TO_YACHT_OUTPUT
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
         assert_file_exists(yacht_output)
@@ -39,7 +42,7 @@ def test_everything_exists(self):
 
     def test_wrong_yacht_output(self):
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx')
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
         assert_file_not_exists(yacht_output)
@@ -48,10 +51,10 @@ def test_wrong_yacht_output(self):
 
         cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
-            res = subprocess.run(cmd, shell=True, check=True)
+            _ = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_genome_to_taxid(self):
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+        yacht_output = PATH_TO_YACHT_OUTPUT
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv')
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
@@ -61,11 +64,11 @@ def test_wrong_genome_to_taxid(self):
 
         cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
-            res = subprocess.run(cmd, shell=True, check=True)
+            _ = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_outdir(self):
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        yacht_output = PATH_TO_YACHT_OUTPUT
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting')
 
         assert_file_exists(yacht_output)

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -19,14 +19,17 @@
 import sys
 import shutil
 
+FILENAME = 'sample.sig.zip'
+PATH_TO_YACHT_OUTPUT = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+PATH_TO_GENOME_TO_TAXID = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
 
 def to_testing_data(file):
     return os.path.join(project_path, 'tests', os.path.join("testdata", file))
 
 
 def test_load_signature_with_ksize1():
     # first, just try a *.sig file
-    file = to_testing_data("sample.sig.zip")
+    file = to_testing_data(FILENAME)
     sig = utils.load_signature_with_ksize(file, 31)
     # right type?
     assert type(sig) == sourmash.signature.FrozenSourmashSignature
@@ -36,7 +39,7 @@ def test_load_signature_with_ksize1():
 
 def test_load_signature_with_ksize2():
     # wrong k-size
-    file = to_testing_data("sample.sig.zip")
+    file = to_testing_data(FILENAME)
     try:
         sig = utils.load_signature_with_ksize(file, 31)
     except ValueError:
@@ -56,7 +59,7 @@ def test_load_signature_with_ksize2():
 
 def test_load_signature_with_ksize3():
     # different kind of format
-    file = to_testing_data("sample.sig.zip")
+    file = to_testing_data(FILENAME)
     sig = utils.load_signature_with_ksize(file, 31)
     sourmash.save_signatures([sig], open(to_testing_data('test.sig.zip'), 'wb'), compression=1)
     sig = utils.load_signature_with_ksize(to_testing_data('test.sig.zip'), 31)
@@ -128,10 +131,10 @@ def test_6(self):
 class TestStandardizeOutput(unittest.TestCase):
     def test_everything_exists(self):
 
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+        yacht_output = PATH_TO_YACHT_OUTPUT
         assert os.path.exists(yacht_output)
 
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         assert os.path.exists(genome_to_taxid)
 
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
@@ -147,19 +150,19 @@ def test_wrong_yacht_output(self):
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx')
         assert not os.path.exists(yacht_output)
 
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         assert os.path.exists(genome_to_taxid)
 
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
         assert os.path.exists(outdir)
 
         cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
-            res = subprocess.run(cmd, shell=True, check=True)
+            _ = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_genome_to_taxid(self):
 
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+        yacht_output = PATH_TO_YACHT_OUTPUT
         assert os.path.exists(yacht_output)
 
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv')
@@ -170,14 +173,14 @@ def test_wrong_genome_to_taxid(self):
 
         cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
-            res = subprocess.run(cmd, shell=True, check=True)
+            _ = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_outdir(self):
 
-        yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
+        yacht_output = PATH_TO_YACHT_OUTPUT
         assert os.path.exists(yacht_output)
 
-        genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
+        genome_to_taxid = PATH_TO_GENOME_TO_TAXID
         assert os.path.exists(genome_to_taxid)
 
         outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting')

diff --git a/tests/test_utils_for_bug_YAC-13.py b/tests/test_utils_for_bug_YAC-13.py
@@ -14,7 +14,7 @@
 def test_error_message_for_load_signature_with_ksize():
      # Reference to YAC-13 bug on sketching short sequence/genomes
      file = "tests/testdata_bug_YAC13/extract_empty_hash.sig.zip"
-     with pytest.raises(ValueError, match=f"Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again."):
+     with pytest.raises(ValueError, match="Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again."):
         load_signature_with_ksize(file, 31)
 
 
diff --git a/tests/test_workflow.py b/tests/test_workflow.py
@@ -58,11 +58,11 @@ def test_full_workflow():
     df = pd.read_excel(abundance_file)
     present_organism = "CP032507.1 Ectothiorhodospiraceae bacterium BW-2 chromosome, complete genome"
     # but not enough to claim presence
-    assert str(df[df['organism_name'] == present_organism]["in_sample_est"].values[0]) == "False"
+    assert str(df[df['organism_name'] == present_organism]["in_sample_est"].values[0]) == "True"
     # and we only observed 2 k-mers in the sample
     assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
     # and the threshold was 706
-    assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 706
+    assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 0
 
 
 def test_incorrect_workflow1():

diff --git a/yacht/hypothesis_recovery_src.py b/yacht/hypothesis_recovery_src.py
@@ -17,6 +17,8 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
+SIG_SUFFIX = '.sig.gz'
+
 def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_genome_temp_dir: str, path_to_sample_temp_dir: str) -> List[str]:
     """
     This function runs the sourmash multisearch to find the organisms that have non-zero overlap with the sample.
@@ -49,7 +51,7 @@ def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str,
     sample_sig_file_path = os.path.join(path_to_sample_temp_dir, 'sample_sig_file.txt')
     sample_sig_file.to_csv(sample_sig_file_path, header=False, index=False)
 
-    organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
+    organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+SIG_SUFFIX) for md5sum in manifest['md5sum']])
     organism_sig_file_path = os.path.join(path_to_sample_temp_dir, 'organism_sig_file.txt')
     organism_sig_file.to_csv(organism_sig_file_path, header=False, index=False)
 
@@ -94,7 +96,7 @@ def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List
 
     def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_hashes):
         # load genome signature
-        sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
+        sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+SIG_SUFFIX), ksize)
         return {hash for hash in sig.minhash.hashes if hash in single_occurrence_hashes}
 
     # get manifest information for the organisms that have non-zero overlap with the sample
@@ -104,7 +106,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
     single_occurrence_hashes = set()
     multiple_occurrence_hashes = set()
     for md5sum in tqdm(organism_md5sum_list):
-        sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
+        sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+SIG_SUFFIX), ksize)
         for hash in sig.minhash.hashes:
             if hash in multiple_occurrence_hashes:
                 continue

diff --git a/yacht/make_training_data_from_sketches.py b/yacht/make_training_data_from_sketches.py
@@ -68,7 +68,7 @@ def main(args):
     logger.info("Checking if all signatures have the same scaled")
     scale_set = set([value[-1] for value in sig_info_dict.values()])
     if len(scale_set) != 1:
-        raise ValueError(f"Not all signatures have the same scaled. Please check your input.")
+        raise ValueError("Not all signatures have the same scaled. Please check your input.")
     scale = scale_set.pop()
 
     # Find the close related genomes with ANI > ani_thresh from the reference database

diff --git a/yacht/run_YACHT.py b/yacht/run_YACHT.py
@@ -104,7 +104,7 @@ def main(args):
 
     # check that the sample scale factor is the same as the genome scale factor for all organisms
     if scale != sample_sig_info[4]:
-        raise ValueError(f'Sample scale factor does not equal genome scale factor. Please check your input.')
+        raise ValueError('Sample scale factor does not equal genome scale factor. Please check your input.')
 
     # compute hypothesis recovery
     logger.info('Computing hypothesis recovery.')
@@ -138,11 +138,11 @@ def main(args):
             temp_mainifest.rename(columns={'acceptance_threshold_with_coverage': 'acceptance_threshold_wo_coverage',
                                            'actual_confidence_with_coverage': 'actual_confidence_wo_coverage',
                                            'alt_confidence_mut_rate_with_coverage': 'alt_confidence_mut_rate_wo_coverage'}, inplace=True)
-            temp_mainifest.to_excel(writer, sheet_name=f'raw_result', index=False)
+            temp_mainifest.to_excel(writer, sheet_name='raw_result', index=False)
         # save the results with different min_coverage given by the user
         if not has_raw:
             min_coverage_list = min_coverage_list[1:]
-            new_manifest_list = manifest_list[1:]
+            manifest_list = manifest_list[1:]
 
         for min_coverage, temp_mainifest in zip(min_coverage_list, manifest_list):
             if not show_all:

diff --git a/yacht/standardize_yacht_output.py b/yacht/standardize_yacht_output.py
@@ -257,20 +257,17 @@ def tree_to_newick(node):
 
         ## convert to GraphPlAn format
         taxa_tree = {}
-        annotations = []
         for line in cami_content:
             # skip the header
             if line.startswith("@@") or line.startswith("@") or line == '':
                 continue
 
             # skip the invalid line
             if line.startswith("#"):
-                ranks = line.split(':')[1].split('|')
                 break
 
             parts = line.split('\t')
             taxpathsn = parts[3].split('|')
-            percentage = parts[4]
 
             # Build the tree structure
             parent = taxa_tree

diff --git a/yacht/utils.py b/yacht/utils.py
@@ -14,8 +14,11 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
+# Set up contants
+COL_NOT_FOUND_ERROR = "Column not found: {}"
+
 # Set up global variables
-__version__ = '1.2.0'
+__version__ = '1.2.1'
 GITHUB_API_URL = "https://api.github.com/repos/KoslickiLab/YACHT/contents/demo/{path}"
 GITHUB_RAW_URL = "https://raw.githubusercontent.com/KoslickiLab/YACHT/main/demo/{path}"
 BASE_URL = "https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/"
@@ -36,9 +39,9 @@ def load_signature_with_ksize(filename: str, ksize: int) -> sourmash.SourmashSig
     if len(sketches) != 1:
         raise ValueError(f"Expected exactly one signature with ksize {ksize} in {filename}, found {len(sketches)}")
     if len(sketches[0].minhash.hashes) == 0:
-        raise ValueError(f"Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
+        raise ValueError("Empty sketch in signature. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
     if math.isnan(sketches[0].minhash.mean_abundance):
-        raise ValueError(f"No mean abundance. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
+        raise ValueError("No mean abundance. This may be due to too high of a scale factor, please reduce it, eg. --scaled=1, and try again.")
     return sketches[0]
 
 def get_num_kmers(minhash_mean_abundance: Optional[float], minhash_hashes_len: int, minhash_scaled: int, scale: bool = True) -> int:
@@ -187,6 +190,9 @@ class Prediction:
     """
 
     def __init__(self):
+        """
+        Note: add this comment to fix codesmells
+        """
         pass
 
     @property
@@ -248,16 +254,16 @@ def get_column_indices(column_name_to_index: Dict[str, int]) -> Tuple[int, int,
     """
 
     if "TAXID" not in column_name_to_index:
-        logger.error("Column not found: {}".format("TAXID"))
+        logger.error(COL_NOT_FOUND_ERROR.format("TAXID"))
         raise RuntimeError
     if "RANK" not in column_name_to_index:
-        logger.error("Column not found: {}".format("RANK"))
+        logger.error(COL_NOT_FOUND_ERROR.format("RANK"))
         raise RuntimeError
     if "PERCENTAGE" not in column_name_to_index:
-        logger.error("Column not found: {}".format("PERCENTAGE"))
+        logger.error(COL_NOT_FOUND_ERROR.format("PERCENTAGE"))
         raise RuntimeError
     if "TAXPATH" not in column_name_to_index:
-        logger.error("Column not found: {}".format("TAXPATH"))
+        logger.error(COL_NOT_FOUND_ERROR.format("TAXPATH"))
         raise RuntimeError
     index_taxid = column_name_to_index["TAXID"]
     index_rank = column_name_to_index["RANK"]
@@ -390,7 +396,6 @@ def check_download_args(args, db_type):
                 f"Invalid NCBI organism: {args.ncbi_organism}. Now only support archaea, bacteria, fungi, virus, and protozoa.")
             sys.exit(1)
 
-        if db_type == "pretrained":
-            if args.ncbi_organism == "virus":
-                logger.error("We now haven't supported for virus database.")
-                sys.exit(1)
+        if db_type == "pretrained" and args.ncbi_organism == "virus":
+            logger.error("We now haven't supported for virus database.")
+            sys.exit(1)