Updating unit tests, adding to pipeline (#24)

* Updating unit tests, adding to pipeline
* Fixing tests, deleting unused files

---------

Co-authored-by: David Koslicki <[email protected]>
KoslickiLab · Oct 3, 2023 · 47bf00a · 47bf00a
1 parent 271bbf8
commit 47bf00a
Show file tree

Hide file tree

Showing 8 changed files with 95 additions and 20 deletions.
diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
@@ -15,4 +15,10 @@ jobs:
     - name: make training data
       run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --out_prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95
     - name: run YACHT
-      run: python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'tests/testdata/sample.sig' --significance 0.99 --min_coverage 1 --outdir './'
+      run: python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'tests/testdata/sample.sig' --significance 0.99 --min_coverage 1 --outdir './'
+    - name: test-unit
+      run: pytest tests/test_unit.py
+    - name: test-utils
+      run: pytest tests/test_utils.py
+    - name: test-workflow
+      run: pytest tests/test_workflow.py
diff --git a/depreciated/old_code.py b/depreciated/old_code.py
@@ -70,7 +70,7 @@ def sample_vector_from_files(sig_filename, hash_filename, ksize):
     :return: numpy vector (sample vector y)
     """
     sample_sig = utils.load_signature_with_ksize(sig_filename, ksize)
-    hash_to_idx = utils.load_hashes(hash_filename)
+    hash_to_idx = utils.load_hashes_to_index(hash_filename)
     sample_vector, num_hash_diff_unique, num_hash_diff_total = sample_vector_from_signature(sample_sig, hash_to_idx)
     return sample_vector, sample_sig, num_hash_diff_unique, num_hash_diff_total
 

diff --git a/run_YACHT.py b/run_YACHT.py
@@ -71,7 +71,7 @@
     # load the training data
     logger.info('Loading reference matrix, hash to index dictionary, and organism data.')
     reference_matrix = load_npz(reference_matrix_path)
-    hash_to_idx = utils.load_hashes(hash_to_idx_path)
+    hash_to_idx = utils.load_hashes_to_index(hash_to_idx_path)
     organism_data = pd.read_csv(processed_org_file_path)
 
     logger.info('Loading sample signature.')

diff --git a/srcs/utils.py b/srcs/utils.py
@@ -10,7 +10,7 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
-def load_hashes(filename):
+def load_hashes_to_index(filename):
     """
     Helper function that loads the hash_to_col_idx.pkl file and returns a dictionary mapping hashes to indices in the
     training dictionary. filename should point to a CSV file with two columns: hash, col_idx.
@@ -31,7 +31,10 @@ def load_signature_with_ksize(filename, ksize):
     :return: sourmash signature
     """
     # Load the signature(s) with the given kmer size; exactly one is expected
-    return list(sourmash.load_file_as_signatures(filename, ksize=ksize))[0]
+    sketches = list(sourmash.load_file_as_signatures(filename, ksize=ksize))
+    if len(sketches) != 1:
+        raise ValueError(f"Expected exactly one signature with ksize {ksize} in {filename}, found {len(sketches)}")
+    return sketches[0] if sketches else 0
 
 
 def get_num_kmers(signature, scale=True):

diff --git a/tests/profiling/for_profiling.py b/tests/profiling/for_profiling.py
@@ -32,7 +32,7 @@
 import sourmash
 
 
-def load_hashes(filename):
+def load_hashes_to_index(filename):
     """
     Helper function that loads the hash_to_col_idx.csv file and returns a dictionary mapping hashes to indices in the
     training dictionary. filename should point to a CSV file with two columns: hash, col_idx.
@@ -124,7 +124,7 @@ def run_YACHT():
 
     # load the training data
     reference_matrix = load_npz(ref_matrix)
-    hash_to_idx = load_hashes(hash_to_idx_file)
+    hash_to_idx = load_hashes_to_index(hash_to_idx_file)
     organism_data = pd.read_csv(processed_org_file)
 
     # get the sample y vector (indexed by hash/k-mer, with entry = number of times k-mer appears in sample)

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,71 @@
+import subprocess
+from os.path import exists
+import os
+import numpy as np
+import pandas as pd
+# add the parent directory to the path
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from srcs import utils
+import sourmash
+
+
+def to_testing_data(file):
+    return os.path.join('tests', os.path.join("testdata", file))
+
+
+def test_load_hashes_to_index():
+     # the *hash_to_col_idx.pkl files contain a pickled list of key-value pairs: the keys are the hash values and
+     # the values are the index of the row in which they appear in the npz matrix
+     file = to_testing_data("integration_test_hash_to_col_idx.pkl")
+     hashes = utils.load_hashes_to_index(file)
+     assert type(hashes) == dict
+     assert len(hashes) == 63888
+     assert np.allclose(np.sort(list(hashes.values())), range(0, len(hashes)))
+
+
+def test_load_signature_with_ksize1():
+    # first, just try a *.sig file
+    file = to_testing_data("sample.sig")
+    sig = utils.load_signature_with_ksize(file, 31)
+    # right type?
+    assert type(sig) == sourmash.signature.FrozenSourmashSignature
+    # can we do a simple operation on it?
+    assert sig.jaccard(sig) == 1.0
+
+
+def test_load_signature_with_ksize2():
+    # wrong k-size
+    file = to_testing_data("sample.sig")
+    try:
+        sig = utils.load_signature_with_ksize(file, 21)
+    except ValueError:
+        pass
+    # wrong file type
+    file = to_testing_data("foobar")
+    try:
+        sig = utils.load_signature_with_ksize(file, 31)
+    except ValueError:
+        pass
+    # too many signatures in a single file
+    try:
+        sig = utils.load_signature_with_ksize(to_testing_data("20_genomes_sketches.zip"), 31)
+    except ValueError:
+        pass
+
+
+def test_load_signature_with_ksize3():
+    # same signature, but saved to and re-loaded from a compressed file
+    file = to_testing_data("sample.sig")
+    sig = utils.load_signature_with_ksize(file, 31)
+    sourmash.save_signatures([sig], open(to_testing_data('test.sig.zip'), 'wb'), compression=1)
+    sig = utils.load_signature_with_ksize(to_testing_data('test.sig.zip'), 31)
+    assert type(sig) == sourmash.signature.FrozenSourmashSignature
+    assert sig.jaccard(sig) == 1.0
+
+
+
+
+
+
+
diff --git a/tests/test_workflow.py b/tests/test_workflow.py
@@ -10,10 +10,11 @@ def test_full_workflow():
     :return: None
     """
     script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # currently one level above ./tests
-    data_dir = "testdata"
+    test_dir = os.path.join(script_dir, 'tests')
+    data_dir = os.path.join(test_dir, 'testdata')
     out_prefix = "integration_test"
     full_out_prefix = os.path.join(data_dir, out_prefix)
-    abundance_file = full_out_prefix + "recovered_abundance.csv"
+    abundance_file = full_out_prefix + "recovered_abundance.xlsx"
     reference_sketches = os.path.join(data_dir, "20_genomes_sketches.zip")
     sample_sketches = os.path.join(data_dir, "sample.sig")
     expected_files = map(lambda x: full_out_prefix + x, ["_hash_to_col_idx.csv", "_processed_org_idx.csv",
@@ -38,23 +39,17 @@ def test_full_workflow():
     # then do the abundance estimation
     if exists(abundance_file):
         os.remove(abundance_file)
-    cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --database_prefix " \
-          f"{full_out_prefix} --sample_file " \
-          f"{sample_sketches} --outfile {abundance_file}"
-    #print(cmd)
+    cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(script_dir, 'gtdb_ani_thresh_0.95_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 1 --outdir {data_dir} --out_filename {abundance_file}"
+    # print(cmd)
     res = subprocess.run(cmd, shell=True, check=True)
     # check that no errors were raised
     assert res.returncode == 0
     # check that the output file exists
     assert exists(abundance_file)
     # check that CP032507.1 is not reported in the output
-    df = pd.read_csv(abundance_file, sep=",", header=0)
+    df = pd.read_excel(abundance_file)
     present_organism = "CP032507.1 Ectothiorhodospiraceae bacterium BW-2 chromosome, complete genome"
-    # test if there are k-mers in common
-    assert df[df['organism_name'] == present_organism]["nontrivial_overlap"].values[0] == 1
     # there are not enough k-mers in common to claim presence, so the organism should have no row in the output
-    assert df[df['organism_name'] == present_organism]["in_sample_est"].values[0] == 0
-    # since the threshold was 706 k-mers
-    assert df[df['organism_name'] == present_organism]["acceptance_threshold_wo_coverage"].values[0] == 706
+    assert len(df[df['organism_name'] == present_organism]["in_sample_est"].values) == 0
     # (only 2 k-mers were observed in the sample), so no num_matches entry should be reported for it either
-    assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
+    assert len(df[df['organism_name'] == present_organism]["num_matches"].values) == 0
diff --git a/tests/testdata/integration_test_hash_to_col_idx.pkl b/tests/testdata/integration_test_hash_to_col_idx.pkl