Skip to content

Commit

Permalink
Updating unit tests, adding to pipeline (#24)
Browse files Browse the repository at this point in the history
* Updating unit tests, adding to pipeline
* fixing tests, deleting unused files

---------

Co-authored-by: David Koslicki <[email protected]>
  • Loading branch information
mfl15 and dkoslicki authored Oct 3, 2023
1 parent 271bbf8 commit 47bf00a
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 20 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/runTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,10 @@ jobs:
- name: make training data
run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --out_prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95
- name: run YACHT
run: python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'tests/testdata/sample.sig' --significance 0.99 --min_coverage 1 --outdir './'
run: python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'tests/testdata/sample.sig' --significance 0.99 --min_coverage 1 --outdir './'
- name: test-unit
run: pytest tests/test_unit.py
- name: test-utils
run: pytest tests/test_utils.py
- name: test-workflow
run: pytest tests/test_workflow.py
2 changes: 1 addition & 1 deletion depreciated/old_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def sample_vector_from_files(sig_filename, hash_filename, ksize):
:return: numpy vector (sample vector y)
"""
sample_sig = utils.load_signature_with_ksize(sig_filename, ksize)
hash_to_idx = utils.load_hashes(hash_filename)
hash_to_idx = utils.load_hashes_to_index(hash_filename)
sample_vector, num_hash_diff_unique, num_hash_diff_total = sample_vector_from_signature(sample_sig, hash_to_idx)
return sample_vector, sample_sig, num_hash_diff_unique, num_hash_diff_total

Expand Down
2 changes: 1 addition & 1 deletion run_YACHT.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
# load the training data
logger.info('Loading reference matrix, hash to index dictionary, and organism data.')
reference_matrix = load_npz(reference_matrix_path)
hash_to_idx = utils.load_hashes(hash_to_idx_path)
hash_to_idx = utils.load_hashes_to_index(hash_to_idx_path)
organism_data = pd.read_csv(processed_org_file_path)

logger.info('Loading sample signature.')
Expand Down
7 changes: 5 additions & 2 deletions srcs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger.remove()
logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

def load_hashes(filename):
def load_hashes_to_index(filename):
"""
Helper function that loads the hash_to_col_idx.pkl file and returns a dictionary mapping hashes to indices in the
training dictionary. filename should point to a CSV file with two columns: hash, col_idx.
Expand All @@ -31,7 +31,10 @@ def load_signature_with_ksize(filename, ksize):
:return: sourmash signature
"""
# Take the first sample signature with the given kmer size
return list(sourmash.load_file_as_signatures(filename, ksize=ksize))[0]
sketches = list(sourmash.load_file_as_signatures(filename, ksize=ksize))
if len(sketches) != 1:
raise ValueError(f"Expected exactly one signature with ksize {ksize} in {filename}, found {len(sketches)}")
return sketches[0] if sketches else 0


def get_num_kmers(signature, scale=True):
Expand Down
4 changes: 2 additions & 2 deletions tests/profiling/for_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import sourmash


def load_hashes(filename):
def load_hashes_to_index(filename):
"""
Helper function that loads the hash_to_col_idx.csv file and returns a dictionary mapping hashes to indices in the
training dictionary. filename should point to a CSV file with two columns: hash, col_idx.
Expand Down Expand Up @@ -124,7 +124,7 @@ def run_YACHT():

# load the training data
reference_matrix = load_npz(ref_matrix)
hash_to_idx = load_hashes(hash_to_idx_file)
hash_to_idx = load_hashes_to_index(hash_to_idx_file)
organism_data = pd.read_csv(processed_org_file)

# get the sample y vector (indexed by hash/k-mer, with entry = number of times k-mer appears in sample)
Expand Down
71 changes: 71 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import subprocess
from os.path import exists
import os
import numpy as np
import pandas as pd
# add the parent directory to the path
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from srcs import utils
import sourmash


def to_testing_data(file):
    """
    Build the path of a fixture file that lives under tests/testdata.
    :param file: name of the file inside the tests/testdata directory
    :return: the joined path 'tests/testdata/<file>'; it is relative, so it resolves against the current
    working directory rather than against the location of this module
    """
    testdata_dir = os.path.join('tests', 'testdata')
    return os.path.join(testdata_dir, file)


def test_load_hashes_to_index():
    """
    Check that utils.load_hashes_to_index returns a plain dict of the expected size whose values, once sorted,
    match 0, 1, ..., len - 1.
    :return: None
    """
    # NOTE(review): the fixture is described as a pickled collection of (hash, index) pairs, the index being the
    # position of that hash in the npz reference matrix -- confirm against the training-data script.
    pkl_path = to_testing_data("integration_test_hash_to_col_idx.pkl")
    hash_to_idx = utils.load_hashes_to_index(pkl_path)
    # exact type check: the loader must hand back a dict itself
    assert type(hash_to_idx) is dict
    n_hashes = len(hash_to_idx)
    assert n_hashes == 63888
    # the indices should cover 0..n_hashes-1 with no gaps or repeats
    sorted_indices = np.sort(list(hash_to_idx.values()))
    assert np.allclose(sorted_indices, range(n_hashes))


def test_load_signature_with_ksize1():
    """
    Load the ksize-31 signature from a plain *.sig file and sanity-check what comes back.
    :return: None
    """
    sig_path = to_testing_data("sample.sig")
    loaded = utils.load_signature_with_ksize(sig_path, 31)
    # the loader must return exactly a frozen sourmash signature
    assert type(loaded) is sourmash.signature.FrozenSourmashSignature
    # smoke test that the object is usable: a signature compared with itself has Jaccard index 1
    self_similarity = loaded.jaccard(loaded)
    assert self_similarity == 1.0


def test_load_signature_with_ksize2():
    """
    Check that utils.load_signature_with_ksize raises ValueError for inputs it cannot resolve to a single
    signature.

    Each case must actually raise: a bare ``try/except ValueError: pass`` would also pass when no exception
    is raised at all, so a missing exception is turned into an explicit test failure here.
    :return: None
    """
    failing_cases = [
        # (fixture name, ksize, why loading is expected to fail)
        ("sample.sig", 21, "wrong k-size"),
        ("foobar", 31, "wrong file type"),
        ("20_genomes_sketches.zip", 31, "too many signatures"),
    ]
    for fixture, ksize, reason in failing_cases:
        path = to_testing_data(fixture)
        try:
            utils.load_signature_with_ksize(path, ksize)
        except ValueError:
            # expected outcome; any other exception type propagates and fails the test
            continue
        raise AssertionError(f"Expected ValueError ({reason}) when loading {path} with ksize {ksize}")


def test_load_signature_with_ksize3():
    """
    Round-trip a signature through a compressed file and check it can be loaded again.

    The ksize-31 signature from sample.sig is re-saved with compression enabled to tests/testdata/test.sig.zip
    and then read back with utils.load_signature_with_ksize.
    :return: None
    """
    sig = utils.load_signature_with_ksize(to_testing_data("sample.sig"), 31)
    resaved_path = to_testing_data('test.sig.zip')
    # Use a context manager so the handle is flushed and closed before the file is read back; otherwise the
    # read depends on the interpreter releasing the file object promptly.
    # NOTE(review): despite the .zip suffix, compression=1 presumably produces gzip-compressed JSON -- confirm
    # against the sourmash documentation.
    with open(resaved_path, 'wb') as fp:
        sourmash.save_signatures([sig], fp, compression=1)
    reloaded = utils.load_signature_with_ksize(resaved_path, 31)
    assert isinstance(reloaded, sourmash.signature.FrozenSourmashSignature)
    # smoke test that the reloaded object is usable: a signature compared with itself has Jaccard index 1
    assert reloaded.jaccard(reloaded) == 1.0







21 changes: 8 additions & 13 deletions tests/test_workflow.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ def test_full_workflow():
:return: None
"""
script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # currently one level above ./tests
data_dir = "testdata"
test_dir = os.path.join(script_dir, 'tests')
data_dir = os.path.join(test_dir, 'testdata')
out_prefix = "integration_test"
full_out_prefix = os.path.join(data_dir, out_prefix)
abundance_file = full_out_prefix + "recovered_abundance.csv"
abundance_file = full_out_prefix + "recovered_abundance.xlsx"
reference_sketches = os.path.join(data_dir, "20_genomes_sketches.zip")
sample_sketches = os.path.join(data_dir, "sample.sig")
expected_files = map(lambda x: full_out_prefix + x, ["_hash_to_col_idx.csv", "_processed_org_idx.csv",
Expand All @@ -38,23 +39,17 @@ def test_full_workflow():
# then do the abundance estimation
if exists(abundance_file):
os.remove(abundance_file)
cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --database_prefix " \
f"{full_out_prefix} --sample_file " \
f"{sample_sketches} --outfile {abundance_file}"
#print(cmd)
cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(script_dir, 'gtdb_ani_thresh_0.95_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 1 --outdir {data_dir} --out_filename {abundance_file}"
# print(cmd)
res = subprocess.run(cmd, shell=True, check=True)
# check that no errors were raised
assert res.returncode == 0
# check that the output file exists
assert exists(abundance_file)
# check if CP032507.1 has correct abundance of 6
df = pd.read_csv(abundance_file, sep=",", header=0)
df = pd.read_excel(abundance_file)
present_organism = "CP032507.1 Ectothiorhodospiraceae bacterium BW-2 chromosome, complete genome"
# test if there are k-mers in common
assert df[df['organism_name'] == present_organism]["nontrivial_overlap"].values[0] == 1
# but not enough to claim presence
assert df[df['organism_name'] == present_organism]["in_sample_est"].values[0] == 0
# since the threshold was 706 k-mers
assert df[df['organism_name'] == present_organism]["acceptance_threshold_wo_coverage"].values[0] == 706
assert len(df[df['organism_name'] == present_organism]["in_sample_est"].values) == 0
# and we only observed 2 k-mers in the sample
assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
assert len(df[df['organism_name'] == present_organism]["num_matches"].values) == 0
Binary file not shown.

0 comments on commit 47bf00a

Please sign in to comment.