Merge branch 'main' of https://github.com/KoslickiLab/YACHT

KoslickiLab · Oct 4, 2023 · b21ff7e · b21ff7e
2 parents e8af352 + 47bf00a
commit b21ff7e
Show file tree

Hide file tree

Showing 12 changed files with 642 additions and 81 deletions.
diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
@@ -0,0 +1,24 @@
+name: runTest
+on: [push]
+jobs:
+  test-python-app:
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+    - uses: actions/checkout@v4
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        activate-environment: yacht
+        environment-file: env/yacht_env.yaml
+    - name: make training data
+      run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --out_prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95
+    - name: run YACHT
+      run: python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'tests/testdata/sample.sig' --significance 0.99 --min_coverage 1 --outdir './'
+    - name: test-unit
+      run: pytest tests/test_unit.py
+    - name: test-utils
+      run: pytest tests/test_utils.py
+    - name: test-workflow
+      run: pytest tests/test_workflow.py
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip <input FASTA/
 ```
 
 ### Creating a reference dictionary matrix
-The script `make_training_data_from_sketches.py` collects and transforms the sketched microbial genomes, getting them into a form usable by YACHT. In particular, it removes one of any two organisms that are withing the ANI threshold the user specifies as making two organisms "indistinguishable".
+The script `make_training_data_from_sketches.py` collects and transforms the sketched microbial genomes, getting them into a form usable by YACHT. In particular, it removes one of any two organisms that are within the ANI threshold the user specifies as making two organisms "indistinguishable".
 ```bash 
 python make_training_data_from_sketches.py --ref_file 'gtdb-rs214-reps.k31.zip' --ksize 31 --out_prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95
 ```
@@ -77,3 +77,9 @@ Other interesting columns include:
 * `num_matches`: How many k-mers were found in this organism and the sample
 * `acceptance_threshold_*`: How many k-mers must be found in this organism to be considered "present" at the given ANI threshold. Hence, `in_sample_est` is 1 if `num_matches` >= `acceptance_threshold_*` (adjusting by coverage if desired).
 * `alt_confidence_mut_rate`: What the mutation rate (1-ANI) would need to be to get your false positive to match the false negative rate of 1-`significance`.
+
+### Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraPhlAn)
+After you get the Excel result file from `run_YACHT.py`, you can run `standardize_yacht_output.py` to convert the YACHT result to other popular output formats. Currently, only the `cami`, `biom`, and `graphplan` modes are supported.
+```bash
+python srcs/standardize_yacht_output.py --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir './'
+```
diff --git a/depreciated/old_code.py b/depreciated/old_code.py
@@ -70,7 +70,7 @@ def sample_vector_from_files(sig_filename, hash_filename, ksize):
     :return: numpy vector (sample vector y)
     """
     sample_sig = utils.load_signature_with_ksize(sig_filename, ksize)
-    hash_to_idx = utils.load_hashes(hash_filename)
+    hash_to_idx = utils.load_hashes_to_index(hash_filename)
     sample_vector, num_hash_diff_unique, num_hash_diff_total = sample_vector_from_signature(sample_sig, hash_to_idx)
     return sample_vector, sample_sig, num_hash_diff_unique, num_hash_diff_total
 

diff --git a/env/yacht_env.yaml b/env/yacht_env.yaml
@@ -13,5 +13,6 @@ dependencies:
   - pytest==7.4.0
   - loguru=0.7.1
   - tqdm=4.65.0
+  - biom-format
   - pip:
     - openpyxl
diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py
@@ -32,18 +32,17 @@
     # load the signatures
     logger.info(f"Loading signatures from {ref_file}")
     signatures = sourmash.load_file_as_signatures(ref_file)
+    signature_count = utils.count_files_in_zip(ref_file) - 1
 
+    # DONE: do signature size checking, converting to a sourmash list, and generating the reference matrix at the same time
     # check that all signatures have the same ksize as the one provided
     # signatures_mismatch_ksize return False (if all signatures have the same kmer size)
     # or True (the first signature with a different kmer size)
-    # covert to sourmash list and check the ksize at the same time
-    signatures, is_mismatch = utils.signatures_mismatch_ksize(signatures, ksize)
-    if is_mismatch:
-        raise ValueError(f"Not all signatures from sourmash signature file {ref_file} have the given ksize {ksize}")
-
     # convert signatures to reference matrix (rows are hashes/kmers, columns are organisms)
     logger.info("Converting signatures to reference matrix")
-    ref_matrix, hashes = utils.signatures_to_ref_matrix(signatures)
+    signatures, ref_matrix, hashes, is_mismatch = utils.signatures_to_ref_matrix(signatures, ksize, signature_count)
+    if is_mismatch:
+        raise ValueError(f"Not all signatures from sourmash signature file {ref_file} have the given ksize {ksize}")
 
     # remove 'same' organisms: any organisms with ANI > ani_thresh are considered the same organism
     logger.info("Removing 'same' organisms with ANI > ani_thresh")

diff --git a/run_YACHT.py b/run_YACHT.py
@@ -71,7 +71,7 @@
     # load the training data
     logger.info('Loading reference matrix, hash to index dictionary, and organism data.')
     reference_matrix = load_npz(reference_matrix_path)
-    hash_to_idx = utils.load_hashes(hash_to_idx_path)
+    hash_to_idx = utils.load_hashes_to_index(hash_to_idx_path)
     organism_data = pd.read_csv(processed_org_file_path)
 
     logger.info('Loading sample signature.')
@@ -146,4 +146,3 @@
             if not show_all:
                 temp_output_result = temp_output_result[temp_output_result['in_sample_est'] == True]
             temp_output_result.to_excel(writer, sheet_name=f'min_coverage{min_coverage}', index=False)
-