Merge pull request #41 from KoslickiLab/dmk
Fixing issues
dkoslicki authored Oct 19, 2023
2 parents d0f82c8 + 4127d95 commit edc6ba5
Showing 5 changed files with 55 additions and 76 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/runTest.yml
@@ -13,9 +13,9 @@ jobs:
 activate-environment: yacht_env
 environment-file: env/yacht_env.yml
 - name: make training data
-run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./
+run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
 - name: run YACHT
-run: python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1 --outdir ./
+run: python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
 - name: test-unit
 run: pytest tests/test_unit.py
 - name: test-utils
6 changes: 3 additions & 3 deletions README.md
@@ -18,13 +18,13 @@ cd demo
 
 # build k-mer sketches for the query sample and ref genomes
 sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq
-sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip
+sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists
 
 # preprocess the reference genomes (training step)
-python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./
+python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
 
 # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step)
-python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --outdir ./
+python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx
 
 # convert result to CAMI profile format (Optional)
 python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
6 changes: 4 additions & 2 deletions make_training_data_from_sketches.py
@@ -25,6 +25,7 @@
 required=False, default=0.95)
 parser.add_argument('--prefix', help='Prefix for this experiment.', required=False, default='yacht')
 parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd())
+parser.add_argument('--force', action='store_true', help='Overwrite the output directory if it exists')
 args = parser.parse_args()
 
 # get the arguments
@@ -34,6 +35,7 @@
 ani_thresh = args.ani_thresh
 prefix = args.prefix
 outdir = str(Path(args.outdir).absolute())
+force = args.force
 
 # make sure reference database file exist and valid
 logger.info("Checking reference database file")
@@ -44,9 +46,9 @@
 # Create a temporary directory with time info as label
 logger.info("Creating a temporary directory")
 path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files')
-if os.path.exists(path_to_temp_dir):
+if os.path.exists(path_to_temp_dir) and not force:
 raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.")
-os.makedirs(path_to_temp_dir)
+os.makedirs(path_to_temp_dir, exist_ok=True)
 
 # unzip the sourmash signature file to the temporary directory
 logger.info("Unzipping the sourmash signature file to the temporary directory")
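The practical effect of the new --force flag: a re-run of the training step no longer aborts when the intermediate directory from an earlier run is still on disk. A rough illustration, reusing the demo file names from the README diff above (a sketch for orientation, not commands from this commit):

# Second run without --force: aborts because demo_ani_thresh_0.95_intermediate_files/ already exists
python make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./

# Second run with --force: the existing intermediate directory is reused/overwritten and training proceeds
python make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force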
26 changes: 20 additions & 6 deletions run_YACHT.py
@@ -9,6 +9,7 @@
 import srcs.utils as utils
 import json
 import warnings
+import zipfile
 warnings.filterwarnings("ignore")
 from tqdm import tqdm
 from loguru import logger
@@ -31,8 +32,7 @@
 'Each value should be between 0 and 1, with 0 being the most sensitive (and least '
 'precise) and 1 being the most precise (and least sensitive).',
 required=False, default=[1, 0.5, 0.1, 0.05, 0.01])
-parser.add_argument('--out_filename', help='output filename', required=False, default='result.xlsx')
-parser.add_argument('--outdir', help='path to output directory', required=True)
+parser.add_argument('--out_filename', help='Full path of output filename', required=False, default='result.xlsx')
 
 # parse the arguments
 args = parser.parse_args()
@@ -44,7 +44,6 @@
 show_all = args.show_all # Show all organisms (no matter if present) in output file.
 min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample.
 out_filename = args.out_filename # output filename
-outdir = args.outdir # csv destination for results
 
 # check if the json file exists
 utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. '
@@ -57,6 +56,10 @@
 ksize = config['ksize']
 ani_thresh = config['ani_thresh']
 
+# Make sure the output can be written to
+if not os.access(os.path.abspath(os.path.dirname(out_filename)), os.W_OK):
+raise FileNotFoundError(f"Cannot write to the location: {os.path.abspath(os.path.dirname(out_filename))}.")
+
 # check if min_coverage is between 0 and 1
 for x in min_coverage_list:
 if not (0 <= x <= 1):
@@ -70,9 +73,20 @@
 logger.info('Loading the manifest file generated from the training data.')
 manifest = pd.read_csv(manifest_file_path, sep='\t', header=0)
 
+# check if there is a manifest in the sample sig file
+with zipfile.ZipFile(sample_file, 'r') as zip_file:
+if 'SOURMASH-MANIFEST.csv' not in zip_file.namelist():
+raise FileNotFoundError(f'The input file {sample_file} appears to be missing a manifest associated with it. '
+f'Try running: sourmash sig merge {sample_file} -o <new signature with the manifest present>. '
+f'And then run YACHT using the output of that command.')
+
 # load sample signature and its signature info
 logger.info('Loading sample signature and its signature info.')
-sample_sig = utils.load_signature_with_ksize(sample_file, ksize)
+try:
+sample_sig = utils.load_signature_with_ksize(sample_file, ksize)
+except ValueError:
+raise ValueError(f'Expected exactly one signature with ksize {ksize} in {sample_file}, found {len(sample_file)}. '
+f'Likely you will need to do something like: sourmash sig merge {sample_file} -o <new signature with just one sketch in it>.')
 sample_sig_info = utils.get_info_from_single_sig(sample_file, ksize)
 
 # add sample signature info to the manifest
@@ -112,9 +126,9 @@
 manifest_list = temp_manifest_list
 
 # save the results into Excel file
-logger.info(f'Saving results to {outdir}.')
+logger.info(f'Saving results to {os.path.dirname(out_filename)}.')
 # save the results with different min_coverage
-with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer:
+with pd.ExcelWriter(out_filename, engine='openpyxl', mode='w') as writer:
 # save the raw results (i.e., min_coverage=1.0)
 if keep_raw:
 temp_mainifest = manifest_list[0].copy()
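Taken together, run_YACHT.py now writes its Excel report to the path given by --out_filename (the --outdir option is removed), rejects a sample zip that lacks a SOURMASH-MANIFEST.csv, and gives a clearer error when the zip does not contain exactly one sketch at the requested ksize. A rough remediation sequence along the lines of the new error messages (file names here are placeholders, not from this commit):

# Re-export the sample so the zip gains a manifest and a single merged sketch
sourmash sig merge sample.sig.zip -o sample_fixed.sig.zip

# Re-run YACHT, pointing --out_filename at the report path instead of passing an output directory
python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file sample_fixed.sig.zip --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx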
89 changes: 26 additions & 63 deletions tests/test_workflow.py
@@ -24,36 +24,6 @@ def test_full_workflow():
 "training_sig_files.txt"])))
 # one of the signature files
 expected_files.extend(list(map(lambda x: os.path.join(data_dir, intermediate_dir, "signatures", x), ["04212e93c2172d4df49dc5d8c2973d8b.sig.gz"])))
-# In testdata/
-# 20_genomes_trained_config.json
-# 20_genomes_trained_processed_manifest.tsv
-# 20_genomes_trained_rep_to_corr_orgas_mapping.tsv
-# /testdata/20_genomes_trained_intermediate_files$ tree .
-# .
-# ├── signatures
-# │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz
-# │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz
-# │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz
-# │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz
-# │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz
-# │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz
-# │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz
-# │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz
-# │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz
-# │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz
-# │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz
-# │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz
-# │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz
-# │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz
-# │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz
-# │   ├── b691deddf179ead0a006527330d86dde.sig.gz
-# │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz
-# │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz
-# │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz
-# │   └── ce54d962851b0fdeefc624300036a133.sig.gz
-# ├── SOURMASH-MANIFEST.csv
-# ├── training_multisearch_result.csv
-# └── training_sig_files.txt
 # remove the files if they exist
 for f in expected_files:
 if exists(f):
@@ -75,39 +45,8 @@
 # then do the presence/absence estimation
 if exists(abundance_file):
 os.remove(abundance_file)
-# python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/
-cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_file {abundance_file} --show_all"
-print(cmd)
-# ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/
-# 20_genomes_trained_intermediate_files/
-# ├── organism_sig_file.txt # <-- new
-# ├── sample_multisearch_result.csv
-# ├── sample_sig_file.txt
-# ├── signatures
-# │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz
-# │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz
-# │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz
-# │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz
-# │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz
-# │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz
-# │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz
-# │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz
-# │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz
-# │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz
-# │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz
-# │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz
-# │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz
-# │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz
-# │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz
-# │   ├── b691deddf179ead0a006527330d86dde.sig.gz
-# │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz
-# │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz
-# │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz
-# │   └── ce54d962851b0fdeefc624300036a133.sig.gz
-# ├── SOURMASH-MANIFEST.csv
-# ├── training_multisearch_result.csv
-# └── training_sig_files.txt
-# print(cmd)
+# python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx
+cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all"
 res = subprocess.run(cmd, shell=True, check=True)
 # check that no errors were raised
 assert res.returncode == 0
@@ -122,3 +61,27 @@
 assert df[df['organism_name'] == present_organism]["num_matches"].values[0] == 2
 # and the threshold was 706
 assert df[df['organism_name'] == present_organism]["acceptance_threshold_with_coverage"].values[0] == 706
+
+
+def test_incorrect_workflow1():
+script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+demo_dir = os.path.join(script_dir, "demo")
+cmd = f"python run_YACHT.py --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
+res = subprocess.run(cmd, shell=True, check=False)
+# this should fail
+assert res.returncode == 1
+
+
+def test_demo_workflow():
+cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx"
+_ = subprocess.run(cmd, shell=True, check=True)
+cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
+_ = subprocess.run(cmd, shell=True, check=True)
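The two new tests above cover the failure path (feeding the reference sketch zip in as the sample should make run_YACHT.py exit with return code 1) and the full demo pipeline end to end. To exercise them locally alongside the existing unit tests, something like the following should work from the repository root (mirroring the CI steps above):

pytest tests/test_unit.py
pytest tests/test_workflow.py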

