update sphae

linsalrob · May 2, 2024 · 598f0d7 · 598f0d7
1 parent 2dc1828
commit 598f0d7
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 11 deletions.
diff --git a/misc/README.md b/misc/README.md
@@ -0,0 +1,16 @@
+## Miscalleneous scripts
+
+This has a list of misc scripts that can be run on the sphae output 
+
+- num_cds.py : Take a directory of genbank files to count the number of genes that have been assigned to as a gene
+
+    `python temp.py -d sphae.output/RESULTS/gbk -o test.csv`
+
+    Output test.csv 
+    | Gene  | File1 | File2 | File3 |
+    | ------------- | ------------- |
+    | DNA primase  | 1 | 1 | 2 |
+    | tail protein | 2 | 1 | 1 |
+    | holin | 0 | 0 | 0 |
+
+
diff --git a/misc/num_cds.py b/misc/num_cds.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import os
+from collections import defaultdict
+from Bio import SeqIO
+
+def extract_genes(genbank_directory):
+    genes_dict = defaultdict(lambda: defaultdict(int))
+    file_names = []
+    for file_name in os.listdir(genbank_directory):
+        if file_name.endswith(".gbk") or file_name.endswith(".gb"):
+            file_names.append(file_name)
+            file_path = os.path.join(genbank_directory, file_name)
+            for record in SeqIO.parse(file_path, "genbank"):
+                for feature in record.features:
+                    if feature.type == "CDS":
+                        gene_name = feature.qualifiers.get("product", ["Unknown_gene"])[0]
+                        genes_dict[gene_name][file_name] += 1
+    return genes_dict, file_names
+
+def write_to_csv(genes_dict, file_names, output_file):
+    with open(output_file, "w") as f:
+        # Write header
+        f.write("Gene," + ",".join(file_names) + "\n")
+        # Write gene names and the number of CDS in each file
+        for gene, counts_per_file in genes_dict.items():
+            counts = [str(counts_per_file.get(file_name, 0)) for file_name in file_names]
+            f.write(f"{gene},{','.join(counts)}\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="A script that takes a directory containing genbank files and writes gene presence absence table")
+    parser.add_argument('-d', '--directory', dest='directory', help='Enter the directory containing the genbank files')
+    parser.add_argument('-o', dest='output', help='Enter the output tabular format')
+    results = parser.parse_args()
+    genes_dict, file_names = extract_genes(results.directory)
+    write_to_csv(genes_dict, file_names, results.output)
diff --git a/sphae/config/config.yaml b/sphae/config/config.yaml
@@ -11,7 +11,7 @@ resources:
 trimnami:
   qc:
     subsample:
-      --bases 100M
+      --bases 1000M
   host: 
 
 params:
@@ -28,4 +28,4 @@ args:
 db:
   pfam: "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/"
   pfam_file: 'Pfam35.0/Pfam-A.hmm.gz'
-  models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz"
+  models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz"
diff --git a/sphae/workflow/envs/checkv.yaml b/sphae/workflow/envs/checkv.yaml
@@ -5,4 +5,4 @@ channels:
     - defaults
 dependencies:
     - checkv >= 1.0.1
-    - diamond >=2.1.8
+    - diamond==2.1.8
diff --git a/sphae/workflow/envs/filtlong.yaml b/sphae/workflow/envs/filtlong.yaml
diff --git a/sphae/workflow/envs/trimnami.yaml b/sphae/workflow/envs/trimnami.yaml
@@ -6,4 +6,4 @@ dependencies:
     - mamba
     - pip
     - pip:
-        - trimnami==0.1.1
+        - trimnami>=0.1.1
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,4 +6,4 @@ dependencies: @@
         - mamba
         - pip
         - pip:
-            - trimnami==0.1.1
+            - trimnami>=0.1.1