From 598f0d75c1f8464c6e06ec51095cc35e2e25c1db Mon Sep 17 00:00:00 2001
From: npbhavya <npbhavya@gmail.com>
Date: Thu, 2 May 2024 09:56:54 +0930
Subject: [PATCH] update sphae

---
 misc/README.md                    | 16 ++++++++++++++
 misc/num_cds.py                   | 36 +++++++++++++++++++++++++++++++
 sphae/config/config.yaml          |  4 ++--
 sphae/workflow/envs/checkv.yaml   |  2 +-
 sphae/workflow/envs/filtlong.yaml |  7 ------
 sphae/workflow/envs/trimnami.yaml |  2 +-
 6 files changed, 56 insertions(+), 11 deletions(-)
 create mode 100644 misc/README.md
 create mode 100644 misc/num_cds.py
 delete mode 100644 sphae/workflow/envs/filtlong.yaml

diff --git a/misc/README.md b/misc/README.md
new file mode 100644
index 0000000..9b754ff
--- /dev/null
+++ b/misc/README.md
@@ -0,0 +1,16 @@
+## Miscalleneous scripts
+
+This has a list of misc scripts that can be run on the sphae output 
+
+- num_cds.py : Take a directory of genbank files to count the number of genes that have been assigned to as a gene
+  
+    `python temp.py -d sphae.output/RESULTS/gbk -o test.csv`
+
+    Output test.csv 
+    | Gene  | File1 | File2 | File3 |
+    | ------------- | ------------- |
+    | DNA primase  | 1 | 1 | 2 |
+    | tail protein | 2 | 1 | 1 |
+    | holin | 0 | 0 | 0 |
+
+    
\ No newline at end of file
diff --git a/misc/num_cds.py b/misc/num_cds.py
new file mode 100644
index 0000000..c17497d
--- /dev/null
+++ b/misc/num_cds.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import os
+from collections import defaultdict
+from Bio import SeqIO
+
+def extract_genes(genbank_directory):
+    genes_dict = defaultdict(lambda: defaultdict(int))
+    file_names = []
+    for file_name in os.listdir(genbank_directory):
+        if file_name.endswith(".gbk") or file_name.endswith(".gb"):
+            file_names.append(file_name)
+            file_path = os.path.join(genbank_directory, file_name)
+            for record in SeqIO.parse(file_path, "genbank"):
+                for feature in record.features:
+                    if feature.type == "CDS":
+                        gene_name = feature.qualifiers.get("product", ["Unknown_gene"])[0]
+                        genes_dict[gene_name][file_name] += 1
+    return genes_dict, file_names
+
+def write_to_csv(genes_dict, file_names, output_file):
+    with open(output_file, "w") as f:
+        # Write header
+        f.write("Gene," + ",".join(file_names) + "\n")
+        # Write gene names and the number of CDS in each file
+        for gene, counts_per_file in genes_dict.items():
+            counts = [str(counts_per_file.get(file_name, 0)) for file_name in file_names]
+            f.write(f"{gene},{','.join(counts)}\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="A script that takes a directory containing genbank files and writes gene presence absence table")
+    parser.add_argument('-d', '--directory', dest='directory', help='Enter the directory containing the genbank files')
+    parser.add_argument('-o', dest='output', help='Enter the output tabular format')
+    results = parser.parse_args()
+    genes_dict, file_names = extract_genes(results.directory)
+    write_to_csv(genes_dict, file_names, results.output)
diff --git a/sphae/config/config.yaml b/sphae/config/config.yaml
index c425328..27ebca6 100644
--- a/sphae/config/config.yaml
+++ b/sphae/config/config.yaml
@@ -11,7 +11,7 @@ resources:
 trimnami:
   qc:
     subsample:
-      --bases 100M
+      --bases 1000M
   host: 
 
 params:
@@ -28,4 +28,4 @@ args:
 db:
   pfam: "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/"
   pfam_file: 'Pfam35.0/Pfam-A.hmm.gz'
-  models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz"
\ No newline at end of file
+  models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz"
diff --git a/sphae/workflow/envs/checkv.yaml b/sphae/workflow/envs/checkv.yaml
index 0475c23..f27f467 100644
--- a/sphae/workflow/envs/checkv.yaml
+++ b/sphae/workflow/envs/checkv.yaml
@@ -5,4 +5,4 @@ channels:
     - defaults
 dependencies:
     - checkv >= 1.0.1
-    - diamond >=2.1.8
\ No newline at end of file
+    - diamond==2.1.8
\ No newline at end of file
diff --git a/sphae/workflow/envs/filtlong.yaml b/sphae/workflow/envs/filtlong.yaml
deleted file mode 100644
index 1b42f3b..0000000
--- a/sphae/workflow/envs/filtlong.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-name: filtlong
-channels:
-    - conda-forge
-    - bioconda
-    - defaults
-dependencies:
-    - filtlong>=0.2.0
diff --git a/sphae/workflow/envs/trimnami.yaml b/sphae/workflow/envs/trimnami.yaml
index 68d2fcb..822df8d 100644
--- a/sphae/workflow/envs/trimnami.yaml
+++ b/sphae/workflow/envs/trimnami.yaml
@@ -6,4 +6,4 @@ dependencies:
     - mamba
     - pip
     - pip:
-        - trimnami==0.1.1
\ No newline at end of file
+        - trimnami>=0.1.1
\ No newline at end of file