From 598f0d75c1f8464c6e06ec51095cc35e2e25c1db Mon Sep 17 00:00:00 2001 From: npbhavya Date: Thu, 2 May 2024 09:56:54 +0930 Subject: [PATCH] update sphae --- misc/README.md | 16 ++++++++++++++ misc/num_cds.py | 36 +++++++++++++++++++++++++++++++ sphae/config/config.yaml | 4 ++-- sphae/workflow/envs/checkv.yaml | 2 +- sphae/workflow/envs/filtlong.yaml | 7 ------ sphae/workflow/envs/trimnami.yaml | 2 +- 6 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 misc/README.md create mode 100644 misc/num_cds.py delete mode 100644 sphae/workflow/envs/filtlong.yaml diff --git a/misc/README.md b/misc/README.md new file mode 100644 index 0000000..9b754ff --- /dev/null +++ b/misc/README.md @@ -0,0 +1,16 @@ +## Miscalleneous scripts + +This has a list of misc scripts that can be run on the sphae output + +- num_cds.py : Take a directory of genbank files to count the number of genes that have been assigned to as a gene + + `python temp.py -d sphae.output/RESULTS/gbk -o test.csv` + + Output test.csv + | Gene | File1 | File2 | File3 | + | ------------- | ------------- | + | DNA primase | 1 | 1 | 2 | + | tail protein | 2 | 1 | 1 | + | holin | 0 | 0 | 0 | + + \ No newline at end of file diff --git a/misc/num_cds.py b/misc/num_cds.py new file mode 100644 index 0000000..c17497d --- /dev/null +++ b/misc/num_cds.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +import argparse +import os +from collections import defaultdict +from Bio import SeqIO + +def extract_genes(genbank_directory): + genes_dict = defaultdict(lambda: defaultdict(int)) + file_names = [] + for file_name in os.listdir(genbank_directory): + if file_name.endswith(".gbk") or file_name.endswith(".gb"): + file_names.append(file_name) + file_path = os.path.join(genbank_directory, file_name) + for record in SeqIO.parse(file_path, "genbank"): + for feature in record.features: + if feature.type == "CDS": + gene_name = feature.qualifiers.get("product", ["Unknown_gene"])[0] + genes_dict[gene_name][file_name] += 1 + return genes_dict, file_names + +def write_to_csv(genes_dict, file_names, output_file): + with open(output_file, "w") as f: + # Write header + f.write("Gene," + ",".join(file_names) + "\n") + # Write gene names and the number of CDS in each file + for gene, counts_per_file in genes_dict.items(): + counts = [str(counts_per_file.get(file_name, 0)) for file_name in file_names] + f.write(f"{gene},{','.join(counts)}\n") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="A script that takes a directory containing genbank files and writes gene presence absence table") + parser.add_argument('-d', '--directory', dest='directory', help='Enter the directory containing the genbank files') + parser.add_argument('-o', dest='output', help='Enter the output tabular format') + results = parser.parse_args() + genes_dict, file_names = extract_genes(results.directory) + write_to_csv(genes_dict, file_names, results.output) diff --git a/sphae/config/config.yaml b/sphae/config/config.yaml index c425328..27ebca6 100644 --- a/sphae/config/config.yaml +++ b/sphae/config/config.yaml @@ -11,7 +11,7 @@ resources: trimnami: qc: subsample: - --bases 100M + --bases 1000M host: params: @@ -28,4 +28,4 @@ args: db: pfam: "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/" pfam_file: 'Pfam35.0/Pfam-A.hmm.gz' - models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz" \ No newline at end of file + models: "https://zenodo.org/record/8198288/files/phynteny_models_v0.1.11.tar.gz" diff --git a/sphae/workflow/envs/checkv.yaml b/sphae/workflow/envs/checkv.yaml index 0475c23..f27f467 100644 --- a/sphae/workflow/envs/checkv.yaml +++ b/sphae/workflow/envs/checkv.yaml @@ -5,4 +5,4 @@ channels: - defaults dependencies: - checkv >= 1.0.1 - - diamond >=2.1.8 \ No newline at end of file + - diamond==2.1.8 \ No newline at end of file diff --git a/sphae/workflow/envs/filtlong.yaml b/sphae/workflow/envs/filtlong.yaml deleted file mode 100644 index 1b42f3b..0000000 --- a/sphae/workflow/envs/filtlong.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: filtlong -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - filtlong>=0.2.0 diff --git a/sphae/workflow/envs/trimnami.yaml b/sphae/workflow/envs/trimnami.yaml index 68d2fcb..822df8d 100644 --- a/sphae/workflow/envs/trimnami.yaml +++ b/sphae/workflow/envs/trimnami.yaml @@ -6,4 +6,4 @@ dependencies: - mamba - pip - pip: - - trimnami==0.1.1 \ No newline at end of file + - trimnami>=0.1.1 \ No newline at end of file