From 448ac928ede5d6e0ffe22abde603b2c62fe75bfb Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Wed, 20 Mar 2024 10:19:51 +1030 Subject: [PATCH 01/11] fix issue with pdb input and header has colon --- src/phold/features/create_foldseek_db.py | 9 +++++++-- tests/test_integration.py | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/phold/features/create_foldseek_db.py b/src/phold/features/create_foldseek_db.py index f93246f..ae84edd 100644 --- a/src/phold/features/create_foldseek_db.py +++ b/src/phold/features/create_foldseek_db.py @@ -176,9 +176,14 @@ def generate_foldseek_db_from_pdbs( no_pdb_cds_ids = [] for id in sequences_aa.keys(): - cds_id = id.split(":")[1] - # record_id = id.split(":")[0] + # cds_id = id.split(":")[1] + + # in case the header has a colon in it - this will cause a bug if so + cds_id = id.split(":")[1:] + cds_id = ":".join(cds_id).strip() + + # record_id = id.split(":")[0] # this is potentially an issue if a contig has > 9999 AAs # need to fix with Pharokka possibly. Unlikely to occur but might! # enforce names as '{cds_id}.pdb' diff --git a/tests/test_integration.py b/tests/test_integration.py index 56a81c6..c00ac29 100755 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,7 +11,6 @@ pytest --run_remote --gpu_available . # to run with 8 threads - pytest --run_remote --gpu_available --threads 8 . """ From 18d0e428021cc68f46d812d0c9f332e0e6ac64fe Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Wed, 20 Mar 2024 11:21:03 +1030 Subject: [PATCH 02/11] filter pdb --- src/phold/__init__.py | 9 ++++++++- src/phold/features/create_foldseek_db.py | 2 -- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/phold/__init__.py b/src/phold/__init__.py index 6dbc57b..5633744 100755 --- a/src/phold/__init__.py +++ b/src/phold/__init__.py @@ -696,6 +696,11 @@ def proteins_predict( help="Path to directory with pdbs. 
The FASTA headers need to match names of the pdb files", type=click.Path(), ) +@click.option( + "--filter_pdbs", + is_flag=True, + help="Flag that creates a copy of the PDBs with matching record IDs found in the GenBank. Helpful if you have a directory with lots of PDBs and want to annotate only e.g. 1 phage.", +) @common_options @compare_options def proteins_compare( @@ -711,6 +716,7 @@ def proteins_compare( predictions_dir, pdb, pdb_dir, + filter_pdbs, keep_tmp_files, split, split_threshold, @@ -740,6 +746,7 @@ def proteins_compare( "--predictions_dir": predictions_dir, "--pdb": pdb, "--pdb_dir": pdb_dir, + "--filter_pdbs": filter_pdbs, "--keep_tmp_files": keep_tmp_files, "--split": split, "--split_threshold": split_threshold, @@ -795,7 +802,7 @@ def proteins_compare( pdb, pdb_dir, logdir, - filter_pdbs=False, + filter_pdbs, split=split, split_threshold=split_threshold, remote_flag=False, diff --git a/src/phold/features/create_foldseek_db.py b/src/phold/features/create_foldseek_db.py index ae84edd..8b14c5b 100644 --- a/src/phold/features/create_foldseek_db.py +++ b/src/phold/features/create_foldseek_db.py @@ -177,8 +177,6 @@ def generate_foldseek_db_from_pdbs( for id in sequences_aa.keys(): - # cds_id = id.split(":")[1] - # in case the header has a colon in it - this will cause a bug if so cds_id = id.split(":")[1:] cds_id = ":".join(cds_id).strip() From 445e980bdefeb67f409c813f86a04b82ec01a7f5 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Wed, 20 Mar 2024 11:26:58 +1030 Subject: [PATCH 03/11] tidy up --filter_pdbs wording --- src/phold/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/phold/__init__.py b/src/phold/__init__.py index 5633744..ca9bb97 100755 --- a/src/phold/__init__.py +++ b/src/phold/__init__.py @@ -454,7 +454,7 @@ def predict( @click.option( "--filter_pdbs", is_flag=True, - help="Flag that creates a copy of the PDBs with matching record IDs found in the GenBank. 
Helpful if you have a directory with lots of PDBs and want to annotate only e.g. 1 phage.", + help="Flag that creates a copy of the .pdb files with matching record IDs found in the input GenBank file. Helpful if you have a directory with lots of .pdb files and want to annotate only e.g. 1 phage.", ) @common_options @compare_options @@ -693,13 +693,13 @@ def proteins_predict( ) @click.option( "--pdb_dir", - help="Path to directory with pdbs. The FASTA headers need to match names of the pdb files", + help="Path to directory with .pdb files. The FASTA headers need to match names of the .pdb files", type=click.Path(), ) @click.option( "--filter_pdbs", is_flag=True, - help="Flag that creates a copy of the PDBs with matching record IDs found in the GenBank. Helpful if you have a directory with lots of PDBs and want to annotate only e.g. 1 phage.", + help="Flag that creates a copy of the .pdb files with matching record IDs found in the input. Helpful if you have a directory with lots of .pdb files and want to annotate only some.", ) @common_options @compare_options From bad29fbcf04c51b0c8384114151b400a2ec4dce8 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Wed, 20 Mar 2024 20:19:20 +1030 Subject: [PATCH 04/11] Fix #31 issue with older Pharokka output not having transl_table --- src/phold/io/handle_genbank.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/phold/io/handle_genbank.py b/src/phold/io/handle_genbank.py index 71ce178..579cf9d 100644 --- a/src/phold/io/handle_genbank.py +++ b/src/phold/io/handle_genbank.py @@ -239,9 +239,17 @@ def write_genbank( else: # because for some reason when parsing the pharokka genbank, it is a list, fasta it is not if fasta_flag is True: - transl_table = cds_feature.qualifiers["transl_table"] + try: + transl_table = cds_feature.qualifiers["transl_table"] + except: + # for older pharokka input before v1.5.0 + transl_table = "11" else: - transl_table = cds_feature.qualifiers["transl_table"][0] + try: + 
transl_table = cds_feature.qualifiers["transl_table"][0] + except: + # for older pharokka input before v1.5.0 + transl_table = "11" # to reverse the start and end coordinates for output tsv + fix genbank 0 index start relative to pharokka if cds_feature.location.strand == -1: # neg strand From cf931b81dc7d314c4bd4e5af82cd05d9b49fe9f4 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Fri, 22 Mar 2024 12:40:37 +1030 Subject: [PATCH 05/11] add efam support --- src/phold/results/topfunction.py | 9 +- tests/test_data/KF623293.1_subset_efam.fasta | 119 ++++++ tests/test_data/NC_043029_pharokka1.4.1.gbk | 365 +++++++++++++++++++ tests/test_integration.py | 18 +- 4 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 tests/test_data/KF623293.1_subset_efam.fasta create mode 100644 tests/test_data/NC_043029_pharokka1.4.1.gbk diff --git a/src/phold/results/topfunction.py b/src/phold/results/topfunction.py index 7c07975..58210f1 100644 --- a/src/phold/results/topfunction.py +++ b/src/phold/results/topfunction.py @@ -83,8 +83,15 @@ def get_topfunctions( "envhog_" + foldseek_df.loc[mask, "tophit_protein"] ) - foldseek_df["phrog"] = foldseek_df["phrog"].astype("str") + # strip off efam + mask = foldseek_df["phrog"].str.startswith("efam_") + foldseek_df.loc[mask, "phrog"] = foldseek_df.loc[mask, "phrog"].str.replace( + "efam_", "" + ) + # no need to add it on to protein - already done + + foldseek_df["phrog"] = foldseek_df["phrog"].astype("str") # read in the mapping tsv phrog_annot_mapping_tsv: Path = Path(database) / "phold_annots.tsv" phrog_mapping_df = pd.read_csv(phrog_annot_mapping_tsv, sep="\t") diff --git a/tests/test_data/KF623293.1_subset_efam.fasta b/tests/test_data/KF623293.1_subset_efam.fasta new file mode 100644 index 0000000..c316db4 --- /dev/null +++ b/tests/test_data/KF623293.1_subset_efam.fasta @@ -0,0 +1,119 @@ +>KF623293.1 Vibrio phage SHOU24, complete genome +GACGGAAGAGTATTTCGTCAACTACGCTGGCGGCCTACTGACACTAGACCTTCACAAACTTAAAGAAGCT 
+GGCGTTAATACGAACTTCTTCTACACAGACGAAGAGTTTGAAGATAAAGCTAGCGGCTGTTAGTTTGGTC +TGAGACCAAAAGTATTTAGCCCCGTATTAGGGGCTTTAACCTGTAAGGATTAAAAATGATAAAGCTAATG +AATGAAGACTGTTTGTCTGCTATGAGAAAGTTAGAAGACTCTAGCGTTGACTTAGTATTGACTTCACCTC +CATACAACATGAACCTTAGAATACGTAACGGGAAATATTGTTCTAGACAGATAGTCAAAGAAATCAGCAC +CAAGTACAAAGCGTTCGATGACAACCTTCCTATGGATAAATACTTTGAATTTAACAAGTCAGTCATCAAT +GAGTGCCTGAGAGTTTCTGACTTAGTGTTCTACAACGTTCAAATACTTACAGGGAATAAGCCTGCTTTGT +TCCGTCTTATGGGAGAGTTTCACGATAAGCTGAAAGAAGTGATTGTTTGGGATAAGATTAACGCTCAGCC +AGCCATAGGCGCTAACGTTATGAATTCTCAGTTCGAGCTTATTCTTGTGTTGCAGGACTCTTATCCTGAA +AGCCGAGCGTTTAAAACAGCACAATTCGACAGAGGCACTTTGTCTAATCACTGGCAGATAAAACGTGGCA +AAAAAGTGCATAAAGAACACGGAGCTGTGTTCCCAGAAGAATTGGCTGACAGAGTGATATCTAACTTCTC +AAAAGTCGGTGATGTGATCTTAGACCCATTCATGGGAACAGGAACTACTGGCGCTTCTGCGGTAAAACTT +GACAGAAGTTTTATAGGCATAGAGCTAGACAAAGACTATTACGAGTTTTCTGAGCAAAGGATTCTTGGTC +TGAGACCAAAGGAGTAAACTTATGTACACATTGGCTCACTACTTCCGTGACGCCCAGTTGTTTGAGTTTC +CTGATTGGTTCTACGGCATTCAAATGAAACGATATGACGAGGCTTCGGCCTCGTTAGTTCCGTTGGAGCC +GTGGCCTCACCAGAAGAAAGACTTAAACTTCTTATTGAGAAAAGAACGTGCAGGCCTGTTCAATGACGCA +GGTGTAGGTAAAACCATTCCTATGCAGGCGTTTGGCATCTACATGTCAGCCATTGGCAATAAGACTGTAT +TCGCTATGCCTCCTAAGCTTCTAGGGCAGTTCGCTGAGTCCATGGTGGACACTTACCGTGGCGTCGACCA +ACACCTAGACATCTTCTTGCTTAACGAGAAGCAGAAAGAAGCTGAAGCTATTGTTCAGGGCTGGATAGAC +AATCCTAAGTCTCGTCCTGACATCGTGATCATGTCTTACGAGATGTTCGCGTTTCTTCAGCCGATCAAAG +CGTCCCCAGCCAAAGTCATCAAAAACTCAAAGACTGGCGCTGAGTACACTCGTCCTGCAGTCAAACCAAA +GCGTCACCACCCACTTAAACAGGCAGGATTCAACTGTCTGGTATTCGATGAAGCGCACAAGCTGAAAGAG +CCTTCTTCGGCTACACACAAGCGAGTGTGGCGTTGGATAGGTAGTTCAGAGGGTGAGACTCAATTGGTAT +TGGCAACGGGTACGCCTGTTTACAACCAGCTCATCGACGCTTACGGCATTATTCGACTTCTTACCCCTTA +CATTTATGGGTCTAAGAAAGCGTTTGAACGTAAGCACGCAATCATCGACCACCAAAGCGATTACCGTCAG +ATTATCGGCTGGCAGAACGAAGAAGAGTTGCACCACAACCTTTACATTCACGGCCGCCGTGTTACTAAAG +AAGAAGTGTTGAAAGATCTTCCTCCTATGATCCCGTATCAGCATGACATCCGTCTTGGTACGGCTCACAA +AGAGTTGTACAAACGATTGATGACAGAGCGAGTACTGGAGCTGGAAGAAGAGTTTATCGACGCTACCAAC 
+GCTTCGAAGTTGCGTCAGGTAGCGCTTCAGCTTATCTCTAACCCTAACAAGTTCTCTGACACACCTATCA +AGAACGTTATGGATGACTGGTTAGAGAACATGTTCGAAGAGATCGGCATCTATCAGCACAAAGTGATTGT +ATTCGCCTACTTCAAAGACACCGTTGCCTACTTGGTTAACAAGTACAAACACTTGAATCCTGCGGTCATT +AACGGCGCAGGCGGAGACAGTGAAGAAGCGCGTATCAAGTTCTTGCGCGATCCTAGTTGCCGAGTGCTGT +TCTTAAACTGGCGTTCTGGTGGTGCAGGTCTAAACCTGCAAATCAGTCCGTACGAATTGTTCTACGAAGT +GCCTACCGTGCCTGGCGATTTACAGCAAGCTATCGCACGAAGTCACCGAGGCGGTCAGACTCAAGGGGTG +CACGTTCACATCCCTCGCGTCTTCAGTACCGTAGCGAATAAGTCACTTAACCAACTTTTATCTAAGCAGC +TTAAAAACAATGAGGTTGTGAAAGATAAACACAAATTACTTGCAGAATTACTTGGAAAATAACGTCACGG +TTTGACTTCCTTGTGTGCTGAGAGTAATATGTACCCATACCGCGATGATTGAATGATTTGATGATTCGCG +ATATCAACTGAAACTGAAATTGTAACTGTTAATTGTTATCTGATACCTAACCTGAAAAGAGGAAATTATC +ATGGGTCTAGTAAAATCATCTGCTCCTGAAGAAATGGAAACTGTCGAAACTAACGAAGTATCTGAAACTG +AAACGGCTGAAGTTTCTGCCGAGCAAGTTAACGAAGTTATTGCTCAAGCGCAGGAAGAAGAAGTGTCAAC +TGAAACTGAAGTAGCAACTGAGCCTGAAACAGAAGAACCTGTTGAAGAAGTATCAACTGAGCCTGAAACT +GAAACGGCTGAAGAGCCTGTGCAAGAAGAAGTTGAAGAAGTGTCTGAACCTGAAATTGTAGAAGAGAAGA +AAATGGAATCTAAACCTAAACCTAAAGCCGAAGAAAAGAAAGAAGTGAAATCGACTGAAGTTGCTGTAAG +CGAAACTAAGTCTACTGCAGTAGCAAAAGCTGACAACAAGCAGGTTGCGGCTAGCCAAGTTATCCAGCAA +GCTGAAGACGATGGCTTCGGTGGTCTTGAACTAGGCTTCGGCTCATTCGCGATTATCAAACTAGATAACA +GCGGTCAGTTCATGGACTCGGACGATAACGAACTTGGTACGCTAATTCGCGCATCTGTTCAACAGTCGAC +AGCGTCTTACCTTTACGTACAAGAAGGCAACGACGACAGCCCAGCAGTTTATTCATACGACGGTGTGAAC +CTAACAGCAGAAACTGAAGACGGTGACAAAACTGTCGAGCAAGTTAAAATGCGTTGGGTAGACGAAGGCT +ACGATATGGAATGTCGTAAGTACCTAGAAGTTGTTGCGACAATCGCAGAAGATCATGACGACATCATCGA +CGGTGAAGTTCCTGAAGGACTGCTAGACTACGAAGGCGAGACAGTGATGCTTCGTCTACCGCCAGCGTCT +ATTAAGACATTCTCTGGCAAAGTAGCAACGCTTAAGATGCAGAACAAGCCGCTACGTGGTGCGGTAATGG +ACTTCAAAGTAGGCAAACAACGTAAGACAAACAACGGCAAAGCGAAATACTTCCCATGGAAGATCAGCCT +AGTTAAGTAATTTGTTCTACATTTTGGTCTCAGACCAAGAGGCGGCCAATGTGCCGCCTTTTCTTTACAA +ACTCCTGAAAGGAACTAATAAATGTCTAAATTCAAACCAGCAATATCACAACCGCTAGTTCACGTTGACC +AGTGGGCTATCCTAGATTACCAAGCACTGCTAAAACACACGCACAACATGGGGCAAGACCCTGACTGTGT 
+TTACGGCAGCGTAACAGGTAAAGAAATCAACTCGGCACAACACACAGTACGCGCTTTCCTTGATGCGTAC +ATTCTGCCGATCCTAGACAACGGCTTCACACCTCGCCAAATCCTAGTAGCTCACGACGACGGTCACGCTT +ACCGTTCTAAACTGCTTCCTGCGTACAAAGAGAAGCGTGATCAACGTAAGAACGACGAAACTGTCACTGA +CCCTGAGATTTTCCAAGAGCTGAACAAAGCTAGCGACTCAATCAAGCGCATCCTGGCTTACTTAGGTTGT +ACTCAGGCTCGTGTTAAAGGCGTTGAAGGTGACGACATCATCGCCCACTTCTGCAAGCTGCCAGGCCTTA +AACAGGTATACACAGTGGACGCCGACTTACTTCGTCTAACCGCTGAAGACACGATGGTCATTCTCAAAAA +CAAACCTATCTTCCTACAAGATTTGGACGTTGAGAATCTGCCTAAGAACATGCACAAGTTCATCGAACCG +TTCATGAAAGGCGACAACTGCAAAGTTAAAAACCCATTCAAGTACCTCACTTTGTATAAATCAATCGTTG +GTGACTCATCTGACGAATACAAAGGCGTACCTCAGATGGGCGACAAAGCTTGGGCTGACTTGGTTGAAAC +GTTCGATTTCGACGGTCTAGACGAGCTGATCAGCATTGTCGAAGACAAAGACTGGAAGAAGCTGGACTCT +TACGTGAAACATTACGAAGAGACCGCAAACGGTAGCAAAGCGCACAAGCTTCTTAAGAAATTGCACGACG +AAGCAGGTCCATGGCGAACTTGCTGGACTGTTGCTGAGCTTCACCCTGAGCTGTGTTACAAGCCATTCAA +TAAAAAGCTTACTCAGATCGACTGGTTCAAACGTGTTCCGAATCGAGCGCGCATCGTTAAGTTGCTTGAA +GACAACTACTGCCCTGAGTTCTTAGAAGACCTAGAGCAGTTCTTACCTGTTGAGTGGCTAATCGACGCTT +CTAACTTCGAAGAGTCTGATATTGACGAGTTCCGTGAAATCTGCGAACAGTCGCCACACATCAGCTTCGA +CTACGAAGGTGCAACTGATAACCAAGACTGGATCAACACACTTCTAAACAGCACCAGCATGAAGTCGTAT +GTAGATGTTCTTGGCCAAGGCGTAACTGGCGTGTCGTTCAACTTCGGTAACAACTTGCAATACACTTGTT +ACTTGACGATCGACCACAAGAACAGCCACAACCTGCCAATCAAAGTGGTTCGTAAGTTCTTTGAGGCTAT +CCCTAAAGATAAAATTCGCGTCGCGCACAACTCTCAGTTCGAAGAAGTTCTAACCTTCACCAACTTGGAC +GGATACGTGATGCCTATCGGCACTGTTCACGATACGGCGATCATGTCTACGTATGACGACGAGAACAAAG +AATCTCACGGTCTGAAAGCGTTAAGTAAGACGCTTCTTGGCTACAACCAGGCTTCGTATGAAGACACCTT +GGCACAGGCTGGCGCTACTAACATGCGCGAACTGACTGCATGCCAGGTGCTTAAATACGGCCTTGACGAC +TCGACGGTAACAGGTCACTTATACGATTTGTTCCGTATCAGTCTGTACCTTCAAGATATGATGGACTTCT +ACGTATCTAACGAGCCTTACGTAAACCACCGTCTAGCGATGTCGTTTATCCAAGGTACTGACATCGACTG +GGAACGTCAGAAAGAAATTCAGGAAGAAGACGAACGCGATATCGAAGAGTCTATGTCTGAGCTTCGTGAG +ATTCTCGAAGAGCACTGTAGCAATGTAAACCGAGAAGCAGCAGAAGCGTTTTTCAACGAAGAGAAGAAGT +TCCTTGAAGCGTCTGCACGAGACAAGTACGGCAAGATGGATCGCGCTAAAGTTATCTCTCGCGCCGATGC 
+CGTAGCGTCAGAAATCGAAAAGATGGACATCGCTGACGACACTTACATGGACATCGCTAACTCGATTCGC +GCCAAAGTCCCTCAAGCCAAGAGCAAGAAAGACAAAGAAGCGCGTGAAGAAGAGATTAAAAACCTCGATC +ACAAAGTCCTTATCGCATGGATTGTTAAGTTTGACCTACATAAAGCGCTGAACAAGAACATTGAAGACTC +TGTCTACGTCAAGTACGTCAAGACAGTGCACTCAGCAGAGTTCGCTCCTTCAGTGAAAAACCTAGACACG +GTGGCTGAGAAACTTGGTCTTCCTCTTCCTGGCACAGCGGCGAAAGGCAAACTTGAAGCGTGGGAAGCGT +CTGTGCGTGAAATCGACTTTGAAGCCGAGATCGACAAGTATGACGAGTTCAATGACGATCAGAAGAAGTT +CATCGACTTGCTGGCTAAAGCGCGTAATCACTTCAAACCTGATCAGCGCGGTCACGAAGACTACCAAGCG +TTCCAGTCTTTCTGCTGCAACTTGTTAGGCATTGAAGGCAAAGTGTCTTACTCCGGCACTGAACTGAACT +TAGGTTCGCCAAACCAGATGAAAGGTTTGCTGTACTGCATGCTAGGCCTTCCAGTTCGACTACGAGGCAA +ACCTAGCCCTGGCCGTACAAGCTTAGGGTTCTGGGAAGGTTCACCTGCTACCGACGCTCTGGCGCAGGAT +ACGGCGCTTGCTGAAGACATCACAGTCAATCCAGAACGTCACTGGCAAGAGCGCGTAATCAACCTTATTA +AGAAGGTGACCGAATGTAATACTCGCATGAGTTTGTACCACAATAGCTACCCTTACTTGGCGCACCCTCG +TGATGGTAAATTGCACCCTAACGTTCGTAACTGTGGCACTGTAACTCGACGCCCAACTGGCTCGCAGCCA +AACATTCTTCAAGTTTCTAAACACCAGAAGAAAGGCGCGATGCGTGGTATCTACATTCCGCCTCGCGGTT +ACTGTGTAGTCCCTATCGACTTTGCAGGCCAAGAGCTTCGTATTCAGGCGTCAGAGACGAAGGATAAAAA +CCTATTGCAGGTTTATCTAGGTGAAGAGCTTGCAGGTCAGTACCTGCGTGGTGAAGTGTTAGACATTACG +TACGACATGGTAAAAGACAAGAAGGACTTGAAAGACCTGCACAGCATGACTGCTTCAGGTATCACAGAGC +ACTTCGGTCTTGATGAAGCGGGTAAACTTGTACCTGGTGGTCAGCACATTAAATGGACTCCGCCTACGTA +CGAAGAATATGTTGAAGCGCATAAGAACGAAGACCACGAGTACCATGATCTAGCGTCTAAGGTACGTAAA +CGACCAGCTAAGCAAACTAACTTCTTGCTTTCTTACGGCGGTACGTCTCAGACTCTATCGCACCGTCTGA +TCATCCCGGAATCAGTGGCGCAAAGTATCATGGACTCAACGTTAACGCTGTATGCTGGCATTCCAGTAGC +TCAGGACGAAACGCTACACTATGCACGTATGAACGGCTTCGTTACTACGGCTTATGGTAACCGCCGTCAT +GCTACAGACGAGATTTTCAGTAAGTCTAAAGGTCCGGTTAACCGTCAGGTTCGTCAGCTATACAACTACC +GAATCCAGGGCTGTGCGGCGGATATCCTTAAAGTGGTGCTGGCTAACTGCGAGAAGCAAGACCTGTGGGA +CAAATACAATGCAATCATGGTTGCTCCTGTATATGATGAAGTTGTTGCTTACGTTCCTTTCGAACACGCT +TGGGACTTTGTTCAGGACATGCGCTCTATCATGAACTTAACACCTCCTGGTCATGCGGTTCCGATGGTAG +CCGACGTGAGTGTAGGCCCTAACTGGCAAGTGCAGTACGAACTCGGCCACCAACCGACTAAAGAAGAGTT 
+CGACAAGTGCATGAATGAAAATGTAATCCCAGAAGCTGAAGCTATTTGGGACCGTATCGAAGCTGCTTAA +TTGGAGAAATAAAATGGCAGGACATACAGTACACGTAGTCAATAACGAAGACGGTAAAGTGGTAGAGACG +TACCCAAATGCTAAGATTGAAATCACGCAGGGTATGCTTTACGTGAAGAGTGCGGCAGATGATGCTATTC +TGTGGTTCTCTAACACGGCCACACACTCAGCTAAAGTAGACAGCTCTGCAAGCTGATAAAATTCAACCTA +ACTAAGGCGGCCAATTGGCCGCCTTTCTTTTATCTGAGAGAAAATTATGCGAGTTATCCTAGAATCCCCG +TTCAAATCAAACGATCCTAAGCTCTTCTTTGAGAACATGCTCTACGTTAACATGGTCGCCAGACACCTGA +CGATTGAGGAGGGTATGTGCCCTTTGTTCTTCCACACGTTCTACACGCAGTTCCTGGATGACAGTAACGA +CAACGAACGTGCGTTAGGGCTAGACGCCAGCTTCCACCACCATGACCGTATAGACACGCGCATAATCGCC +ATAGACAGAGGCCTAAGCCTAGGTATGAAGTTGGGTACTGAATATGGCTTAGAGATTGGCTGCATGCCTG diff --git a/tests/test_data/NC_043029_pharokka1.4.1.gbk b/tests/test_data/NC_043029_pharokka1.4.1.gbk new file mode 100644 index 0000000..1ab6a34 --- /dev/null +++ b/tests/test_data/NC_043029_pharokka1.4.1.gbk @@ -0,0 +1,365 @@ +LOCUS NC_043029 7648 bp DNA linear VRL 20-MAR-2024 +DEFINITION NC_043029. +ACCESSION NC_043029 +VERSION NC_043029 +KEYWORDS . +SOURCE . + ORGANISM . + . 
+FEATURES Location/Qualifiers + CDS 1..1056 + /ID="EDNGHAAT_CDS_0001" + /phrog="453" + /top_hit="p438425 VI_01048" + /locus_tag="EDNGHAAT_CDS_0001" + /function="DNA" + /function=" RNA and nucleotide metabolism" + /product="replication initiation protein" + /source="PHANOTATE" + /score="-14247.55" + /phase="0" + /translation="MAVDRARFRMAVEGGAGGFSPLSPGEKGQRAAAEIGPGSNTGQKG + QQDAIIDYLTIVVPLSALEEVNCKKLDLLLFRIFGFRGEVVAGAIREKNWNFYEQSAVL + IDRENEVVGRVGIGGKKSTVCLSLTGMGCKWIRDWARVYKQCSMLDAKITRVDCAHDDY + EGERLDVHALREVAAQGGFTEGGCPPRHRFISDEGHNTGCTLYVGGKGHKELCVYEKGK + AEGLPSSRWVRAEVRLYGKHMEIPLDVLLNPGAYLRGSYSALQDLIKGVCTRLRTIRKH + VEVSAEAMVLWMERQVGPALSVLRGAFGDSWSDFCEARIVRDGHPGRFRGIAKGDALHR + FVREELCPSAA" + CDS 1038..1316 + /ID="EDNGHAAT_CDS_0002" + /phrog="1199" + /top_hit="p382337 VI_06652" + /locus_tag="EDNGHAAT_CDS_0002" + /function="DNA" + /function=" RNA and nucleotide metabolism" + /product="single strand DNA binding protein" + /source="PHANOTATE" + /score="-9.306293" + /phase="0" + /translation="MPICRVKSAAVEERHNSKTNTINRSQTVGLDLGNGFELPFRVGLG + SRPPYTPGEYDIDPQSFALSQYGDLVLKRYVDLVPLQAKAAAAPAKP" + CDS 1340..1432 + /ID="EDNGHAAT_CDS_0003" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0003" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-0.06263662" + /phase="0" + /translation="LPRSRPRHGRGDLHGCDLDSSAGTAAGTAD" + CDS 1435..3327 + /ID="EDNGHAAT_CDS_0004" + /phrog="4454" + /top_hit="p436949 VI_06948" + /locus_tag="EDNGHAAT_CDS_0004" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-41003.94" + /phase="0" + /translation="MHRPSAQRSRCCGLSRTCSGSFARKSNSPRRTHAQDVQRPEGQGC + RAGGCRHRRAGFGSGLRVGWWWRGRGPGGHVHQRRARPGRSDRGCCAVGAGRHQGLQVG + APRDVTATGGQGRLPPPGLLTPWIGQGAWDGRVDLAVRMARGLRDHLRGLQLMRWVARV + FASAFIRRVAVLLVAALVGWCFSGRAHAAACASYTDQCTEGAAKQGALAWGGAQSKCVA + VAGPNGRAGGNVSSKKSEGAGRGYFTVKAECLLNGNVVTYVEPAPPAEQGQWFYTQSCD + 
AQPSYTGTGPWGSGGSAKNGSLGCRNGCDGIWQTNADASKTWTPLGNTCPDDEKKTCET + YGDGYYWNSLLKVCEPPEGKCQGGGRPNSLGQCAPEPCPEGMAQQADGTCKKKDNECPA + GQVRSPDGKCLPGDGQCAKGEVRGQDGTCKKDADNDGNPDPVNEDSFSGGDDCSAPPSC + SGSPIMCGQARIQWRIDCNTRKNRNIAGGTCASMPICTGEKCDAMEYSALLMQWRSACA + LEKMAQGNGNGGGDNGDTKAIRDALTGTGGAVTTAPDRPSSDVWAPRSGTPVKPDTGGY + GWGRTCPQPPSFEVFGNVIQINTAPLCNWLILGGYFVMGLAALASLRIIASRDA" + CDS 3330..3641 + /ID="EDNGHAAT_CDS_0005" + /phrog="1242" + /top_hit="p296981 VI_03025" + /locus_tag="EDNGHAAT_CDS_0005" + /function="head and packaging" + /product="minor head protein" + /source="PHANOTATE" + /score="-21.08314" + /phase="0" + /translation="MPMLISTLLTALAALFRSKWGPWVAEAMVWLGLSWATNEFLVQPW + IDQMEQAMRAGTPGGEFGALVIAYAGLMKFDVACTMIASAVTAKFAVGAAKTFLTKRA" + CDS 3643..4845 + /ID="EDNGHAAT_CDS_0006" + /phrog="197" + /top_hit="p439678 VI_01075" + /locus_tag="EDNGHAAT_CDS_0006" + /function="moron" + /function=" auxiliary metabolic gene and host takeover" + /product="Zot-like toxin" + /source="PHANOTATE" + /score="-9628.932" + /phase="0" + /translation="MPIELFTGQPGNGKTALMMERLVAEAKAASRPIFAAGIDGLDPGL + ATVLDDPRHWNNKDADGNYIVPDGSLIFVDEAWKWFGHLHDATRQQTPRHVLELAEHRH + RGLDFVWTTQQPNQLYPFVRGLIGSHAHVVRRFGTKMLDVYRWGELNEEIKSLAKRDMA + QRTTRLLPSQVFGQYKSAEVHTIKARIPFKVMLLPVLAIAAIVFAYLAYTSLRPSSFAG + GEGKEGTQSASADAAPSPFRPAGAKEDAPRWPTAAAYAKDHLPRISTMPWTAPVFDERQ + ARSDPQLVCMSSLEGLDAQGVRQEASCRCLTEQGTAYELSQPECRTLARNGPVYNPYRE + RSEERSTQRIEDLERSRPGVATTSAGGVAQHVERSMGTFPESPSYRSDSYMTTAPGPNK + L" + CDS 4890..5468 + /ID="EDNGHAAT_CDS_0007" + /phrog="5529" + /top_hit="p20940 VI_08217" + /locus_tag="EDNGHAAT_CDS_0007" + /function="other" + /product="transfer protein" + /source="PHANOTATE" + /score="-45.34559" + /phase="0" + /translation="MTGDHVAKVIYGGYVPGLSEAGRVAFPLFALVMAYNLAQPGADVG + KSVRRLALWGAIAQPVHALAFGYWLPLNILLTFGVCAAAVYAACQRNWIVLAFAAVVLP + AFVDYQWAGVAFVLLAWLGFRTGRLLLTLVAFAPLCAFNGNLWALVAIPAALGLSHTAW + SVPRGRWTFYGYYVAHLACLGLLAPILRP" + CDS 5465..5554 + /ID="EDNGHAAT_CDS_0008" + /phrog="No_PHROG" + /top_hit="No_PHROG" + 
/locus_tag="EDNGHAAT_CDS_0008" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-2.183448" + /phase="0" + /translation="MSLRRYLDIHYWVARWMDRAFARERGRKA" + CDS complement(5559..5963) + /ID="EDNGHAAT_CDS_0009" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0009" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-9.518482" + /phase="0" + /translation="MERERPEYLQPIPRRRWEFPWLGMWAVLLLGGAGAGIWLHLKTGD + AWNTRFMAAAETSDAAAPIEPSQADTDASRQVMIAEIRARRELAEIAAKRARAGRSDTP + AHTDELRCINGIAFRRIPGGWENVPGAPCP" + CDS complement(6031..6222) + /ID="EDNGHAAT_CDS_0010" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0010" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-5.020161" + /phase="0" + /translation="MESSRAIDKPRNPLSFNAASVVNLSDALAQRRKKRSPGAMAGPDA + EPPAAVLPVPGPKRRQRV" + CDS complement(6222..6320) + /ID="EDNGHAAT_CDS_0011" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0011" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-0.1352431" + /phase="0" + /translation="VADWSHPRAASWNRRIWPGCRSPLRKRRNGVG" + CDS complement(6372..6818) + /ID="EDNGHAAT_CDS_0012" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0012" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-40.18632" + /phase="0" + /translation="MRSIDLLLDKAREKCERPSDRALAEKLRVTASAVSKWRKGGVITE + MHATALAAIAGLDGEIVVRVMEEQAETPAQRRVWRSVLDRLSAAAAVLMLVVFAAPGAA + RAKAIDSQGSSGSDQPHSVYYVRIILGWLARLLPLPRHLLWHGA" + CDS complement(6870..6968) + /ID="EDNGHAAT_CDS_0013" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0013" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-2.143476" + /phase="0" + /translation="MNGSIIPLPLPQAPKGTRQRPWGAGGGASTTS" + CDS 6952..7212 + 
/ID="EDNGHAAT_CDS_0014" + /phrog="2181" + /top_hit="p405308 VI_02583" + /locus_tag="EDNGHAAT_CDS_0014" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-7.553578" + /phase="0" + /translation="MIDPFIAFVLLAAIVAVSIGSAKLVSWCLDRRGESARRSAREAAI + VAEACAELAATGWTAEDEASFQAIRGQQLVFLKHLQEVRHG" + CDS 7205..7351 + /ID="EDNGHAAT_CDS_0015" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0015" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-17.07403" + /phase="0" + /translation="MVKVLLFSAVLFGAVAILKDELYFAVVSALLGLLAYGFQAAEDRS + NGR" + CDS 7341..7475 + /ID="EDNGHAAT_CDS_0016" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0016" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-6.300162" + /phase="0" + /translation="MAVDQFREFLRDPFVVSVLGGVLLTGLYWSLVLALRAKGAGNGR" + CDS 7513..7647 + /ID="EDNGHAAT_CDS_0017" + /phrog="No_PHROG" + /top_hit="No_PHROG" + /locus_tag="EDNGHAAT_CDS_0017" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE" + /score="-2.474578" + /phase="0" + /translation="MQLFAGMFLLLAFCAVFSPFDLHSWEYRVRRYLRRRRIARIRESV + " +ORIGIN + 1 atggccgttg atcgcgctcg cttcaggatg gctgtcgagg gcggggcagg gggcttttcc + 61 ccgctttcgc ccggtgaaaa ggggcagcgg gcggcggcgg agattggccc ggggagtaac + 121 acgggccaaa agggtcagca agacgcaatc atcgactacc tgaccattgt ggtcccgctc + 181 tccgcccttg aggaagtgaa ctgcaagaag ctcgacctct tgctgttccg catcttcggt + 241 ttccgtggcg aagttgttgc cggtgcgatt cgtgagaaga actggaactt ctacgagcag + 301 tcggcggtgc tgatcgaccg ggaaaacgag gtggttggtc gtgtcggcat cggcggcaag + 361 aagagcaccg tatgcctgag cctcaccggt atgggttgca aatggattcg tgactgggcg + 421 cgcgtctaca agcagtgctc catgcttgat gccaagatca cccgcgttga ctgcgcgcac + 481 gatgactacg agggcgaacg cctggacgtg catgcgctcc gcgaggtagc agctcaaggt + 541 ggtttcactg aaggcggatg cccgccgcgc caccgcttca tttccgatga aggccacaac + 601 accggctgca cgctttatgt 
cggcggaaaa ggccacaagg aactgtgcgt atacgagaag + 661 gggaaggccg agggcctgcc gtcttcgcgc tgggtgcgcg cggaagttcg cctgtacggc + 721 aagcacatgg aaatcccgct ggatgtgctg ttgaacccgg gcgcgtacct gcggggttct + 781 tacagcgcgt tgcaggacct catcaagggc gtgtgcactc gactgcgcac gatccgcaag + 841 catgtcgaag tatctgccga ggcaatggtg ctctggatgg agcgtcaggt cggcccggcc + 901 ctcagtgttc tgcgcggagc gttcggagat tcatggtccg acttctgcga ggcccgcatc + 961 gtccgtgacg gtcaccccgg acgttttcgc ggtattgcca agggtgacgc actccatcgt + 1021 ttcgtgaggg aagaactatg cccatctgcc gcgtgaagtc cgctgccgtc gaagagcggc + 1081 acaacagcaa gaccaacacc atcaatcgtt cgcagaccgt tggcctcgac ctgggcaacg + 1141 gattcgaact gccgttccgt gtcggcctcg gctcgcgccc gccgtatacg ccgggtgagt + 1201 acgacattga cccgcagtcc ttcgcactga gccagtacgg cgatctggtg ttgaagcgtt + 1261 acgtggacct cgttccgctg caggcgaagg ccgcagccgc accggcgaag ccgtaagcca + 1321 tggccgtgct gatccccgct tgccgcgaag ccgacctcga cacggccgcg gggacctgca + 1381 cggctgtgat ctggattcct cagccggcac tgctgccgga actgccgatt gaggatgcac + 1441 aggccatcgg cgcaaagatc gcgctgctgt gggctgtcgc gtacgtgttc cggctcattc + 1501 gcaagaaaat cgaacagtcc taggaggaca catgcacaag atgttcaacg ccctgaaggg + 1561 caagggtgcc gcgctggcgg ctgtcggcac cgccgcgctg gcttcggctc cggccttcgc + 1621 gtcgggtggt ggtggcgtgg acgtgggccc ggtggtcacg tccatcaacg gcgcgctcgg + 1681 cccggtcggt cagatcgggg ctgctgtgct gttggtgctg gtaggcatca aggtctacaa + 1741 gtgggtgcgc cgcgcgatgt aacggcaacc ggggggcagg gccgactccc tccccccggt + 1801 cttctaacgc cctggatagg gcagggggct tgggatggaa gggtggattt ggctgtgcgc + 1861 atggctcgtg gcttgcgcga tcatcttcgt ggacttcaac tgatgcgctg ggtggcacgc + 1921 gtgtttgctt ccgcgttcat tcggcgagta gcggtactcc tcgttgctgc gcttgtcggc + 1981 tggtgcttct cagggcgtgc gcatgccgcc gcgtgtgctt cctacaccga ccaatgcacc + 2041 gaaggtgcag ctaaacaggg cgcgttagca tggggcgggg cgcagtcgaa gtgtgtagcg + 2101 gtggccgggc cgaatggccg cgcgggcgga aacgtgtcga gtaagaagtc ggaaggcgct + 2161 ggcagggggt acttcactgt caaggcggaa tgcctcttga acggtaacgt tgtgacctac + 2221 gttgagcctg cgccgccagc tgagcagggg cagtggtttt acacgcagtc 
atgtgatgca + 2281 cagccttcat ataccggcac tggtccatgg ggcagcggtg gttcagcgaa gaatggcagc + 2341 ctcggttgcc gcaatggctg tgacggcatc tggcaaacca acgcagatgc ctccaagacg + 2401 tggacgccac ttggtaacac ttgcccagat gatgagaaga agacatgcga aacctacggc + 2461 gacggctact actggaactc gttgctgaag gtttgtgagc cgcctgaggg gaaatgccag + 2521 ggcggtggcc gtccaaattc gttgggtcaa tgcgctcccg agccgtgccc cgagggcatg + 2581 gcgcagcaag ccgatggaac gtgcaagaag aaggataacg agtgccctgc gggtcaggtc + 2641 cgttcacctg atggtaagtg ccttcccggc gacggccagt gtgcaaaggg tgaggtgcgc + 2701 ggccaggacg ggacctgcaa gaaggacgcc gacaacgacg gcaatcccga tccagtgaat + 2761 gaggattcat tcagtggggg cgacgactgt agcgcgccac catcgtgcag tggctcgccg + 2821 atcatgtgcg gtcaggcgcg cattcaatgg cgcatcgatt gcaacacgcg caagaaccgc + 2881 aacatcgccg ggggtacttg tgcgtccatg ccgatctgta cgggcgagaa gtgcgacgcg + 2941 atggagtatt ccgcgctgtt gatgcagtgg cggtcagcct gcgcgttgga aaagatggcg + 3001 cagggtaacg gcaatggtgg tggtgataac ggagacacca aggcgattcg tgacgcactc + 3061 accggcactg gtggggcagt cacaacggcg ccggatcggc ctagctccga cgtatgggcg + 3121 ccgcgcagcg gcacgccggt caagcctgat acgggcggat acggctgggg gcgtacgtgc + 3181 ccgcagccgc ccagcttcga agtgttcgga aatgtcatcc agatcaacac agcaccgctc + 3241 tgcaactggc tgattctcgg gggctacttc gtgatggggc tcgccgcgct ggcctcgctt + 3301 cgcatcatcg catctaggga cgcttgatca tgccaatgct catcagcaca ttgctgaccg + 3361 cgcttgcagc gctgttccgt tccaagtggg gcccatgggt cgctgaagcc atggtgtggc + 3421 tgggcttgtc ctgggcaacc aacgaattct tggtgcagcc atggattgat cagatggaac + 3481 aggcgatgcg cgcaggtacg cccggcggcg agttcggcgc gctggtcatt gcttacgcgg + 3541 gactcatgaa gttcgacgtg gcctgcacca tgattgcctc ggcggtgacc gcgaagttcg + 3601 ccgtgggggc tgcaaaaacg ttcctgacga agcgggcctg acatgcctat cgaactcttc + 3661 accggtcagc cgggcaatgg caaaacggcg ctgatgatgg aacggcttgt cgctgaggcg + 3721 aaggccgcaa gtcgtccaat ttttgcggcg ggcatcgatg gccttgaccc gggccttgcg + 3781 actgttctgg acgatccgcg ccactggaac aacaaggatg cggacgggaa ttacattgtt + 3841 cctgatggct cgctcatctt cgtggatgaa gcgtggaagt ggttcggcca cctgcatgac + 3901 gccactcgcc 
aacagacgcc gcgtcacgtg ctcgaactgg ctgagcatcg acatcgcggc + 3961 ctggacttcg tatggacgac tcagcagccg aaccagctgt atccattcgt gcgcggtctg + 4021 atcggatcgc atgcgcatgt ggttcgccgc ttcggcacaa agatgctcga tgtctatcgc + 4081 tggggtgagt tgaacgaaga aatcaagtcg ctggcgaagc gcgacatggc gcagcgcacg + 4141 acccggttgc tgccctcgca ggtcttcggt caatacaagt ccgctgaggt acacacgatc + 4201 aaggcccgca ttcccttcaa ggtgatgctg ttgccggtgc tggcgattgc tgccatcgtt + 4261 ttcgcctatc tggcatacac gtcgcttcgt ccctcaagct tcgccggtgg cgaggggaaa + 4321 gaggggacgc aatcggcgtc agccgatgcg gccccttcgc ccttccgacc agcgggagcc + 4381 aaggaagatg cgccgcgttg gccgactgcc gctgcatatg ccaaggatca cctgccgcgc + 4441 atcagcacca tgccctggac agcgccggtc tttgatgagc ggcaggcacg ttcggatccg + 4501 cagttggtgt gtatgtcgtc gctggaaggg ctggatgcgc agggcgttcg acaggaggcc + 4561 agctgcaggt gtctgacgga gcagggcacc gcatatgagt tgagccagcc agaatgccgc + 4621 acgctggctc gaaacgggcc ggtctataac ccgtaccgcg agcgctcaga ggaacgcagt + 4681 acccagcgga ttgaggacct tgaacgatct cggccgggcg tagcaaccac gagtgccgga + 4741 ggtgttgctc agcacgttga acgttcgatg ggcacgtttc cagagtcgcc gtcctatcga + 4801 tctgattcct acatgaccac ggcgccgggg ccgaacaagc tgtgaccagt agcgcgcgcg + 4861 aactgttgaa gtggctggcc gtcatcctta tgactgggga tcacgtcgcc aaggtgatat + 4921 acggcgggta cgtgcctgga ctcagcgaag cggggcgggt ggccttcccg cttttcgcac + 4981 tggtgatggc ctacaacctc gcgcagcccg gtgctgatgt aggcaagtcc gtgcggaggc + 5041 tcgccctgtg gggcgccatc gcacagccgg tccacgcgct ggcgttcggc tactggttgc + 5101 cgctgaacat cctgctcacc ttcggcgtgt gtgctgcggc ggtctacgcg gcctgccagc + 5161 gtaactggat cgtcctggcg ttcgccgcgg tggtgctgcc ggcgttcgtg gactaccagt + 5221 gggccggggt ggctttcgtg ctgctggcat ggctgggatt ccgcacagga cgtttgttac + 5281 tgactttggt cgcgttcgcg cctctgtgtg ccttcaacgg caacctgtgg gcgctggtag + 5341 ccattcccgc ggccttgggg ctatcgcaca cggcgtggtc cgtcccgcgc ggtcggtgga + 5401 ccttctacgg ttattacgtc gcccaccttg cgtgcctagg gctgttggcg cctatactgc + 5461 ggccatgagc ctgcgccgat acctcgacat tcactattgg gtcgctcgat ggatggaccg + 5521 ggcgttcgcg cgagagcgag gccgtaaggc 
gtagctcatc aagggcaggg tgcgccgggt + 5581 acgttctccc agccgcctgg gatccggcga aatgcgattc cattgatgca tcggagctca + 5641 tctgtgtgcg ctggcgtgtc cgaacggcca gctcgggcgc gtttggcggc aatctctgca + 5701 agctctctac gcgcacggat ttcagcgatc atcacctgac ggctggcatc agtatctgcc + 5761 tgtgatggct cgatcggtgc ggctgcgtcg gatgtttctg cggccgccat gaatcgcgta + 5821 ttccaggcgt ctccggtctt caggtgtagc cagatgccag caccagcccc gcccagtagc + 5881 aggacggccc acattccgag ccatgggaac tcccatcggc ggcgtgggat cggttgaagg + 5941 tactccggtc gttcgcgttc catacggccc ccaatgcgtc ctgcgcgcat tgtagccggg + 6001 gtgtaggggc agcgccccta cggaagcgcc tcacacgcgc tggcggcgtt tcggccccgg + 6061 taccggcagg actgctgcag gcggctcggc gtcgggtcca gccatcgccc cgggtgaccg + 6121 ctttttccgg cgctgtgcca aggcatcgga gaggttcacc acgctagcgg cgttgaagga + 6181 caagggattc cggggcttgt cgatcgcgcg gctgctctcc atcatccgac gccattcctg + 6241 cgcttgcgca gcggtgagcg acagccaggc cagatcctgc ggttccaact cgcggccctc + 6301 gggtgtgacc agtcggccac ccttaaacga aaaaccggcc caagggccgg tcagtttccg + 6361 atcacgcacg atcaggctcc atgccagagt aggtgccggg gcagcggcaa gagacgtgcc + 6421 agccacccca agatgatccg aacataatat acagaatgcg gctgatcgga gccggaagag + 6481 ccttgtgaat caatggcttt agcgcgtgct gcgcctggtg ctgcgaacac taccagcatc + 6541 agcactgccg ccgccgcgct taacctgtcc agcactgagc gccagaccct acgctgagcg + 6601 ggggtctcgg cctgttcctc catcactcgc acgacgatct cgccatccag accagcgatg + 6661 gcggccagcg cagttgcgtg catttccgtg atcactccgc ccttgcgcca cttcgatacg + 6721 gcgcttgcgg tcacgcgcaa tttctccgcc aaagctctgt ccgacgggcg ttcgcacttc + 6781 tcgcgggcct tgtctagcag taggtcgatg cttcgcatgt ggagtaccag ttgacacggg + 6841 gatcaattgt gagtttatat cgccccgcgt cagctggtag ttgatgcccc gccaccggca + 6901 ccccaaggtc gctggcgggt tcccttgggg gcttggggta ggggaagagg gatgatcgat + 6961 ccgttcattg ccttcgtgct gctggcggcc atcgtggccg tatccattgg cagcgccaaa + 7021 ctcgtttcgt ggtgcctcga ccggcgcggg gagtctgccc gtcgcagtgc acgcgaagcg + 7081 gccatcgtcg ccgaggcatg cgccgaactg gccgccaccg gctggactgc tgaagacgaa + 7141 gcctcgttcc aagccattcg gggccagcag ctcgtcttcc tgaagcatct 
acaggaggtg + 7201 cgtcatggtt aaggtccttc tcttcagcgc agtgctgttc ggtgctgtcg ccatactcaa + 7261 ggatgagctg tatttcgccg tggtttcagc tctcttgggc ttgctcgcgt acggattcca + 7321 ggctgcggag gatcgctcca atggccgttg atcagttccg cgagttcctc cgcgatccgt + 7381 tcgttgtctc cgtgctgggc ggcgtcctgc ttaccggcct ctattggtcg ctggtgctcg + 7441 cgctgcgtgc gaagggggcg ggcaatggcc gctgagtgct tggtcatcac caaggcggat + 7501 tgggatcagc tgatgcagct gttcgccgga atgttcctgc tgctcgcctt ctgcgccgtg + 7561 ttctcgccgt tcgatctgca ttcgtgggag taccgcgtgc gccgctatct gcgtcgccgt + 7621 cgcattgcgc gcattcggga gtccgttc +// diff --git a/tests/test_integration.py b/tests/test_integration.py index c00ac29..5c80425 100755 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -39,7 +39,9 @@ output_dir = Path(f"{test_data}/outputs") output_dir.mkdir(parents=True, exist_ok=True) run_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_run_gbk" +run_gbk_pharokka_1_4_1_dir : Path = f"{output_dir}/NC_043029_pharokka1.4.1_gbk" run_fasta_dir: Path = f"{output_dir}/combined_truncated_phold_run_fasta" +run_fasta_efam_dir: Path = f"{output_dir}/KF_efam_phold_run_fasta" predict_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_predict_gbk" compare_pdb_dir: Path = f"{output_dir}/NC_043029_phold_compare_gbk_pdb" compare_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_compare_gbk" @@ -100,6 +102,14 @@ def test_install(): def test_run_genbank(gpu_available, threads): """test phold run with genbank input""" + input_gbk: Path = f"{test_data}/NC_043029_pharokka1.4.1.gbk" + cmd = f"phold run -i {input_gbk} -o {run_gbk_pharokka_1_4_1_dir} -t {threads} -d {database_dir} -f" + if gpu_available is False: + cmd = f"{cmd} --cpu" + exec_command(cmd) + +def test_run_genbank_old_pharokka(gpu_available, threads): + """test phold run with genbank input from pharokka prior to v1.5.0 no transl_table field (#34)""" input_gbk: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.gbk" cmd = f"phold run -i {input_gbk} -o 
{run_gbk_dir} -t {threads} -d {database_dir} -f" if gpu_available is False: @@ -114,6 +124,13 @@ def test_run_fasta(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) +def test_run_efam(gpu_available, threads): + """test phold run with a tophit to efam""" + input_fasta: Path = f"{test_data}/KF623293.1_subset_efam.fasta" + cmd = f"phold run -i {input_fasta} -o {run_fasta_efam_dir} -t {threads} -d {database_dir} -f" + if gpu_available is False: + cmd = f"{cmd} --cpu" + exec_command(cmd) def test_predict_genbank(gpu_available, threads): """test phold predict with genbank input""" @@ -143,7 +160,6 @@ def test_proteins_compare_pdb(threads): cmd = f"phold proteins-compare -i {input_faa} -o {proteins_compare_pdb_dir} -t {threads} -d {database_dir} --pdb --pdb_dir {pdb_dir} -f" exec_command(cmd) - def test_predict_fasta(gpu_available, threads): """test phold predict with fasta input""" input_fasta: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.fasta" From 523189b3c8a3e1881dbe5310cc2d19cb81201881 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Fri, 22 Mar 2024 18:22:44 +1030 Subject: [PATCH 06/11] issue when increase maxseqs of weighted_counts_normalised being empty in get_topfunctions --- src/phold/results/topfunction.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/phold/results/topfunction.py b/src/phold/results/topfunction.py index 58210f1..d65e34f 100644 --- a/src/phold/results/topfunction.py +++ b/src/phold/results/topfunction.py @@ -174,10 +174,15 @@ def weighted_function(group: pd.DataFrame) -> pd.DataFrame: value / total_functional_bitscore, 3 ) - top_bitscore_function = max( - weighted_counts_normalised, key=weighted_counts_normalised.get - ) - top_bitscore_perc = max(weighted_counts_normalised.values()) + # error where weighted_counts_normalised was empty for maxseqs = 10000 + if weighted_counts_normalised: + top_bitscore_function = max( + weighted_counts_normalised, 
key=weighted_counts_normalised.get + ) + top_bitscore_perc = max(weighted_counts_normalised.values()) + else: + top_bitscore_function = "unknown function" + top_bitscore_perc = 0 d = { "function_with_highest_bitscore_proportion": [top_bitscore_function], From 856cc883120d1089cf40ba268d4f7dfee1b8c0a8 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 25 Mar 2024 21:12:04 +1030 Subject: [PATCH 07/11] fix issue with long header, support efam --- src/phold/__init__.py | 28 +++++++++------------- src/phold/features/create_foldseek_db.py | 5 ++-- src/phold/features/predict_3Di.py | 4 +++- src/phold/features/predict_3Di_finetune.py | 9 ++++--- src/phold/features/query_remote_3Di.py | 4 +++- src/phold/io/handle_genbank.py | 4 ++-- src/phold/plot/plot.py | 12 +++++----- src/phold/results/topfunction.py | 1 - src/phold/subcommands/compare.py | 11 ++++----- src/phold/subcommands/predict.py | 13 +++++++++- tests/conftest.py | 2 +- tests/test_integration.py | 19 +++++++++------ 12 files changed, 63 insertions(+), 49 deletions(-) diff --git a/src/phold/__init__.py b/src/phold/__init__.py index ca9bb97..2ee7819 100755 --- a/src/phold/__init__.py +++ b/src/phold/__init__.py @@ -8,24 +8,18 @@ from loguru import logger from pycirclize.parser import Genbank - -from phold.plot.plot import create_circos_plot - from phold.databases.db import install_database, validate_db from phold.features.create_foldseek_db import generate_foldseek_db_from_aa_3di from phold.features.predict_3Di import get_T5_model from phold.features.query_remote_3Di import query_remote_3di +from phold.plot.plot import create_circos_plot from phold.subcommands.compare import subcommand_compare from phold.subcommands.predict import subcommand_predict from phold.utils.constants import DB_DIR -from phold.utils.util import ( - begin_phold, - clean_up_temporary_files, - end_phold, - get_version, - print_citation, -) -from phold.utils.validation import check_dependencies, instantiate_dirs, validate_input +from 
phold.utils.util import (begin_phold, clean_up_temporary_files, end_phold, + get_version, print_citation) +from phold.utils.validation import (check_dependencies, instantiate_dirs, + validate_input) log_fmt = ( "[{time:YYYY-MM-DD HH:mm:ss}] {level: <8} | " @@ -280,6 +274,8 @@ def run( # validate input fasta_flag, gb_dict = validate_input(input, threads) + print(gb_dict) + # phold predict model_dir = database model_name = "Rostlab/ProstT5_fp16" @@ -898,35 +894,33 @@ def remote( fasta_aa: Path = Path(output) / f"{prefix}_aa.fasta" - # makes the nested dictionary {contig_id:{cds_id: cds_feature}} - + for record_id, record in gb_dict.items(): cds_dict[record_id] = {} for cds_feature in record.features: if cds_feature.type == "CDS": if fasta_flag is False: - cds_feature.qualifiers["translation"] = cds_feature.qualifiers["translation"][0] + cds_feature.qualifiers["translation"] = cds_feature.qualifiers[ + "translation" + ][0] cds_dict[record_id][cds_feature.qualifiers["ID"][0]] = cds_feature else: cds_dict[record_id][cds_feature.qualifiers["ID"]] = cds_feature - ## write the CDS to file # FASTA -> takes the whole thing # Pharokka GBK -> requires just the first entry, the GBK is parsed as a list with open(fasta_aa, "w+") as out_f: for contig_id, rest in cds_dict.items(): - aa_contig_dict = cds_dict[contig_id] # writes the CDS to file for seq_id, cds_feature in aa_contig_dict.items(): out_f.write(f">{contig_id}:{seq_id}\n") out_f.write(f"{cds_feature.qualifiers['translation']}\n") - ############ # prostt5 remote ############ diff --git a/src/phold/features/create_foldseek_db.py b/src/phold/features/create_foldseek_db.py index 8b14c5b..f5c1ba6 100644 --- a/src/phold/features/create_foldseek_db.py +++ b/src/phold/features/create_foldseek_db.py @@ -176,10 +176,9 @@ def generate_foldseek_db_from_pdbs( no_pdb_cds_ids = [] for id in sequences_aa.keys(): - # in case the header has a colon in it - this will cause a bug if so - cds_id = id.split(":")[1:] - cds_id = 
":".join(cds_id).strip() + cds_id = id.split(":")[1:] + cds_id = ":".join(cds_id).strip() # record_id = id.split(":")[0] # this is potentially an issue if a contig has > 9999 AAs diff --git a/src/phold/features/predict_3Di.py b/src/phold/features/predict_3Di.py index 487e420..66391be 100644 --- a/src/phold/features/predict_3Di.py +++ b/src/phold/features/predict_3Di.py @@ -112,7 +112,9 @@ def get_T5_model( device = torch.device("cpu") dev_name = "cpu" if cpu is not True: - logger.warning("No available GPU was found, but --cpu was not specified") + logger.warning( + "No available GPU was found, but --cpu was not specified" + ) logger.warning("ProstT5 will be run with CPU only") # logger device only if the function is called diff --git a/src/phold/features/predict_3Di_finetune.py b/src/phold/features/predict_3Di_finetune.py index 5652f07..8b47f0c 100644 --- a/src/phold/features/predict_3Di_finetune.py +++ b/src/phold/features/predict_3Di_finetune.py @@ -24,10 +24,13 @@ from torch.nn import CrossEntropyLoss from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import DataCollatorForTokenClassification, T5EncoderModel, T5Tokenizer +from transformers import (DataCollatorForTokenClassification, T5EncoderModel, + T5Tokenizer) from transformers.modeling_outputs import TokenClassifierOutput -from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack -from transformers.utils.model_parallel_utils import assert_device_map, get_device_map +from transformers.models.t5.modeling_t5 import (T5Config, T5PreTrainedModel, + T5Stack) +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) from phold.features.predict_3Di import write_predictions from phold.utils.constants import FINETUNE_DIR diff --git a/src/phold/features/query_remote_3Di.py b/src/phold/features/query_remote_3Di.py index 9ae6777..87f3391 100644 --- a/src/phold/features/query_remote_3Di.py +++ b/src/phold/features/query_remote_3Di.py 
@@ -9,7 +9,9 @@ from loguru import logger -def query_remote_3di(cds_dict: Dict[str, dict], fasta_3di: Path, fasta_flag: bool) -> None: +def query_remote_3di( + cds_dict: Dict[str, dict], fasta_3di: Path, fasta_flag: bool +) -> None: """ Query remote Foldseek ProstT5 server for 3Di predictions of amino acid sequences and write to file. diff --git a/src/phold/io/handle_genbank.py b/src/phold/io/handle_genbank.py index 579cf9d..72cc34f 100644 --- a/src/phold/io/handle_genbank.py +++ b/src/phold/io/handle_genbank.py @@ -248,7 +248,7 @@ def write_genbank( try: transl_table = cds_feature.qualifiers["transl_table"][0] except: - # for older pharokka input before v1.5.0 + # for older pharokka input before v1.5.0 transl_table = "11" # to reverse the start and end coordinates for output tsv + fix genbank 0 index start relative to pharokka @@ -267,7 +267,7 @@ def write_genbank( if fasta_flag is True: cds_id = cds_feature.qualifiers["ID"] - else: # because for some reason when parsing the pharokka genbank, it is a list + else: # because for some reason when parsing the pharokka genbank, it is a list cds_id = cds_feature.qualifiers["ID"][0] cds_info = { diff --git a/src/phold/plot/plot.py b/src/phold/plot/plot.py index c6c844e..657901d 100644 --- a/src/phold/plot/plot.py +++ b/src/phold/plot/plot.py @@ -1,16 +1,16 @@ from pathlib import Path -from typing import List, Dict +from typing import Dict, List -from loguru import logger -from pycirclize import Circos -from pycirclize.parser import Genbank -from matplotlib.lines import Line2D import matplotlib.pyplot as plt -from matplotlib.patches import Patch import numpy as np from Bio import SeqUtils from Bio.Seq import Seq from Bio.SeqFeature import SeqFeature +from loguru import logger +from matplotlib.lines import Line2D +from matplotlib.patches import Patch +from pycirclize import Circos +from pycirclize.parser import Genbank def create_circos_plot( diff --git a/src/phold/results/topfunction.py b/src/phold/results/topfunction.py 
index d65e34f..87b0e8b 100644 --- a/src/phold/results/topfunction.py +++ b/src/phold/results/topfunction.py @@ -90,7 +90,6 @@ def get_topfunctions( ) # no need to add it on to protein - already done - foldseek_df["phrog"] = foldseek_df["phrog"].astype("str") # read in the mapping tsv phrog_annot_mapping_tsv: Path = Path(database) / "phold_annots.tsv" diff --git a/src/phold/subcommands/compare.py b/src/phold/subcommands/compare.py index a4ca1f7..df09921 100644 --- a/src/phold/subcommands/compare.py +++ b/src/phold/subcommands/compare.py @@ -10,14 +10,13 @@ from loguru import logger from phold.features.create_foldseek_db import ( - generate_foldseek_db_from_aa_3di, - generate_foldseek_db_from_pdbs, -) + generate_foldseek_db_from_aa_3di, generate_foldseek_db_from_pdbs) from phold.features.run_foldseek import create_result_tsv, run_foldseek_search from phold.features.split_3Di import split_3di_fasta_by_prob from phold.io.handle_genbank import write_genbank from phold.io.sub_db_outputs import create_sub_db_outputs -from phold.results.topfunction import calculate_topfunctions_results, get_topfunctions +from phold.results.topfunction import (calculate_topfunctions_results, + get_topfunctions) def subcommand_compare( @@ -90,7 +89,7 @@ def subcommand_compare( if fasta_flag is False: if cds_feature.type == "CDS": # update DNA, RNA and nucleotide metabolism from pharokka as it is broken as of 1.6.1 - if "DNA" in cds_feature.qualifiers["function"][0] : + if "DNA" in cds_feature.qualifiers["function"][0]: cds_feature.qualifiers["function"][ 0 ] = "DNA, RNA and nucleotide metabolism" @@ -98,7 +97,7 @@ def subcommand_compare( cds_feature.qualifiers["function"][0] ] # Keep only the first element # moron, auxiliary metabolic gene and host takeover as it is broken as of 1.6.1 - if "moron" in cds_feature.qualifiers["function"][0] : + if "moron" in cds_feature.qualifiers["function"][0]: cds_feature.qualifiers["function"][ 0 ] = "moron, auxiliary metabolic gene and host takeover" diff 
--git a/src/phold/subcommands/predict.py b/src/phold/subcommands/predict.py index 3e88db2..4e3f8d6 100644 --- a/src/phold/subcommands/predict.py +++ b/src/phold/subcommands/predict.py @@ -65,9 +65,20 @@ def subcommand_predict( cds_feature.qualifiers["translation"] = cds_feature.qualifiers[ "translation" ][0] + + # for really long CDS IDs (over 54 chars), a space will be introduced + # this is because the ID will go over a second line + # weird bug noticed it on the Mgnify contigs annotated with Pharokka + + cds_id = cds_feature.qualifiers["ID"][0] + if len(cds_id) >= 54: + # Remove all spaces from the string + cds_id = cds_id.replace(" ", "") + cds_dict[record_id][ - cds_feature.qualifiers["ID"][0] + cds_id ] = cds_feature + else: cds_dict[record_id][cds_feature.qualifiers["ID"]] = cds_feature diff --git a/tests/conftest.py b/tests/conftest.py index 9897f3f..9418f4b 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,8 @@ """ + def pytest_addoption(parser): parser.addoption("--gpu_available", action="store_true") parser.addoption("--run_remote", action="store_true") parser.addoption("--threads", action="store", default=1) - diff --git a/tests/test_integration.py b/tests/test_integration.py index 5c80425..3e508d3 100755 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -18,7 +18,6 @@ # import import os import shutil - # import functions import subprocess import sys @@ -39,7 +38,7 @@ output_dir = Path(f"{test_data}/outputs") output_dir.mkdir(parents=True, exist_ok=True) run_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_run_gbk" -run_gbk_pharokka_1_4_1_dir : Path = f"{output_dir}/NC_043029_pharokka1.4.1_gbk" +run_gbk_pharokka_1_4_1_dir: Path = f"{output_dir}/NC_043029_pharokka1.4.1_gbk" run_fasta_dir: Path = f"{output_dir}/combined_truncated_phold_run_fasta" run_fasta_efam_dir: Path = f"{output_dir}/KF_efam_phold_run_fasta" predict_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_predict_gbk" @@ -63,20 +62,22 @@ def 
remove_directory(dir_path): if os.path.exists(dir_path): shutil.rmtree(dir_path) + @pytest.fixture(scope="session") def gpu_available(pytestconfig): return pytestconfig.getoption("gpu_available") + @pytest.fixture(scope="session") def run_remote(pytestconfig): return pytestconfig.getoption("run_remote") + @pytest.fixture(scope="session") def threads(pytestconfig): return pytestconfig.getoption("threads") - def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE): """executes shell command and returns stdout if completes exit code 0 Parameters @@ -108,6 +109,7 @@ def test_run_genbank(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) + def test_run_genbank_old_pharokka(gpu_available, threads): """test phold run with genbank input from pharokka prior to v1.5.0 no transl_table field (#34)""" input_gbk: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.gbk" @@ -116,6 +118,7 @@ def test_run_genbank_old_pharokka(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) + def test_run_fasta(gpu_available, threads): """test phold run with genbank input""" input_fasta: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.fasta" @@ -124,6 +127,7 @@ def test_run_fasta(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) + def test_run_efam(gpu_available, threads): """test phold run with a tophit to efam""" input_fasta: Path = f"{test_data}/KF623293.1_subset_efam.fasta" @@ -132,6 +136,7 @@ def test_run_efam(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) + def test_predict_genbank(gpu_available, threads): """test phold predict with genbank input""" input_gbk: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.gbk" @@ -154,12 +159,14 @@ def test_compare_pdb(threads): cmd = f"phold compare -i {input_gbk} -o {compare_pdb_dir} -t {threads} -d {database_dir} --pdb --pdb_dir {pdb_dir} -f" exec_command(cmd) + def test_proteins_compare_pdb(threads): """test phold proteins-compare with pdbs 
input""" input_faa: Path = f"{test_data}/NC_043029_aa.fasta" cmd = f"phold proteins-compare -i {input_faa} -o {proteins_compare_pdb_dir} -t {threads} -d {database_dir} --pdb --pdb_dir {pdb_dir} -f" exec_command(cmd) + def test_predict_fasta(gpu_available, threads): """test phold predict with fasta input""" input_fasta: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.fasta" @@ -199,7 +206,6 @@ def test_plot(): exec_command(cmd) - def test_remote_genbank(run_remote, threads): """test phold remote with genbank input""" input_gbk: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.gbk" @@ -207,11 +213,10 @@ def test_remote_genbank(run_remote, threads): cmd = f"phold remote -i {input_gbk} -o {remote_gbk_dir} -t {threads} -d {database_dir} -f" exec_command(cmd) + def test_remote_fasta(run_remote, threads): """test phold remote with fasta input""" - input_fasta: Path = ( - f"{test_data}/combined_truncated_acr_defense_vfdb_card.fasta" - ) + input_fasta: Path = f"{test_data}/combined_truncated_acr_defense_vfdb_card.fasta" if run_remote is True: cmd = f"phold remote -i {input_fasta} -o {remote_fasta_dir} -t {threads} -d {database_dir} -f" exec_command(cmd) From 5ef3da844843566e053fe1fcbf47b64867581a38 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 25 Mar 2024 21:43:02 +1030 Subject: [PATCH 08/11] bump version --- pyproject.toml | 2 +- src/phold/utils/VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 32739d5..1630f97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = ["setuptools>=61.0", "wheel>=0.37.1"] [project] # https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ name = "phold" -version = "0.1.3" # change VERSION too +version = "0.1.4" # change VERSION too description = "Phage Annotations using Protein Structures" readme = "README.md" requires-python = ">=3.8, <3.12" diff --git a/src/phold/utils/VERSION 
b/src/phold/utils/VERSION index 7693c96..446ba66 100644 --- a/src/phold/utils/VERSION +++ b/src/phold/utils/VERSION @@ -1 +1 @@ -0.1.3 \ No newline at end of file +0.1.4 \ No newline at end of file From a0ad2193171b863d84a65f974a53a7cf7919a5d3 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 25 Mar 2024 21:44:33 +1030 Subject: [PATCH 09/11] format --- src/phold/__init__.py | 2 -- src/phold/subcommands/predict.py | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/phold/__init__.py b/src/phold/__init__.py index 2ee7819..f1de5ae 100755 --- a/src/phold/__init__.py +++ b/src/phold/__init__.py @@ -274,8 +274,6 @@ def run( # validate input fasta_flag, gb_dict = validate_input(input, threads) - print(gb_dict) - # phold predict model_dir = database model_name = "Rostlab/ProstT5_fp16" diff --git a/src/phold/subcommands/predict.py b/src/phold/subcommands/predict.py index 4e3f8d6..94f0087 100644 --- a/src/phold/subcommands/predict.py +++ b/src/phold/subcommands/predict.py @@ -65,7 +65,7 @@ def subcommand_predict( cds_feature.qualifiers["translation"] = cds_feature.qualifiers[ "translation" ][0] - + # for really long CDS IDs (over 54 chars), a space will be introduced # this is because the ID will go over a second line # weird bug noticed it on the Mgnify contigs annotated with Pharokka @@ -75,9 +75,7 @@ def subcommand_predict( # Remove all spaces from the string cds_id = cds_id.replace(" ", "") - cds_dict[record_id][ - cds_id - ] = cds_feature + cds_dict[record_id][cds_id] = cds_feature else: cds_dict[record_id][cds_feature.qualifiers["ID"]] = cds_feature From 1000ab8ed3b1593ebdb0d58f7f28e85a70637864 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Tue, 26 Mar 2024 10:43:21 +1030 Subject: [PATCH 10/11] add test for long header --- tests/test_data/long_header.gbk | 345 ++++++++++++++++++++++++++++++++ tests/test_integration.py | 8 + 2 files changed, 353 insertions(+) create mode 100644 tests/test_data/long_header.gbk diff --git 
a/tests/test_data/long_header.gbk b/tests/test_data/long_header.gbk new file mode 100644 index 0000000..50f3f74 --- /dev/null +++ b/tests/test_data/long_header.gbk @@ -0,0 +1,345 @@ +LOCUS ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934165 6930 bp DNA linear PHG 26-MAR-2024 +DEFINITION ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934165. +ACCESSION ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934165 +VERSION ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934165 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + CDS 155..652 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0001" + /transl_table=11 + /phrog="624" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0001" + /function="DNA" + /function=" RNA and nucleotide metabolism" + /product="DNA binding protein" + /source="PHANOTATE_1.5.1" + /score="-3041.407595930475" + /phase="0" + /translation="MNIFQTQRCPIESAREHCNIHRNSQLKEGVQMLSTAHRVLDGDLA + DDRLYKVCQPGNRFTKWARESDKNYLWLWSYCNELNNMFIEHSGRSHKSGELLDILKTL + PHNISIGELSPVPIAESSDTVQYIYNIDPVLGHREYLCEKFCEWRERGKKFIFVNEPAW + LV" + CDS 676..1137 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0002" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0002" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-266.87205070817294" + /phase="0" + /translation="MNILTLLFDVFTFWVCLGTLVFFILSVTELFTAHSFYSQSHRFSF + IEGLVVIPVTVLLWPVSLACVFDNVHRVRRDARIRHTLYVDQKRQDLHKRACEFMIARA + SNVAAKSISVGTVTADHLFERTTKLNAKSISAGTVTADHLFERTTKLNR" + CDS 1167..2207 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0003" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0003" + 
/function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-101193536.8871446" + /phase="0" + /translation="MRNVNESQVRAYLLTHGVIEGLKLEGIEIAEIRRYYNEVDNFVSI + LMEYGTMLSNFIDKAQAESGKEVAGVNITLCDSVAPAWLVGRLLESAELNAPSKAQWHY + FSENLANAFYSDKLNAFSNMDVDKQMRPGLEFDCPKVEYSEEAIRYHLDKFGSSLPIGE + VLYYLERLPKSATHINLHDLSFAVYWLKRENGSWALWAKAGKTFHEEFVWIKPDFVDGC + FIDKLVPLRDLQVRKILGSNQEEREMMFLPEEGQSLTFTEQRLDPAIEHETYVAPHLSE + QSVEIKTDRELSKPQITNVHELTTIRIDGAVITVDSLKKTVNIVSEYCDLIVNDQFTVS + GSVSNL" + CDS 2312..2641 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0004" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0004" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-680.994779070939" + /phase="0" + /translation="MQDVNKIVAVLRDPSQAKYYHGDLMEDAADALEDLNEKVEELSVK + PESLSDFADYVTDASSAIKEHASKEESMPKLDLTVSGILKQLRTSVKEITEVIELLERN + RSGYF" + CDS 2641..3042 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0005" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0005" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-138.31224476683337" + /phase="0" + /translation="MAKRIPFDYSTLRERISNAQTLNDLICRVPYASIRDRYCKQDITK + AKTLHDLVRDPKKVQQSVFKLEEKIQFLLCRDWWTLPELASEVGVLKRRIELAVGRINS + SGYIVKRRSRVTQNVAITEYHIISENSHD" + CDS 3035..3496 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0006" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0006" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-277.8357309716578" + /phase="0" + 
/translation="MIKSLLTFALLAMSFCSVAGPWNVAGNFAKTLDTTGYNSEALLFI + NKKGDVSIGFTFYDPDCDGYEAKLKRLPVHLFNGEPIHFKSQCIAEDKIAYIPTYANEG + WSIMRQFRQGEEMVFSALAKRYVYKYNAVGFGEAYDDIFEKSLKYKEDI" + CDS 3496..3681 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0007" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0007" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-25.58925218685028" + /phase="0" + /translation="MLCYKLKSTGERLKLIRKRKQVSMFECIDRPMELCCSGVMAHPRI + VVSDVRYTDVVEECKQ" + CDS 3678..3845 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0008" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0008" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-104.77059019154738" + /phase="0" + /translation="MNVEARIKSTGEIVRIVSSLYGKVEVRHVNRPKIKRDGKMVFVKS + RMLFRDIEFL" + CDS 3886..4503 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0009" + /transl_table=11 + /phrog="2314" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0009" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-58422.1228385253" + /phase="0" + /translation="MHSPIGQLIGKCISWSETKTTSKDKTMATKYYKTNKPEILAQVQR + INNERDALKEAADKVAEQFDAKAMTRTSMHGVSFGGLVLNNYDNVDLFKGRTSPDREDK + HLWTKPDKNNISWPRSSIAGKENKAALKELTEIYNAAVASIPKVEFEPFFEVLGTDWGN + LAFSGLDWFEQDGSIYFVSGLDFSEVATEILGSEYETAERAE" + CDS 4535..5968 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0010" + /transl_table=11 + /phrog="114" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0010" + /function="DNA" 
+ /function=" RNA and nucleotide metabolism" + /product="ATP-dependent DNA ligase" + /source="PHANOTATE_1.5.1" + /score="-975389947.286932" + /phase="0" + /translation="MNSTHAFELIQKLASTSSKKDKEALLKEFWDQAENTTFRYALLKA + LDPFITYGMSKMPKVSPGTEEFNEDTYAMLERLNQRQLTGNAAKQALEIELGRLNFESG + ELLKGIITGKLGAGVTADTVNKVSPGAIFVFKMCLASKFSDLSDKITEKQWADGVKGEV + KADGVRGLFMQIKQFHPVSRNGLPLNSTSDIREEVAQFLAEFACYVKSEIIISEDYDEC + LSDAMNLDCELVEANDVFNDTVGSVRSKDESKAKTIKVKVIDVISQAELEAGKSFYNYK + IRREIMEKFFAEHGSSFPNISLIPCFTFHSEEETYAKFEELKNAGEEGLIIKLDNGFWE + QKRSKGWLKIKDKNSADLVIKSLEEGDANGKYKGLMGAAVCDYRNSKGETVEVKIGGGW + SLQQRAELWSAFTGNPVTYSTTDNGVTTEHITDPEACENPVGWLIEVSYHQETKDGSLR + HPNFVRRRTDKSPDEGQGV" + CDS 6075..6191 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0011" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0011" + /function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-0.0604111446544092" + /phase="0" + /translation="LTTLRLSLPGGYCRGNKAAGAWSQTKSKFTNKPNKDSL" + CDS 6188..6730 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0012" + /transl_table=11 + /phrog="13512" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0012" + /function="transcription regulation" + /product="transcriptional regulator" + /source="PHANOTATE_1.5.1" + /score="-4181.910404944114" + /phase="0" + /translation="MNTISPTSALSLHIIVAVKSAGAKGITGYDIHKSLSDFWSHQQVY + RDCSRLAKSGLLFPTVKNNDGKPDSKVYLFSENAFSDLNNFFVTVMMPFFKEKAHLITE + DYLLAYIEFRKAEVISKVVQEMSDKALLEALIKEEAKWVKLHSEAKQNEALYAHKSSEA + FAKMSMVHYLTGTENEK" + CDS 6720..6929 + /ID="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-cov-3.934 + 165_CDS_0013" + /transl_table=11 + /phrog="No_PHROGs_HMM" + /top_hit="No_MMseqs_PHROG_hit" + /locus_tag="ERZ1022805_ERZ1022805.75-NODE-75-length-3260-co + v-3.934165_CDS_0013" + 
/function="unknown function" + /product="hypothetical protein" + /source="PHANOTATE_1.5.1" + /score="-70.43465220740035" + /phase="0" + /translation="MKNKVVHNIVEMRADQETPDLDVRVTTCRKNEVHPFVNLMLTEKD + KYASVDVDTYQDVYDTALDWDTTDI" +ORIGIN + 1 tgagtgcatt gatggtgtca gaaatcgcca cgataaagag tagcggtcat tatcctttat + 61 aaggggtcaa aactaaccaa agttttaaat tatttgcatt cggttgcgta acagtttaaa + 121 atcatctcat cattcaatca tccagtaggg acttatgaat atttttcaga ctcagcgctg + 181 tcctatcgag tcagctcgcg aacactgcaa catccaccgt aacagccagc ttaaagaagg + 241 cgtacagatg ctttctacgg cgcaccgcgt gcttgacggc gacctagccg atgaccgtct + 301 atataaggtt tgtcagcctg gcaaccgctt tacgaaatgg gcgcgtgagt ctgacaagaa + 361 ttatctgtgg ctgtggtcgt actgcaatga gcttaacaac atgttcattg agcacagcgg + 421 acgaagccat aagtcgggcg agcttttaga tattcttaaa acgttacccc ataacatttc + 481 tattggtgag ctttcgcctg tgcctattgc tgaatcgagt gatacggtgc aatacattta + 541 caacatcgac cctgtgcttg gtcatcgcga atacttgtgc gagaagtttt gcgagtggcg + 601 cgagcgcggt aagaaattta tttttgttaa tgagccagct tggctggttt aatattattt + 661 agagaaggaa caacaatgaa catcctaacc ttactattcg acgtattcac tttctgggta + 721 tgcctcggca ccctagtatt tttcatactg agcgttaccg aactgtttac agcgcatagc + 781 ttttattcac agtcgcatcg attcagcttt atcgaagggc tggtggtaat ccctgtcact + 841 gttctgcttt ggccagtgtc gcttgcttgc gtctttgata atgttcaccg tgttcgacgt + 901 gatgcacgta tcagacatac cctttacgtc gatcagaagc gtcaagattt acacaagcga + 961 gcgtgtgagt ttatgatcgc cagagcttca aatgtggcag caaaaagcat ttctgtcgga + 1021 accgtaacag cagatcatct gtttgaacgt acaacaaaac taaacgcaaa aagcatttct + 1081 gccggaaccg taacagcaga tcatctgttt gaacgtacaa caaaactaaa ccgataaatt + 1141 tttggtctca gaccaaggag aaaaccatgc gaaacgttaa tgaatcacaa gtccgcgctt + 1201 accttcttac tcacggcgtc attgaaggcc tgaagctaga aggtatcgaa attgcggaaa + 1261 tccgccgcta ctacaatgaa gtggacaact ttgtatctat cttgatggag tacggcacta + 1321 tgctgtctaa ctttatcgac aaggctcaag ctgaatcggg caaagaggtt gcaggcgtta + 1381 acatcacgct ttgtgatagc gtcgctcctg cttggttggt tggtcgccta ttagagtcgg + 1441 cggagctgaa cgcaccttca 
aaagcgcagt ggcactattt ttctgagaac ctggctaacg + 1501 ctttctattc tgacaagttg aatgcgttca gtaatatgga cgttgataag cagatgcgtc + 1561 ctggtcttga gtttgattgt cctaaggtcg aatatagtga ggaagctatt cgttaccatt + 1621 tggataagtt tggaagtagt ttaccaattg gcgaagttct atactatctg gaaaggcttc + 1681 caaaatcagc cactcacatc aatcttcacg acctgtcttt tgcagtctat tggctaaaac + 1741 gcgagaacgg atcgtgggcg ctttgggcta aagctgggaa aacttttcac gaagaattcg + 1801 tttggattaa gcctgacttt gttgatggct gttttattga caagctcgta ccgcttcgtg + 1861 acctacaagt ccgtaaaatc ctaggcagca accaggaaga gcgcgagatg atgtttttgc + 1921 cagaagaagg tcaatctcta acttttactg aacagagact agacccagct atcgaacatg + 1981 aaacctacgt tgctccacat ttgtccgagc agtcagtaga aatcaaaaca gatcgagagc + 2041 tcagcaagcc tcaaatcacc aacgttcacg agctgaccac tattcgaatt gatggcgcgg + 2101 tgatcaccgt tgactcgctt aaaaagaccg tcaacatcgt tagcgaatat tgcgacctaa + 2161 tcgttaacga tcagtttact gtgtcaggta gcgtatcaaa cttgtagtag gcataattaa + 2221 ataatataat gattccatca ggtgagagat tgcttgatgg gatttttggt ctgagaccaa + 2281 accacaataa caatccagta aggacttaat catgcaagac gttaataaga ttgtcgccgt + 2341 attgcgcgac cctagccagg ccaagtatta ccacggcgat ttaatggaag acgcggctga + 2401 cgcgttggaa gatttgaacg agaaagtaga agagctttct gtcaaacctg agtcgttgtc + 2461 agatttcgct gactacgtga cagacgcttc ttcagccatt aaagagcatg cttctaaaga + 2521 agagagtatg ccaaagcttg atctaacggt tagcggcatc ctgaagcagc tcagaacttc + 2581 agttaaagag ataactgaag tgatcgagtt attggaaaga aatagatcgg ggtatttcta + 2641 atggctaaac gtatcccatt cgactactcg actctaagag agcgcatcag taacgctcaa + 2701 accctgaacg accttatctg tcgagtaccg tacgcgtcta ttcgtgatcg ttactgcaag + 2761 caggacatca ccaaggctaa gactctacac gacctagtca gagaccctaa gaaagtccaa + 2821 cagtccgtgt ttaagctcga agaaaagatc cagtttctat tgtgccgtga ctggtggacg + 2881 ctccctgaac tggcttcaga agttggcgtg cttaaacgtc gaatcgagtt agcggtagga + 2941 cgaattaact cttcaggtta tatcgtcaaa cgacgcagcc gagtaaccca gaacgttgca + 3001 attaccgaat atcatattat tagcgagaat tcccatgatt aagtcactat taacttttgc + 3061 cttattagcg atgtctttct gttctgtggc tggcccatgg 
aatgttgctg gaaactttgc + 3121 caaaactcta gacaccaccg gatacaacag cgaagcgctt ttgtttatca acaagaaagg + 3181 cgacgtctct attggcttca ccttctacga ccctgactgt gacggttacg aagcgaaact + 3241 caaacgtctt cctgtgcact tattcaacgg tgagccgatt cacttcaaat ctcagtgcat + 3301 cgcagaagac aagatcgctt acattcctac ttacgctaac gaaggttgga gtattatgag + 3361 acagtttaga caaggcgaag agatggtgtt ctctgccctt gctaaacgct acgtttacaa + 3421 atacaacgct gtaggctttg gtgaagccta tgacgacatc tttgaaaaat ccctcaagta + 3481 taaggaagat atctgatgct ttgctacaaa ttgaagtcaa ccggagaacg tctgaagctt + 3541 attcgcaagc gcaagcaggt cagcatgttc gagtgcattg accgccctat ggaactgtgc + 3601 tgcagcggcg ttatggcaca ccctcgcatc gtggtgagcg atgttcgtta taccgatgtt + 3661 gtggaggaat gtaagcaatg aacgtagaag ccagaatcaa atccacagga gagattgtcc + 3721 gtattgtctc gtctttgtac ggcaaagtgg aagtgcgcca cgtaaatcgc cctaaaatta + 3781 agcgcgacgg caaaatggtt tttgtcaaaa gtcgcatgtt gtttagagat attgagttcc + 3841 tgtaatctcg cgtcacgttt gcgtcttcta tcaaggaggc gcaaaatgca ctcaccgatt + 3901 ggacaactaa tcggtaagtg catttcttgg tctgagacca aaacaaccag taaggacaaa + 3961 acaatggcaa cgaaatacta caaaactaac aaacctgaaa tcctggctca ggtgcagcgc + 4021 atcaacaacg agcgtgacgc gcttaaagag gctgcagata aagtggctga acagttcgac + 4081 gctaaggcaa tgactagaac cagcatgcat ggtgtgagct tcggcggttt ggtccttaat + 4141 aactacgaca acgtagatct ttttaaaggt cgcacttcac ctgatcgaga agataaacat + 4201 ctgtggacta agccagataa aaacaacatt agttggccgc gatcttctat cgctggcaaa + 4261 gagaacaaag cagcacttaa agagctgact gagatctaca atgccgcagt agcttcaatc + 4321 cctaaagtgg agtttgaacc gttctttgaa gtgcttggca ctgactgggg caaccttgct + 4381 ttttcagggc tcgactggtt cgaacaggac ggctccatct acttcgtatc tggcttagat + 4441 ttcagtgaag tagctactga aattttaggc tctgaatacg aaacggcaga acgagctgaa + 4501 taattactta acccaaaaca aggaaaataa aaccatgaat tcaacccacg ctttcgagct + 4561 tatccaaaaa cttgcttcta cttcttctaa aaaagacaaa gaggctctac ttaaagaatt + 4621 ttgggaccaa gctgagaaca ccacgttccg ctacgccctt ctaaaagcgc ttgacccgtt + 4681 catcacctac ggcatgagca agatgcctaa agtgagccca ggcactgaag agttcaacga + 4741 
agacacctac gccatgcttg agcgtcttaa ccaacgccag ctcacaggca acgcggctaa + 4801 gcaagcgctt gaaattgaac ttggccgcct taacttcgag tctggagaac tgctgaaagg + 4861 tattatcact ggcaaacttg gtgctggcgt gactgctgac acggttaaca aggtttctcc + 4921 tggagccatc ttcgtgttta agatgtgttt ggcttctaag ttctctgact tgtcagacaa + 4981 gatcaccgag aaacagtggg ctgacggcgt aaaaggcgaa gttaaagctg acggcgttcg + 5041 cggtctgttt atgcagatca agcagttcca cccagtaagc cgtaacggtc tgcctcttaa + 5101 ctctacttca gatattcgcg aagaagttgc acagttcctt gctgagtttg cctgctacgt + 5161 taagtcagaa atcatcatca gtgaagacta tgacgagtgt ctcagcgatg ccatgaacct + 5221 agactgcgag ttggtagaag ctaacgacgt attcaacgac acagtaggca gtgtgcgcag + 5281 caaggacgag tccaaagcca aaactattaa ggttaaggtt atcgacgtaa tcagtcaggc + 5341 tgagctagaa gcaggcaagt ctttctataa ctacaaaatt cgtcgcgaga tcatggaaaa + 5401 gttcttcgct gaacacggta gctcattccc taacatttcg cttatccctt gcttcacttt + 5461 ccacagtgaa gaagagactt acgccaagtt tgaagagttg aagaacgctg gcgaagaagg + 5521 gcttattatt aagctagaca acggcttctg ggaacaaaaa cgaagcaagg gctggttaaa + 5581 gatcaaagac aagaactcag ccgatctagt cattaagtct ctagaagaag gtgacgctaa + 5641 cggcaagtat aaagggctca tgggcgcagc agtctgtgac tacagaaact ctaaaggcga + 5701 gactgtcgaa gttaaaatag gcggtggttg gtctctacaa cagcgcgctg agctgtggtc + 5761 tgcattcact ggaaaccctg ttacttacag cactacagac aacggtgtga ctaccgagca + 5821 cattacagac cctgaagcgt gcgaaaaccc tgtaggttgg ctgatcgaag tctcttacca + 5881 ccaggagact aaggacggtt cgcttcgtca ccctaacttt gttcgccgtc gtactgataa + 5941 aagccctgat gaaggtcaag gcgtgtaaat aaccaactgg gtcagcgtca aaattgcgct + 6001 gaccctacgt tactgtaaac tgacaacact cagagtttgt ttttgtctga ggctaagggg + 6061 gctgagctgt cgcattgact acattacggc tcagtctgcc cggtggctat tgtaggggga + 6121 acaaagcggc cggagcttgg tctcagacca aaagcaaatt taccaacaaa cccaataagg + 6181 actcactatg aatactatca gcccaacttc tgcgctatct cttcacatca tcgttgctgt + 6241 taaatctgct ggcgccaaag gcatcactgg ttacgacatt cacaagtcgc tttctgactt + 6301 ctggtcacac cagcaagttt accgtgactg tagccgactg gctaaatctg gtttgttatt + 6361 cccgaccgtc aagaacaacg 
acggcaagcc tgactctaag gtctatctat tctcagagaa + 6421 cgccttctct gaccttaaca acttcttcgt taccgttatg atgcctttct tcaaagaaaa + 6481 ggcgcatctg attaccgaag actatctttt ggcgtatatt gagtttcgca aggctgaagt + 6541 tatcagcaaa gtagttcagg aaatgtctga taaagcgctt cttgaagcac taatcaaaga + 6601 agaggctaaa tgggttaaac ttcactctga agccaaacaa aacgaagctt tgtacgcaca + 6661 caagtctagt gaggcgtttg caaagatgtc tatggttcac tacctaactg ggactgaaaa + 6721 tgaaaaataa agtcgttcac aatatcgtag aaatgcgcgc agatcaggaa actccagacc + 6781 tagacgtgcg cgtaaccact tgtcgaaaaa atgaggttca cccatttgtt aaccttatgc + 6841 tgacggagaa agataagtac gcgtcagtcg acgtagacac ctatcaagac gtgtatgata + 6901 cggcgcttga ttgggatact actgacatcg +// diff --git a/tests/test_integration.py b/tests/test_integration.py index 3e508d3..5f93c90 100755 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -39,6 +39,7 @@ output_dir.mkdir(parents=True, exist_ok=True) run_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_run_gbk" run_gbk_pharokka_1_4_1_dir: Path = f"{output_dir}/NC_043029_pharokka1.4.1_gbk" +run_gbk_long_header_dir: Path = f"{output_dir}/long_header_gbk" run_fasta_dir: Path = f"{output_dir}/combined_truncated_phold_run_fasta" run_fasta_efam_dir: Path = f"{output_dir}/KF_efam_phold_run_fasta" predict_gbk_dir: Path = f"{output_dir}/combined_truncated_phold_predict_gbk" @@ -118,6 +119,13 @@ def test_run_genbank_old_pharokka(gpu_available, threads): cmd = f"{cmd} --cpu" exec_command(cmd) +def test_run_genbank_long_header(gpu_available, threads): + """test phold run with pharokka genbank with large header/locus tag (over 54 chars)""" + input_gbk: Path = f"{test_data}/long_header.gbk" + cmd = f"phold run -i {input_gbk} -o {run_gbk_long_header_dir} -t {threads} -d {database_dir} -f" + if gpu_available is False: + cmd = f"{cmd} --cpu" + exec_command(cmd) def test_run_fasta(gpu_available, threads): """test phold run with genbank input""" From 3e78d3362af4c7100a4889c60485e9f2b42257fc Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Tue, 
26 Mar 2024 10:49:06 +1030 Subject: [PATCH 11/11] v0.1.4 history --- HISTORY.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index e184333..4e76758 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,12 @@ # History +0.1.4 (2024-03-26) +------------------ + +* Fixes #31 issue with older Pharokka genbank input (prior to v1.5.0) that lacked 'transl_table' field + * All Pharokka genbank input prior to v1.5.0 will be transl_table 11 (it is before pyrodigal-gv was added) +* Fixes genbank parsing bug that would occur if the ID/locus tag of the features in the input genbank were longer than 54 characters + 0.1.3 (2024-03-19) ------------------