Skip to content

Commit

Permalink
Added support for genetic codes 2-3, 5-6, 9-10, 12-16, 21-25. The CreateSchema and AlleleCall modules will use the translation table used to create the training file (values passed to --t, --translation-table are ignored if a training file is used).
Browse files Browse the repository at this point in the history
  • Loading branch information
rfm-targa committed Jun 28, 2024
1 parent 5e2b903 commit f7ca591
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 39 deletions.
2 changes: 1 addition & 1 deletion CHEWBBACA/SchemaEvaluator/evaluate_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def main(schema_directory, output_directory, genes_list, annotations,
annotations : str
Path to a TSV file created by the UniprotFinder module.
translation_table : int
Genetic code used to translate the alleles.
Genetic code used to translate alleles.
size_threshold : float
Allele size variation threshold. Used to determine if the size
of an allele is within the interval of the locus size mode +/-
Expand Down
63 changes: 47 additions & 16 deletions CHEWBBACA/chewBBACA.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from ExtractCgMLST import determine_cgmlst
from utils import (join_profiles,
remove_genes,
gene_prediction as gp,
# profiles_sqlitedb as ps,
process_datetime as pdt,
constants as ct,
Expand All @@ -48,6 +49,7 @@
from CHEWBBACA.ExtractCgMLST import determine_cgmlst
from CHEWBBACA.utils import (join_profiles,
remove_genes,
gene_prediction as gp,
# profiles_sqlitedb as ps,
process_datetime as pdt,
constants as ct,
Expand Down Expand Up @@ -102,7 +104,10 @@ def msg(name=None):
parser.add_argument('--ptf', '--training-file', type=str,
required=False, dest='ptf_path',
help='Path to the Prodigal training file used by Pyrodigal '
'to predict genes and added to the schema folder.')
'to predict genes. The translation table used to create '
'this file overrides any value passed to `--t`, '
'`--translation-table`. This file is copied '
'to the schema folder to be used for allele calling.')

parser.add_argument('--bsr', '--blast-score-ratio', type=pv.bsr_type,
required=False, default=ct.DEFAULT_BSR,
Expand All @@ -118,9 +123,11 @@ def msg(name=None):
'sequences (CDSs) shorter than this value are excluded.')

parser.add_argument('--t', '--translation-table', type=pv.translation_table_type,
required=False, default=11, dest='translation_table',
required=False, dest='translation_table',
help='Genetic code used to predict genes and'
' to translate coding sequences (CDSs).')
' to translate coding DNA sequences (CDSs). '
'This value is ignored if a valid training file '
'is passed to `--ptf`, `--training-file`.')

parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type,
required=False, default=0.2, dest='size_threshold',
Expand Down Expand Up @@ -167,10 +174,21 @@ def msg(name=None):
args = parser.parse_args()
del args.CreateSchema

# Check if PTF exists
if args.ptf_path is not None:
if os.path.isfile(args.ptf_path) is False:
# Check if user passed PTF
if args.ptf_path:
# Check if PTF exists
if not os.path.isfile(args.ptf_path):
sys.exit(ct.INVALID_PTF_PATH)
else:
# Get translation table used to create training file
ptf_table = gp.read_training_file(args.ptf_path).translation_table
args.translation_table = ptf_table
else:
if not args.translation_table:
args.translation_table = ct.GENETIC_CODES_DEFAULT

# Check if translation table is supported
pv.translation_table_type([args.translation_table])

# Create output directory
created = fo.create_directory(args.output_directory)
Expand Down Expand Up @@ -261,7 +279,9 @@ def msg(name=None):
required=False, dest='ptf_path',
help='Path to the Prodigal training file used by Pyrodigal '
'to predict genes. Default is to use the training file '
'included in the schema\'s directory.')
'included in the schema\'s directory. The translation '
'table used to create this file overrides any value '
'passed to `--t`, `--translation-table`.')

parser.add_argument('--gl', '--genes-list', type=str,
required=False, default=False, dest='genes_list',
Expand All @@ -287,9 +307,8 @@ def msg(name=None):
parser.add_argument('--t', '--translation-table', type=pv.translation_table_type,
required=False, dest='translation_table',
help='Genetic code used to predict genes and'
' to translate coding sequences. Must match '
'the genetic code used to create the training '
'file.')
' to translate coding DNA sequences (CDSs). '
'This value will be ignored if a training file is used.')

parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type,
required=False, dest='size_threshold',
Expand Down Expand Up @@ -886,7 +905,9 @@ def msg(name=None):
required=False, dest='ptf_path',
help='Path to the Prodigal training file that '
'will be included in the directory of the '
'adapted schema.')
'adapted schema. The translation table used to create '
'this file overrides any value passed to `--t`, '
'`--translation-table`.')

parser.add_argument('--bsr', '--blast-score-ratio', type=pv.bsr_type,
required=False, default=ct.DEFAULT_BSR,
Expand All @@ -909,8 +930,10 @@ def msg(name=None):

parser.add_argument('--t', '--translation-table',
type=pv.translation_table_type, required=False,
default=11, dest='translation_table',
help='Genetic code used for allele translation.')
dest='translation_table',
help='Genetic code used for allele translation. This '
'value is ignored if a valid training file '
'is passed to `--ptf`, `--training-file`.')

parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type,
required=False, default=ct.SIZE_THRESHOLD_DEFAULT,
Expand Down Expand Up @@ -944,10 +967,18 @@ def msg(name=None):
args = parser.parse_args()
del args.PrepExternalSchema

# Check if PTF exists
if args.ptf_path is not None:
if os.path.isfile(args.ptf_path) is False:
# Check if user passed PTF
if args.ptf_path:
# Check if PTF exists
if not os.path.isfile(args.ptf_path):
sys.exit(ct.INVALID_PTF_PATH)
else:
# Get translation table used to create training file
ptf_table = gp.read_training_file(args.ptf_path).translation_table
args.translation_table = ptf_table
else:
if not args.translation_table:
args.translation_table = ct.GENETIC_CODES_DEFAULT

# Define output paths
schema_path = os.path.abspath(args.output_directory)
Expand Down
24 changes: 19 additions & 5 deletions CHEWBBACA/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,25 @@
INTRA_CLUSTER_DEFAULT = 0.9

# Genetic codes/translation tables
GENETIC_CODES = {1: 'Standard',
4: 'The mold, protozoan, and coelenterate mitochondrial '
'code and the mycoplasma/spiroplasma code',
11: 'The Bacterial, Archaeal and Plant Plastid code',
25: 'Candidate division SR1 and gracilibacteria code'}
GENETIC_CODES = {1: 'The Standard Code',
2: 'The Vertebrate Mitochondrial Code',
3: 'The Yeast Mitochondrial Code',
4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code',
5: 'The Invertebrate Mitochondrial Code',
6: 'The Ciliate, Dasycladacean and Hexamita Nuclear Code',
9: 'The Echinoderm and Flatworm Mitochondrial Code',
10: 'The Euplotid Nuclear Code',
11: 'The Bacterial, Archaeal and Plant Plastid Code',
12: 'The Alternative Yeast Nuclear Code',
13: 'The Ascidian Mitochondrial Code',
14: 'The Alternative Flatworm Mitochondrial Code',
15: 'Blepharisma Nuclear Code',
16: 'Chlorophycean Mitochondrial Code',
21: 'Trematode Mitochondrial Code',
22: 'Scenedesmus obliquus Mitochondrial Code',
23: 'Thraustochytrium Mitochondrial Code',
24: 'Rhabdopleuridae Mitochondrial Code',
25: 'Candidate Division SR1 and Gracilibacteria Code'}

GENETIC_CODES_DEFAULT = 11

Expand Down
47 changes: 30 additions & 17 deletions CHEWBBACA/utils/parameters_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@

try:
from utils import (constants as ct,
gene_prediction as gp,
file_operations as fo,
chewiens_requests as cr,
fasta_operations as fao,
iterables_manipulation as im)
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
gene_prediction as gp,
file_operations as fo,
chewiens_requests as cr,
fasta_operations as fao,
Expand Down Expand Up @@ -242,7 +244,7 @@ def translation_table_type(arg, genetic_codes=ct.GENETIC_CODES):
valid = False

if valid is False:
# format available genetic codes into list
# Format available genetic codes into list
lines = ['\t{0}: {1}'.format(k, v) for k, v in genetic_codes.items()]
gc_table = '\n{0}\n'.format('\n'.join(lines))

Expand Down Expand Up @@ -798,7 +800,6 @@ def validate_ptf(ptf_path, schema_directory, schema_ptfs, force_continue):
True if the training file does not match any of
the training files previously used with the schema.
"""

ptf_path = validate_ptf_path(ptf_path, schema_directory)

# Determine PTF checksum
Expand Down Expand Up @@ -855,26 +856,21 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio,
Dictionary with the arguments validated values that
will be used for allele calling.
"""

# run parameters values
# Parameter values for current run
run_params = {'bsr': blast_score_ratio,
'translation_table': translation_table,
#'translation_table': translation_table,
'minimum_locus_length': minimum_length,
'size_threshold': size_threshold}

# determine user provided arguments values that differ from default
# Determine user provided values that differ from default
unmatch_params = {k: v
for k, v in run_params.items()
if v not in schema_params[k] and v is not None}
# determine arguments values not provided by user
default_params = {k: schema_params[k][0]
for k, v in run_params.items()
if v is None}

# update arguments for current run
for k in run_params:
if k in default_params:
run_params[k] = default_params[k]
# Update run values equal to None
for k, v in run_params.items():
if v is None:
run_params[k] = schema_params[k][0]

if len(unmatch_params) > 0:
print(ct.ARGS_DIFFER)
Expand All @@ -892,11 +888,11 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio,
if params_answer.lower() not in ['y', 'yes']:
sys.exit('Exited.')
else:
# Append new argument values to configs values
# Append new argument values to config values
for p in unmatch_params:
schema_params[p].append(unmatch_params[p])

# default is to get the training file in schema directory
# Default is to get the training file in schema directory
schema_ptfs = schema_params['prodigal_training_file']
ptf_path, ptf_hash, unmatch = validate_ptf(ptf_path, schema_directory,
schema_ptfs, force_continue)
Expand All @@ -906,7 +902,24 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio,
schema_params['prodigal_training_file'].append(ptf_hash)
unmatch_params['prodigal_training_file'] = ptf_hash

# save updated schema config file
# Update translation table
if ptf_path:
# Get translation table used to create training file
ptf_table = gp.read_training_file(ptf_path).translation_table
run_params['translation_table'] = ptf_table
if ptf_table not in schema_params['translation_table']:
schema_params['translation_table'].append(ptf_table)
unmatch_params['translation_table'] = ptf_table
else:
if not translation_table:
run_params['translation_table'] = schema_params['translation_table'][0]
else:
run_params['translation_table'] = translation_table
if translation_table not in schema_params['translation_table']:
schema_params['translation_table'].append(ptf_table)
unmatch_params['translation_table'] = ptf_table

# Update schema config file
if len(unmatch_params) > 0:
fo.pickle_dumper(schema_params, config_file)

Expand Down

0 comments on commit f7ca591

Please sign in to comment.