diff --git a/.github/workflows/black_lint.yml b/.github/workflows/black_lint.yml new file mode 100644 index 00000000..b2cd244f --- /dev/null +++ b/.github/workflows/black_lint.yml @@ -0,0 +1,10 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: psf/black@stable \ No newline at end of file diff --git a/VERSION b/VERSION index eca07e4c..ccbccc3d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.2 +2.2.0 diff --git a/docs/conf.py b/docs/conf.py index 7ec252d2..68a493c5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,12 +14,14 @@ # -- Project information ----------------------------------------------------- -project = 'PPanGGOLiN' -copyright = '2023, LABGeM' -author = 'Jérôme Arnoux' +project = "PPanGGOLiN" +copyright = "2023, LABGeM" +author = "Jérôme Arnoux" # The full version, including alpha/beta/rc tags -release = open(Path(__file__).resolve().parents[1]/"VERSION").read().rstrip() # Get release number in the VERSION file +release = ( + open(Path(__file__).resolve().parents[1] / "VERSION").read().rstrip() +) # Get release number in the VERSION file # -- General configuration --------------------------------------------------- @@ -33,21 +35,19 @@ "sphinx.ext.duration", "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", - 'sphinx_search.extension', - 'sphinxcontrib.mermaid' + "sphinx_search.extension", + "sphinxcontrib.mermaid", ] -source_suffix = { - '.md': 'markdown' -} +source_suffix = {".md": "markdown"} # Prefix document path to section labels, to use: # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -60,9 +60,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ["_static"] diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index a54c6e48..f883c621 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -8,17 +8,24 @@ If you have ideas for new features or improvements, initiating a discussion in a For minor changes like fixing typos or making small edits, feel free to create a new Pull Request (PR) directly with your proposed changes. + ## Setting Up the Development Environment -1. **Fork the Repository:** Start by forking the repository to your GitHub account. 🍴 +1. **Fork the Repository:** + Start by forking the repository to your GitHub account. 🍴 -2. **Clone the Forked Repository:** Clone your forked repository to your local machine. +2. **Clone the Forked Repository:** + Clone your forked repository to your local machine. 3. **Get an Environment:** Create an environment with all PPanGGOLiN prerequisites installed. For that, you can follow installation instructions [here](../user/install.md#installing-from-source-code-github). -4. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate changes for the upcoming release. +4. **Branch from 'dev':** + Begin your changes from the 'dev' branch, where we incorporate changes for the upcoming release. + + +5. **Install in Editable Mode:** -5. **Install in Editable Mode:** To enable seamless code editing and testing of new functionality, install PPanGGOLiN in editable mode using the following command: + To enable code editing and testing of new functionality, you can install PPanGGOLiN in editable mode using the following command: ```bash pip install -e . @@ -26,17 +33,22 @@ For minor changes like fixing typos or making small edits, feel free to create a This allows you to modify the code and experiment with new features directly. - ```{note} - Note: Currently, we are not utilizing any auto formatters (like autopep8 or black). Kindly refrain from using them, as it could introduce extensive changes across the project, making code review challenging for us. +6. **Apply Code Formatting with Black:** + We have integrated [Black](https://github.com/psf/black) as our code formatter to maintain consistent styling. Code changes are automatically checked via a GitHub Action in our CI, so **ensure your code is formatted with Black before committing**. + + + ```{tip} + Integrate Black with your IDE to automatically format your changes and avoid formatting-related CI failures. ``` + ## Making Your Changes -We encourage consistency in code formatting; when adding new code, try to follow the existing code structure as closely as possible. Functions should include descriptive docstrings explaining their purpose and detailing the parameters. Ensure that argument types are specified in the function definitions. +Keep it consistent! Match the existing code style, add docstrings to describe functions, and specify argument types. ## Update Documentation -It's essential to update the documentation to reflect your changes. Provide clear descriptions and, if necessary, examples of commands and their respective outputs. +Update docs to reflect changes—clear descriptions and examples are always helpful! ## Tests diff --git a/ppanggolin/RGP/__init__.py b/ppanggolin/RGP/__init__.py index 61883771..50f2feb3 100644 --- a/ppanggolin/RGP/__init__.py +++ b/ppanggolin/RGP/__init__.py @@ -1,3 +1,3 @@ from .genomicIsland import subparser, launch from .spot import * -from . import rgp_cluster \ No newline at end of file +from . import rgp_cluster diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index ec268f89..e7d8918d 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -33,13 +33,15 @@ def changes(self, score): def extract_rgp(contig, node, rgp_id, naming) -> Region: """ - Extract the region from the given starting node + Extract the region from the given starting node """ new_region = None if naming == "contig": new_region = Region(contig.name + "_RGP_" + str(rgp_id)) elif naming == "organism": - new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id)) + new_region = Region( + node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id) + ) while node.state: new_region.add(node.gene) node.state = 0 @@ -52,7 +54,7 @@ def extract_rgp(contig, node, rgp_id, naming) -> Region: def rewrite_matrix(contig, matrix, index, persistent, continuity, multi): """ - ReWrite the matrice from the given index of the node that started a region. + ReWrite the matrice from the given index of the node that started a region. """ prev = matrix[index] index += 1 @@ -63,7 +65,10 @@ def rewrite_matrix(contig, matrix, index, persistent, continuity, multi): next_node = matrix[index] nb_perc = 0 while next_node.state: # while the old state is not 0, recompute the scores. - if next_node.gene.family.named_partition == "persistent" and next_node.gene.family not in multi: + if ( + next_node.gene.family.named_partition == "persistent" + and next_node.gene.family not in multi + ): modif = -pow(persistent, nb_perc) nb_perc += 1 else: @@ -85,7 +90,9 @@ def rewrite_matrix(contig, matrix, index, persistent, continuity, multi): next_node = matrix[index] -def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, variable_gain: int = 1) -> list: +def init_matrices( + contig: Contig, multi: set, persistent_penalty: int = 3, variable_gain: int = 1 +) -> list: """ Initialize the vector of score/state nodes @@ -119,10 +126,12 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia if prev.state == 0: zero_ind = prev mat.append(prev) - logging.getLogger("PPanGGOLiN").debug(f"gene:{gene.ID};zero_ind:{zero_ind};curr_state:{curr_state};curr_score:{curr_score}.") + logging.getLogger("PPanGGOLiN").debug( + f"gene:{gene.ID};zero_ind:{zero_ind};curr_state:{curr_state};curr_score:{curr_score}." + ) if zero_ind is None: - zero_ind = prev#don't go further than the current node, if no node were at 0. + zero_ind = prev # don't go further than the current node, if no node were at 0. # if the contig is circular, and we're in a rgp state, # we need to continue from the "starting" gene until we leave rgp state. @@ -135,11 +144,16 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia mat_node = mat[c] if mat_node == zero_ind: # then we've parsed the entire contig twice. - logging.getLogger("PPanGGOLiN").debug(f"{contig.name} was parsed entirely twice.") + logging.getLogger("PPanGGOLiN").debug( + f"{contig.name} was parsed entirely twice." + ) # The whole sequence is a rgp, so we're stopping the iteration now, otherwise we'll loop indefinitely break - if mat_node.gene.family.named_partition == "persistent" and mat_node.gene.family not in multi: + if ( + mat_node.gene.family.named_partition == "persistent" + and mat_node.gene.family not in multi + ): modif = -pow(persistent_penalty, nb_perc) nb_perc += 1 else: @@ -149,13 +163,23 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia curr_score = modif + mat_node.prev.score curr_state = 1 if curr_score >= 0 else 0 mat_node.changes(curr_score) - logging.getLogger("PPanGGOLiN").debug(f"gene:{mat_node.gene.ID};curr_state:{curr_state};curr_score:{curr_score}.") + logging.getLogger("PPanGGOLiN").debug( + f"gene:{mat_node.gene.ID};curr_state:{curr_state};curr_score:{curr_score}." + ) c += 1 return mat -def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, min_score: int = 4, - persistent: int = 3, continuity: int = 1, naming: str = "contig") -> Set[Region]: +def mk_regions( + contig: Contig, + matrix: list, + multi: set, + min_length: int = 3000, + min_score: int = 4, + persistent: int = 3, + continuity: int = 1, + naming: str = "contig", +) -> Set[Region]: """ Processing matrix and 'emptying' it to get the regions. @@ -183,7 +207,9 @@ def max_index_node(lst): max_index = idx return max_score, max_index else: - raise TypeError(f"List of matriceNode is expected. The detected type was {type(lst)}") + raise TypeError( + f"List of matriceNode is expected. The detected type was {type(lst)}" + ) contig_regions = set() val, index = max_index_node(matrix) @@ -197,9 +223,16 @@ def max_index_node(lst): return contig_regions -def compute_org_rgp( organism: Organism, multigenics: set, - persistent_penalty: int = 3, variable_gain: int = 1, min_length: int = 3000, min_score: int = 4, - naming: str = "contig", disable_bar: bool = True ) -> set: +def compute_org_rgp( + organism: Organism, + multigenics: set, + persistent_penalty: int = 3, + variable_gain: int = 1, + min_length: int = 3000, + min_score: int = 4, + naming: str = "contig", + disable_bar: bool = True, +) -> set: """ Compute regions of genomic plasticity (RGP) on the given organism based on the provided parameters. @@ -211,14 +244,21 @@ def compute_org_rgp( organism: Organism, multigenics: set, :param min_score: Minimum score threshold for considering a region as RGP (default: 4). :param naming: Naming scheme for the regions, either "contig" or "organism" (default: "contig"). :param disable_bar: Whether to disable the progress bar. It is recommended to disable it when calling this function in a loop on multiple organisms (default: True). - + :return: A set of RGPs of the provided organism. """ org_regions = set() - for contig in tqdm(organism.contigs, total=organism.number_of_contigs, unit="contig", disable=disable_bar): + for contig in tqdm( + organism.contigs, + total=organism.number_of_contigs, + unit="contig", + disable=disable_bar, + ): if contig.number_of_genes != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... - matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) + matrix = init_matrices( + contig, multigenics, persistent_penalty, variable_gain + ) org_regions |= mk_regions( contig, matrix, @@ -227,7 +267,7 @@ def compute_org_rgp( organism: Organism, multigenics: set, min_score, persistent_penalty, variable_gain, - naming=naming + naming=naming, ) return org_regions @@ -245,29 +285,41 @@ def naming_scheme(organisms: Iterable[Organism]) -> str: oldlen = len(contigsids) contigsids.add(contig.name) if oldlen == len(contigsids): - logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your " - "assemblies. Identifiers will be supplemented with your " - "provided organism names.") + logging.getLogger("PPanGGOLiN").warning( + "You have contigs with identical identifiers in your " + "assemblies. Identifiers will be supplemented with your " + "provided organism names." + ) return "organism" return "contig" def check_pangenome_former_rgp(pangenome: Pangenome, force: bool = False): - """ checks pangenome status and .h5 files for former rgp, delete them if allowed or raise an error + """checks pangenome status and .h5 files for former rgp, delete them if allowed or raise an error :param pangenome: Pangenome object :param force: Allow to force write on Pangenome file """ if pangenome.status["predictedRGP"] == "inFile" and not force: - raise Exception("You are trying to predict RGPs in a pangenome that already have them predicted. " - "If you REALLY want to do that, use --force " - "(it will erase RGPs and every feature computed from them).") + raise Exception( + "You are trying to predict RGPs in a pangenome that already have them predicted. " + "If you REALLY want to do that, use --force " + "(it will erase RGPs and every feature computed from them)." + ) elif pangenome.status["predictedRGP"] == "inFile" and force: erase_pangenome(pangenome, rgp=True) -def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain: int = 1, min_length: int = 3000, - min_score: int = 4, dup_margin: float = 0.05, force: bool = False, disable_bar: bool = False): +def predict_rgp( + pangenome: Pangenome, + persistent_penalty: int = 3, + variable_gain: int = 1, + min_length: int = 3000, + min_score: int = 4, + dup_margin: float = 0.05, + force: bool = False, + disable_bar: bool = False, +): """ Main function to predict region of genomic plasticity @@ -282,16 +334,34 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain """ # check statuses and load info check_pangenome_former_rgp(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, - disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=False, + need_partitions=True, + disable_bar=disable_bar, + ) logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome.organisms) - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar): - for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, - min_score, naming=name_scheme): + for org in tqdm( + pangenome.organisms, + total=pangenome.number_of_organisms, + unit="genomes", + disable=disable_bar, + ): + for region in compute_org_rgp( + org, + multigenics, + persistent_penalty, + variable_gain, + min_length, + min_score, + naming=name_scheme, + ): pangenome.add_region(region) logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp} RGP") @@ -302,7 +372,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain pangenome.parameters["rgp"]["min_length"] = min_length pangenome.parameters["rgp"]["min_score"] = min_score pangenome.parameters["rgp"]["dup_margin"] = dup_margin - pangenome.status['predictedRGP'] = "Computed" + pangenome.status["predictedRGP"] = "Computed" def launch(args: argparse.Namespace): @@ -313,10 +383,19 @@ def launch(args: argparse.Namespace): """ pangenome = Pangenome() pangenome.add_file(args.pangenome) - predict_rgp(pangenome, persistent_penalty=args.persistent_penalty, variable_gain=args.variable_gain, - min_length=args.min_length, min_score=args.min_score, dup_margin=args.dup_margin, force=args.force, - disable_bar=args.disable_prog_bar) - write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) + predict_rgp( + pangenome, + persistent_penalty=args.persistent_penalty, + variable_gain=args.variable_gain, + min_length=args.min_length, + min_score=args.min_score, + dup_margin=args.dup_margin, + force=args.force, + disable_bar=args.disable_prog_bar, + ) + write_pangenome( + pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -338,31 +417,61 @@ def parser_rgp(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('--persistent_penalty', required=False, type=int, default=3, - help="Penalty score to apply to persistent genes") - optional.add_argument('--variable_gain', required=False, type=int, default=1, - help="Gain score to apply to variable genes") - optional.add_argument('--min_score', required=False, type=int, default=4, - help="Minimal score wanted for considering a region as being a RGP") - optional.add_argument('--min_length', required=False, type=int, default=3000, - help="Minimum length (bp) of a region to be considered a RGP") - optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="Minimum ratio of genomes where the family is present in which the family must " - "have multiple genes for it to be considered 'duplicated'") - - -if __name__ == '__main__': + optional.add_argument( + "--persistent_penalty", + required=False, + type=int, + default=3, + help="Penalty score to apply to persistent genes", + ) + optional.add_argument( + "--variable_gain", + required=False, + type=int, + default=1, + help="Gain score to apply to variable genes", + ) + optional.add_argument( + "--min_score", + required=False, + type=int, + default=4, + help="Minimal score wanted for considering a region as being a RGP", + ) + optional.add_argument( + "--min_length", + required=False, + type=int, + default=3000, + help="Minimum length (bp) of a region to be considered a RGP", + ) + optional.add_argument( + "--dup_margin", + required=False, + type=restricted_float, + default=0.05, + help="Minimum ratio of genomes where the family is present in which the family must " + "have multiple genes for it to be considered 'duplicated'", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_rgp(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index b40f67da..b3924570 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -33,7 +33,13 @@ class IdenticalRegions: :param is_contig_border: A boolean indicating if the identical regions span across contig borders. """ - def __init__(self, name: str, identical_rgps: Set[Region], families: Set[GeneFamily], is_contig_border: bool): + def __init__( + self, + name: str, + identical_rgps: Set[Region], + families: Set[GeneFamily], + is_contig_border: bool, + ): if not isinstance(identical_rgps, set): raise TypeError("Expected 'identical_rgps' to be a set") else: @@ -56,7 +62,7 @@ def __init__(self, name: str, identical_rgps: Set[Region], families: Set[GeneFam Region.id_counter += 1 - def __eq__(self, other: 'IdenticalRegions') -> bool: + def __eq__(self, other: "IdenticalRegions") -> bool: """ Check if two IdenticalRegions objects are equal based on their families, identical regions, and contig border status. @@ -66,15 +72,22 @@ def __eq__(self, other: 'IdenticalRegions') -> bool: """ if not isinstance(other, IdenticalRegions): # don't attempt to compare against unrelated types - raise TypeError("'IdenticalRegions' type object was expected, " - f"but '{type(other)}' type object was provided.") + raise TypeError( + "'IdenticalRegions' type object was expected, " + f"but '{type(other)}' type object was provided." + ) - return (self.families == other.families and self.rgps == other.rgps and - self.is_contig_border == other.is_contig_border) + return ( + self.families == other.families + and self.rgps == other.rgps + and self.is_contig_border == other.is_contig_border + ) def __repr__(self): - return (f"IdenticalRegions(name='{self.name}', num_rgps={len(self.rgps)}, num_families={len(self.families)}," - f" is_contig_border={self.is_contig_border})") + return ( + f"IdenticalRegions(name='{self.name}', num_rgps={len(self.rgps)}, num_families={len(self.families)}," + f" is_contig_border={self.is_contig_border})" + ) def __str__(self): return self.name @@ -101,6 +114,7 @@ def genes(self): """ for rgp in self.rgps: yield from rgp.genes + @property def spots(self) -> Set[Spot]: """ @@ -114,13 +128,16 @@ def modules(self) -> Set[Module]: """ Return iterable of genes from all RGPs that are identical in families """ - modules = set() + modules = set() for rgp in self.rgps: modules |= rgp.modules return modules -def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily], mode: Callable) -> float: + +def compute_grr( + rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily], mode: Callable +) -> float: """ Compute gene repertoire relatedness (GRR) between two rgp. Mode can be the function min to compute min GRR or max to compute max_grr @@ -132,7 +149,9 @@ def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily] :return: GRR value between 0 and 1 """ - grr = len(rgp_a_families & rgp_b_families) / mode(len(rgp_a_families), len(rgp_b_families)) + grr = len(rgp_a_families & rgp_b_families) / mode( + len(rgp_a_families), len(rgp_b_families) + ) return grr @@ -147,7 +166,9 @@ def compute_jaccard_index(rgp_a_families: set, rgp_b_families: set) -> float: :return : Jaccard index """ - jaccard_index = len(rgp_a_families & rgp_b_families) / len(rgp_a_families | rgp_b_families) + jaccard_index = len(rgp_a_families & rgp_b_families) / len( + rgp_a_families | rgp_b_families + ) return jaccard_index @@ -167,15 +188,17 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): region_attributes = {} for region in regions: - region_info = {"contig": region.contig.name, - 'genome': region.organism.name, - "name": region.name, - "genes_count": len(region), - "is_contig_border": region.is_contig_border, - "is_whole_contig": region.is_whole_contig, - "spot_id": get_spot_id(region, region_to_spot), - "modules": ';'.join({str(module) for module in region.modules}), - 'families_count': region.number_of_families} + region_info = { + "contig": region.contig.name, + "genome": region.organism.name, + "name": region.name, + "genes_count": len(region), + "is_contig_border": region.is_contig_border, + "is_whole_contig": region.is_whole_contig, + "spot_id": get_spot_id(region, region_to_spot), + "modules": ";".join({str(module) for module in region.modules}), + "families_count": region.number_of_families, + } region_attributes[region.ID] = region_info @@ -185,7 +208,7 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): return region_attributes -def join_dicts(dicts: List[Dict[str, Any]], delimiter: str = ';') -> Dict[str, Any]: +def join_dicts(dicts: List[Dict[str, Any]], delimiter: str = ";") -> Dict[str, Any]: """ Join dictionaries by concatenating the values with a custom delimiter for common keys. @@ -217,12 +240,18 @@ def format_rgp_metadata(rgp: Region) -> Dict[str, str]: for rgp_metadata in rgp.metadata: source = rgp_metadata.source for field in rgp_metadata.fields: - source_field_2_value[f"{source}_{field}"].append(str(rgp_metadata.get(field))) + source_field_2_value[f"{source}_{field}"].append( + str(rgp_metadata.get(field)) + ) - return {col_name: '|'.join(values) for col_name, values in source_field_2_value.items()} + return { + col_name: "|".join(values) for col_name, values in source_field_2_value.items() + } -def add_rgp_metadata_to_graph(graph: nx.Graph, rgps: List[Union[Region, IdenticalRegions]]) -> None: +def add_rgp_metadata_to_graph( + graph: nx.Graph, rgps: List[Union[Region, IdenticalRegions]] +) -> None: """ Add metadata from Region or IdenticalRegions objects to the graph. @@ -231,41 +260,62 @@ def add_rgp_metadata_to_graph(graph: nx.Graph, rgps: List[Union[Region, Identica """ for rgp in rgps: - element_to_metadata_sources = {"family":set(), "gene":set(), "module":set(), "spot":set()} - + element_to_metadata_sources = { + "family": set(), + "gene": set(), + "module": set(), + "spot": set(), + } for family in rgp.families: - element_to_metadata_sources["family"] |= {metadata.source for metadata in family.metadata} + element_to_metadata_sources["family"] |= { + metadata.source for metadata in family.metadata + } if family.module: - element_to_metadata_sources["module"] |= {metadata.source for metadata in family.module.metadata} + element_to_metadata_sources["module"] |= { + metadata.source for metadata in family.module.metadata + } for gene in rgp.genes: - element_to_metadata_sources["gene"] |= {metadata.source for metadata in gene.metadata} + element_to_metadata_sources["gene"] |= { + metadata.source for metadata in gene.metadata + } if isinstance(rgp, Region): rgp_metadata = rgp.formatted_metadata_dict() if rgp.spot is not None: - element_to_metadata_sources["spot"] = {metadata.source for metadata in rgp.spot.metadata} + element_to_metadata_sources["spot"] = { + metadata.source for metadata in rgp.spot.metadata + } elif isinstance(rgp, IdenticalRegions): - rgp_metadata_dicts = [ident_rgp.formatted_metadata_dict() for ident_rgp in rgp.rgps] + rgp_metadata_dicts = [ + ident_rgp.formatted_metadata_dict() for ident_rgp in rgp.rgps + ] rgp_metadata = join_dicts(rgp_metadata_dicts) - element_to_metadata_sources["spot"] |= {metadata.source for spot in rgp.spots for metadata in spot.metadata} + element_to_metadata_sources["spot"] |= { + metadata.source for spot in rgp.spots for metadata in spot.metadata + } else: - raise TypeError(f'Expect Region or IdenticalRegions object, not {type(rgp)}') + raise TypeError( + f"Expect Region or IdenticalRegions object, not {type(rgp)}" + ) for element, metadata_sources in element_to_metadata_sources.items(): for source in metadata_sources: - graph.nodes[rgp.ID][f'has_{element}_with_{source}'] = True + graph.nodes[rgp.ID][f"has_{element}_with_{source}"] = True for metadata_name, value in rgp_metadata.items(): graph.nodes[rgp.ID][metadata_name] = value -def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions], - rgp_to_spot: Dict[Region, int]): +def add_info_to_identical_rgps( + rgp_graph: nx.Graph, + identical_rgps_objects: List[IdenticalRegions], + rgp_to_spot: Dict[Region, int], +): """ Add identical rgps info in the graph as node attributes. @@ -274,27 +324,41 @@ def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List """ for identical_rgp_obj in identical_rgps_objects: - spots_of_identical_rgp_obj = {get_spot_id(i_rgp, rgp_to_spot) for i_rgp in identical_rgp_obj.rgps} - - rgp_graph.add_node(identical_rgp_obj.ID, - identical_rgp_group=True, - name=identical_rgp_obj.name, - families_count=len(identical_rgp_obj.families), - identical_rgp_count=len(identical_rgp_obj.rgps), - identical_rgp_names=';'.join(i_rgp.name for i_rgp in identical_rgp_obj.rgps), - identical_rgp_genomes=';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}), - identical_rgp_contig_border_count=len( - [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]), - identical_rgp_whole_contig_count=len( - [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]), - identical_rgp_spots=";".join(spots_of_identical_rgp_obj), - spot_id=spots_of_identical_rgp_obj.pop() if len( - spots_of_identical_rgp_obj) == 1 else "Multiple spots", - modules = ';'.join({str(module) for module in identical_rgp_obj.modules}), - ) - - -def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions]): + spots_of_identical_rgp_obj = { + get_spot_id(i_rgp, rgp_to_spot) for i_rgp in identical_rgp_obj.rgps + } + + rgp_graph.add_node( + identical_rgp_obj.ID, + identical_rgp_group=True, + name=identical_rgp_obj.name, + families_count=len(identical_rgp_obj.families), + identical_rgp_count=len(identical_rgp_obj.rgps), + identical_rgp_names=";".join( + i_rgp.name for i_rgp in identical_rgp_obj.rgps + ), + identical_rgp_genomes=";".join( + {i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps} + ), + identical_rgp_contig_border_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border] + ), + identical_rgp_whole_contig_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig] + ), + identical_rgp_spots=";".join(spots_of_identical_rgp_obj), + spot_id=( + spots_of_identical_rgp_obj.pop() + if len(spots_of_identical_rgp_obj) == 1 + else "Multiple spots" + ), + modules=";".join({str(module) for module in identical_rgp_obj.modules}), + ) + + +def add_edges_to_identical_rgps( + rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions] +): """ Replace identical rgp objects by all identical RGPs it contains. @@ -302,26 +366,35 @@ def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: Lis :param identical_rgps_objects: A dictionary mapping RGPs to sets of identical RGPs. """ - identical_edge_data = {'grr': 1.0, 'max_grr': 1.0, - 'min_grr': 1.0, - "identical_famillies": True} + identical_edge_data = { + "grr": 1.0, + "max_grr": 1.0, + "min_grr": 1.0, + "identical_famillies": True, + } added_identical_rgps = [] for identical_rgp_obj in identical_rgps_objects: - rgp_graph.add_nodes_from([ident_rgp.ID for ident_rgp in identical_rgp_obj.rgps], - identical_rgp_group=identical_rgp_obj.name) + rgp_graph.add_nodes_from( + [ident_rgp.ID for ident_rgp in identical_rgp_obj.rgps], + identical_rgp_group=identical_rgp_obj.name, + ) # add edge between identical rgp with metrics at one (perfect score) - edges_to_add = [(rgp_a.ID, rgp_b.ID, identical_edge_data) - for rgp_a, rgp_b in combinations(identical_rgp_obj.rgps, 2)] + edges_to_add = [ + (rgp_a.ID, rgp_b.ID, identical_edge_data) + for rgp_a, rgp_b in combinations(identical_rgp_obj.rgps, 2) + ] # replicate all edges that connect identical rgp object to other rgps for connected_rgp in rgp_graph.neighbors(identical_rgp_obj.ID): edge_data = rgp_graph[identical_rgp_obj.ID][connected_rgp] - edges_to_add += [(identical_rgp.ID, connected_rgp, edge_data) - for identical_rgp in identical_rgp_obj.rgps] + edges_to_add += [ + (identical_rgp.ID, connected_rgp, edge_data) + for identical_rgp in identical_rgp_obj.rgps + ] rgp_graph.add_edges_from(edges_to_add) @@ -333,8 +406,9 @@ def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: Lis return added_identical_rgps -def dereplicate_rgp(rgps: Set[Union[Region, IdenticalRegions]], - disable_bar: bool = False) -> List[Union[Region, IdenticalRegions]]: +def dereplicate_rgp( + rgps: Set[Union[Region, IdenticalRegions]], disable_bar: bool = False +) -> List[Union[Region, IdenticalRegions]]: """ Dereplicate RGPs that have the same families. @@ -347,7 +421,7 @@ def dereplicate_rgp(rgps: Set[Union[Region, IdenticalRegions]], :return: A list of dereplicated RGPs (Region or IdenticalRegions objects). For RGPs with the same families, they will be grouped together in IdenticalRegions objects. """ - logging.info(f'Dereplicating {len(rgps)} RGPs') + logging.info(f"Dereplicating {len(rgps)} RGPs") families_to_rgps = defaultdict(list) for rgp in tqdm(rgps, total=len(rgps), unit="RGP", disable=disable_bar): @@ -365,21 +439,22 @@ def dereplicate_rgp(rgps: Set[Union[Region, IdenticalRegions]], is_contig_border = all(rgp.is_contig_border for rgp in rgps) # create a new object that will represent the identical rgps - identical_rgp = IdenticalRegions(name=f"identical_rgps_{identical_region_count}", - identical_rgps=set(rgps), - families=families, - is_contig_border=is_contig_border) + identical_rgp = IdenticalRegions( + name=f"identical_rgps_{identical_region_count}", + identical_rgps=set(rgps), + families=families, + is_contig_border=is_contig_border, + ) identical_region_count += 1 dereplicated_rgps.append(identical_rgp) - logging.info(f'{len(dereplicated_rgps)} unique RGPs') + logging.info(f"{len(dereplicated_rgps)} unique RGPs") return dereplicated_rgps -def compute_rgp_metric(rgp_a: Region, - rgp_b: Region, - grr_cutoff: float, - grr_metric: str) -> Union[Tuple[int, int, dict], None]: +def compute_rgp_metric( + rgp_a: Region, rgp_b: Region, grr_cutoff: float, grr_metric: str +) -> Union[Tuple[int, int, dict], None]: """ Compute GRR metric between two RGPs. @@ -395,16 +470,22 @@ def compute_rgp_metric(rgp_a: Region, # RGP at a contig border are seen as incomplete and min GRR is used instead of max GRR if rgp_a.is_contig_border or rgp_b.is_contig_border: - edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) + edge_metrics["incomplete_aware_grr"] = compute_grr( + set(rgp_a.families), set(rgp_b.families), min + ) else: - edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) + edge_metrics["incomplete_aware_grr"] = compute_grr( + set(rgp_a.families), set(rgp_b.families), max + ) # Compute max and min GRR metrics - edge_metrics['max_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) - edge_metrics['min_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) + edge_metrics["max_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) + edge_metrics["min_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) # The number of shared families can be useful when visualizing the graph - edge_metrics['shared_family'] = len(set(rgp_a.families).intersection(set(rgp_b.families))) + edge_metrics["shared_family"] = len( + set(rgp_a.families).intersection(set(rgp_b.families)) + ) # Only return the metrics if the GRR value is above the cutoff if edge_metrics[grr_metric] >= grr_cutoff: @@ -413,22 +494,25 @@ def compute_rgp_metric(rgp_a: Region, def cluster_rgp_on_grr(graph: nx.Graph, clustering_attribute: str = "grr"): """ - Cluster rgp based on grr using louvain communities clustering. + Cluster rgp based on grr using louvain communities clustering. :param graph: NetworkX graph object representing the RGPs and their relationship :param clustering_attribute: Attribute of the graph to use for clustering (default is "grr") """ partitions = nx.algorithms.community.louvain_communities( - graph, weight=clustering_attribute) + graph, weight=clustering_attribute + ) # Add partition index in node attributes for i, cluster_nodes in enumerate(partitions): nx.set_node_attributes( - graph, {node: f"cluster_{i}" for node in cluster_nodes}, name=f"{clustering_attribute}_cluster") + graph, + {node: f"cluster_{i}" for node in cluster_nodes}, + name=f"{clustering_attribute}_cluster", + ) - logging.info( - f"Graph has {len(partitions)} clusters using {clustering_attribute}") + logging.info(f"Graph has {len(partitions)} clusters using {clustering_attribute}") def get_spot_id(rgp: Region, rgp_to_spot: Dict[Region, int]) -> str: @@ -448,10 +532,13 @@ def get_spot_id(rgp: Region, rgp_to_spot: Dict[Region, int]) -> str: return "No spot" -def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, - rgps_in_graph: List[Union[Region, IdenticalRegions]], - grr_metric: str, - rgp_to_spot: Dict[Region, int]) -> None: +def write_rgp_cluster_table( + outfile: str, + grr_graph: nx.Graph, + rgps_in_graph: List[Union[Region, IdenticalRegions]], + grr_metric: str, + rgp_to_spot: Dict[Region, int], +) -> None: """ Writes RGP cluster info to a TSV file using pandas. @@ -465,20 +552,35 @@ def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, all_rgps_infos = [] for rgp_in_graph in rgps_in_graph: - cluster = grr_graph.nodes[rgp_in_graph.ID][f'{grr_metric}_cluster'] + cluster = grr_graph.nodes[rgp_in_graph.ID][f"{grr_metric}_cluster"] - identical_rgps = [rgp_in_graph] if isinstance(rgp_in_graph, Region) else rgp_in_graph.rgps + identical_rgps = ( + [rgp_in_graph] if isinstance(rgp_in_graph, Region) else rgp_in_graph.rgps + ) - all_rgps_infos += [{"RGPs": r.name, "cluster": cluster, - "spot_id": get_spot_id(r, rgp_to_spot)} for r in identical_rgps] + all_rgps_infos += [ + {"RGPs": r.name, "cluster": cluster, "spot_id": get_spot_id(r, rgp_to_spot)} + for r in identical_rgps + ] df = pd.DataFrame(all_rgps_infos) - df.to_csv(outfile, sep='\t', index=False) - - -def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, - ignore_incomplete_rgp: bool, unmerge_identical_rgps: bool, grr_metric: str, - disable_bar: bool, graph_formats: Set[str],add_metadata: bool = False, metadata_sep: str = "|", metadata_sources: List[str] = None,): + df.to_csv(outfile, sep="\t", index=False) + + +def cluster_rgp( + pangenome, + grr_cutoff: float, + output: str, + basename: str, + ignore_incomplete_rgp: bool, + unmerge_identical_rgps: bool, + grr_metric: str, + disable_bar: bool, + graph_formats: Set[str], + add_metadata: bool = False, + metadata_sep: str = "|", + metadata_sources: List[str] = None, +): """ Main function to cluster regions of genomic plasticity based on their GRR @@ -505,43 +607,64 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, sources_to_use = set(pangenome.status["metasources"][element]) if metadata_sources is not None: - if len(set(pangenome.status["metasources"][element]) & set(metadata_sources)) == 0: - logging.info(f'Metadata for {element} found in pangenome, but none match the specified sources {metadata_sources}. ' - f'Current source for {element}: {sources_to_use}.') + if ( + len( + set(pangenome.status["metasources"][element]) + & set(metadata_sources) + ) + == 0 + ): + logging.info( + f"Metadata for {element} found in pangenome, but none match the specified sources {metadata_sources}. " + f"Current source for {element}: {sources_to_use}." + ) continue else: - sources_to_use = set(pangenome.status["metasources"][element]) & set(metadata_sources) + sources_to_use = set( + pangenome.status["metasources"][element] + ) & set(metadata_sources) need_metadata = True metatypes.add(element) - logging.info(f'Metadata for {element} found in pangenome with sources {sources_to_use}. They will be included in the RGP graph.') + logging.info( + f"Metadata for {element} found in pangenome with sources {sources_to_use}. They will be included in the RGP graph." + ) # check statuses and load info - check_pangenome_info(pangenome, need_families=True, need_annotations=True, - disable_bar=disable_bar, need_rgp=True, need_spots=True, need_modules=True, - need_metadata=need_metadata, - sources= metadata_sources, - metatypes=metatypes) + check_pangenome_info( + pangenome, + need_families=True, + need_annotations=True, + disable_bar=disable_bar, + need_rgp=True, + need_spots=True, + need_modules=True, + need_metadata=need_metadata, + sources=metadata_sources, + metatypes=metatypes, + ) if pangenome.regions == 0: raise Exception( - "The pangenome has no RGPs. The clustering of RGP is then not possible.") + "The pangenome has no RGPs. The clustering of RGP is then not possible." + ) # add all rgp as node if ignore_incomplete_rgp: - valid_rgps = [ - rgp for rgp in pangenome.regions if not rgp.is_contig_border] + valid_rgps = [rgp for rgp in pangenome.regions if not rgp.is_contig_border] ignored_rgp_count = pangenome.number_of_rgp - len(valid_rgps) total_rgp_count = pangenome.number_of_rgp logging.info( - f'Ignoring {ignored_rgp_count}/{total_rgp_count} ({100 * ignored_rgp_count / total_rgp_count:.2f}%) ' - 'RGPs that are located at a contig border and are likely incomplete.') + f"Ignoring {ignored_rgp_count}/{total_rgp_count} ({100 * ignored_rgp_count / total_rgp_count:.2f}%) " + "RGPs that are located at a contig border and are likely incomplete." + ) if len(valid_rgps) == 0: raise Exception( - "The pangenome has no complete RGPs. The clustering of RGP is then not possible.") + "The pangenome has no complete RGPs. The clustering of RGP is then not possible." + ) else: valid_rgps = set(pangenome.regions) @@ -563,8 +686,7 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, pairs_count = len(rgp_pairs) - logging.info( - f'Computing GRR metric for {pairs_count:,} pairs of RGP.') + logging.info(f"Computing GRR metric for {pairs_count:,} pairs of RGP.") pairs_of_rgps_metrics = [] @@ -577,26 +699,34 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, grr_graph.add_edges_from(pairs_of_rgps_metrics) - identical_rgps_objects = [rgp for rgp in dereplicated_rgps if isinstance(rgp, IdenticalRegions)] + identical_rgps_objects = [ + rgp for rgp in dereplicated_rgps if isinstance(rgp, IdenticalRegions) + ] rgp_objects_in_graph = [rgp for rgp in dereplicated_rgps if isinstance(rgp, Region)] if unmerge_identical_rgps: - rgp_objects_in_graph += add_edges_to_identical_rgps(grr_graph, identical_rgps_objects) + rgp_objects_in_graph += add_edges_to_identical_rgps( + grr_graph, identical_rgps_objects + ) # cluster rgp based on grr value logging.info( - f"Louvain_communities clustering of RGP based on {grr_metric} on {grr_graph}.") + f"Louvain_communities clustering of RGP based on {grr_metric} on {grr_graph}." + ) cluster_rgp_on_grr(grr_graph, grr_metric) - rgp_to_spot = {region: int(spot.ID) - for spot in pangenome.spots for region in spot.regions} + rgp_to_spot = { + region: int(spot.ID) for spot in pangenome.spots for region in spot.regions + } if not unmerge_identical_rgps: logging.info("Add info on identical RGPs merged in the graph") add_info_to_identical_rgps(grr_graph, identical_rgps_objects, rgp_to_spot) - rgps_in_graph = rgp_objects_in_graph if unmerge_identical_rgps else dereplicated_rgps + rgps_in_graph = ( + rgp_objects_in_graph if unmerge_identical_rgps else dereplicated_rgps + ) # add some attribute to the graph nodes. logging.info("Add RGP information to the graph") @@ -619,8 +749,7 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, outfile = os.path.join(output, f"{basename}.tsv") logging.info(f"Writing rgp clusters in tsv format in {outfile}") - write_rgp_cluster_table( - outfile, grr_graph, rgps_in_graph, grr_metric, rgp_to_spot) + write_rgp_cluster_table(outfile, grr_graph, rgps_in_graph, grr_metric, rgp_to_spot) def launch(args: argparse.Namespace): @@ -635,11 +764,20 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) - cluster_rgp(pangenome, grr_cutoff=args.grr_cutoff, output=args.output, - basename=args.basename, ignore_incomplete_rgp=args.ignore_incomplete_rgp, - unmerge_identical_rgps=args.no_identical_rgp_merging, - grr_metric=args.grr_metric, disable_bar=args.disable_prog_bar, graph_formats=args.graph_formats, - add_metadata=args.add_metadata, metadata_sep=args.metadata_sep, metadata_sources=args.metadata_sources) + cluster_rgp( + pangenome, + grr_cutoff=args.grr_cutoff, + output=args.output, + basename=args.basename, + ignore_incomplete_rgp=args.ignore_incomplete_rgp, + unmerge_identical_rgps=args.no_identical_rgp_merging, + grr_metric=args.grr_metric, + disable_bar=args.disable_prog_bar, + graph_formats=args.graph_formats, + add_metadata=args.add_metadata, + metadata_sep=args.metadata_sep, + metadata_sources=args.metadata_sources, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -651,7 +789,8 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : Parser arguments for cluster_rgp command """ parser = sub_parser.add_parser( - "rgp_cluster", formatter_class=argparse.RawTextHelpFormatter) + "rgp_cluster", formatter_class=argparse.RawTextHelpFormatter + ) parser_cluster_rgp(parser) return parser @@ -662,59 +801,103 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): :param parser: Parser for cluster_rgp argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, - type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=True, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('--grr_cutoff', required=False, type=restricted_float, default=0.8, - help="Min gene repertoire relatedness metric used in the rgp clustering") - optional.add_argument('--grr_metric', required=False, type=str, default="incomplete_aware_grr", - help="The grr (Gene Repertoire Relatedness) is used to assess the similarity between two " - "RGPs based on their gene families." - "There are three different modes for calculating the grr value: 'min_grr', 'max_grr' " - "or 'incomplete_aware_grr'." - "'min_grr': Computes the number of gene families shared between the two RGPs and " - "divides it by the smaller number of gene families among the two RGPs." - "'max_grr': Calculates the number of gene families shared between the two RGPs and " - "divides it by the larger number of gene families among the two RGPs." - "'incomplete_aware_grr' (default): If at least one RGP is considered incomplete, " - "which occurs when it is located at the border of a contig," - "the 'min_grr' mode is used. Otherwise, the 'max_grr' mode is applied.", - choices=['incomplete_aware_grr', "min_grr", "max_grr"]) - - optional.add_argument('--ignore_incomplete_rgp', required=False, action="store_true", - help="Do not cluster RGPs located on a contig border which are likely incomplete.") - - optional.add_argument('--no_identical_rgp_merging', required=False, action="store_true", - help="Do not merge in one node identical RGP " - "(i.e. having the same family content) before clustering.") - - optional.add_argument("--basename", required=False, - default="rgp_cluster", help="basename for the output file") - - optional.add_argument('-o', '--output', required=False, type=Path, - default="rgp_clustering", help="Output directory") - - optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", - default=['gexf', 'graphml'], help="Format of the output graph.") - - optional.add_argument("--add_metadata", - required=False, - action="store_true", - help="Include metadata information in the output files " - "if any have been added to pangenome elements (see ppanggolin metadata command).") - - optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") - - optional.add_argument("--metadata_sep", - required=False, - default='|', - help="The separator used to join multiple metadata values for elements with multiple metadata" - " values from the same source. This character should not appear in metadata values.") + optional.add_argument( + "--grr_cutoff", + required=False, + type=restricted_float, + default=0.8, + help="Min gene repertoire relatedness metric used in the rgp clustering", + ) + optional.add_argument( + "--grr_metric", + required=False, + type=str, + default="incomplete_aware_grr", + help="The grr (Gene Repertoire Relatedness) is used to assess the similarity between two " + "RGPs based on their gene families." + "There are three different modes for calculating the grr value: 'min_grr', 'max_grr' " + "or 'incomplete_aware_grr'." + "'min_grr': Computes the number of gene families shared between the two RGPs and " + "divides it by the smaller number of gene families among the two RGPs." + "'max_grr': Calculates the number of gene families shared between the two RGPs and " + "divides it by the larger number of gene families among the two RGPs." + "'incomplete_aware_grr' (default): If at least one RGP is considered incomplete, " + "which occurs when it is located at the border of a contig," + "the 'min_grr' mode is used. Otherwise, the 'max_grr' mode is applied.", + choices=["incomplete_aware_grr", "min_grr", "max_grr"], + ) + + optional.add_argument( + "--ignore_incomplete_rgp", + required=False, + action="store_true", + help="Do not cluster RGPs located on a contig border which are likely incomplete.", + ) + + optional.add_argument( + "--no_identical_rgp_merging", + required=False, + action="store_true", + help="Do not merge in one node identical RGP " + "(i.e. having the same family content) before clustering.", + ) + + optional.add_argument( + "--basename", + required=False, + default="rgp_cluster", + help="basename for the output file", + ) + + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default="rgp_clustering", + help="Output directory", + ) + + optional.add_argument( + "--graph_formats", + required=False, + type=str, + choices=["gexf", "graphml"], + nargs="+", + default=["gexf", "graphml"], + help="Format of the output graph.", + ) + + optional.add_argument( + "--add_metadata", + required=False, + action="store_true", + help="Include metadata information in the output files " + "if any have been added to pangenome elements (see ppanggolin metadata command).", + ) + + optional.add_argument( + "--metadata_sources", + default=None, + nargs="+", + help="Which source of metadata should be written. " + "By default all metadata sources are included.", + ) + + optional.add_argument( + "--metadata_sep", + required=False, + default="|", + help="The separator used to join multiple metadata values for elements with multiple metadata" + " values from the same source. This character should not appear in metadata values.", + ) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 5de9423f..df76e480 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -19,8 +19,13 @@ from ppanggolin.utils import mk_outdir -def comp_border(border1: list, border2: list, overlapping_match: int = 2, - set_size: int = 3, exact_match: int = 1) -> bool: +def comp_border( + border1: list, + border2: list, + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, +) -> bool: """ Compare two border @@ -36,16 +41,21 @@ def comp_border(border1: list, border2: list, overlapping_match: int = 2, return True elif len(border1) == set_size and len(border2) == set_size: for ikb in range(1, set_size - overlapping_match + 1): - if border1[0:len(border2[ikb:])] == border2[ikb:]: + if border1[0 : len(border2[ikb:])] == border2[ikb:]: return True for ib in range(1, set_size - overlapping_match + 1): - if border1[ib:] == border2[0:len(border1[ib:])]: + if border1[ib:] == border2[0 : len(border1[ib:])]: return True return False -def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2, - set_size: int = 3, exact_match: int = 1) -> bool: +def check_sim( + pair_border1: list, + pair_border2: list, + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, +) -> bool: """ Checks if the two pairs of exact_match first gene families are identical, or eventually if they overlap in an ordered way at least 'overlapping_match' @@ -80,8 +90,15 @@ def add_new_node_in_spot_graph(g: nx.Graph, region: Region, borders: list) -> st :param borders: bordering families in spot :return blocks: name of the node that has been added """ - blocks = str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]], - key=lambda x: x[0])) + blocks = str( + sorted( + [ + [gene.family.ID for gene in borders[0]], + [gene.family.ID for gene in borders[1]], + ], + key=lambda x: x[0], + ) + ) g.add_node(blocks) try: g.nodes[blocks]["nb_rgp"] += 1 @@ -95,8 +112,13 @@ def add_new_node_in_spot_graph(g: nx.Graph, region: Region, borders: list) -> st return blocks -def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, - set_size: int = 3, exact_match: int = 1) -> nx.Graph: +def make_spot_graph( + rgps: list, + multigenics: set, + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, +) -> nx.Graph: """ Create a spot graph from pangenome RGP @@ -119,17 +141,28 @@ def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, else: used += 1 add_new_node_in_spot_graph(graph_spot, rgp, border) - logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have " - f"less than {set_size} persistent gene families until the contig border)") - logging.getLogger("PPanGGOLiN").info(f"{used} RGPs are being used to predict spots of insertion") + logging.getLogger("PPanGGOLiN").info( + f"{lost} RGPs were not used as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border)" + ) + logging.getLogger("PPanGGOLiN").info( + f"{used} RGPs are being used to predict spots of insertion" + ) node_list = list(graph_spot.nodes) - logging.getLogger("PPanGGOLiN").info(f"{len(node_list)} number of different pairs of flanking gene families") + logging.getLogger("PPanGGOLiN").info( + f"{len(node_list)} number of different pairs of flanking gene families" + ) for i, nodei in enumerate(node_list[:-1]): - for nodej in node_list[i + 1:]: + for nodej in node_list[i + 1 :]: node_obj_i = graph_spot.nodes[nodei] node_obj_j = graph_spot.nodes[nodej] - if check_sim([node_obj_i["border0"], node_obj_i["border1"]], [node_obj_j["border0"], node_obj_j["border1"]], - overlapping_match, set_size, exact_match): + if check_sim( + [node_obj_i["border0"], node_obj_i["border1"]], + [node_obj_j["border0"], node_obj_j["border1"]], + overlapping_match, + set_size, + exact_match, + ): graph_spot.add_edge(nodei, nodej) return graph_spot @@ -137,20 +170,28 @@ def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, def write_spot_graph(graph_spot, outdir, graph_formats, file_basename="spotGraph"): for node in graph_spot.nodes: - graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]]) - graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]]) - - graph_spot.nodes[node]["genomes"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]}) - graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]]) + graph_spot.nodes[node]["border0"] = ";".join( + [fam.name for fam in graph_spot.nodes[node]["border0"]] + ) + graph_spot.nodes[node]["border1"] = ";".join( + [fam.name for fam in graph_spot.nodes[node]["border1"]] + ) + + graph_spot.nodes[node]["genomes"] = ";".join( + {rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]} + ) + graph_spot.nodes[node]["rgp"] = ";".join( + [rgp.name for rgp in graph_spot.nodes[node]["rgp"]] + ) if "gexf" in graph_formats: outfile = outdir / f"{file_basename}.gexf" - logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}') + logging.getLogger("PPanGGOLiN").info(f"Writing spot graph in {outfile}") nx.readwrite.gexf.write_gexf(graph_spot, outfile) if "graphml" in graph_formats: outfile = outdir / f"{file_basename}.graphml" - logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}') + logging.getLogger("PPanGGOLiN").info(f"Writing spot graph in {outfile}") nx.readwrite.graphml.write_graphml(graph_spot, outfile) @@ -162,15 +203,25 @@ def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): :param force: Allow to force write on Pangenome file """ if pangenome.status["spots"] == "inFile" and not force: - raise Exception("You are trying to detect spots on a pangenome which already has predicted spots. " - "If you REALLY want to do that, use --force (it will erase spots previously predicted).") + raise Exception( + "You are trying to detect spots on a pangenome which already has predicted spots. " + "If you REALLY want to do that, use --force (it will erase spots previously predicted)." + ) elif pangenome.status["spots"] == "inFile" and force: erase_pangenome(pangenome, spots=True) -def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False, graph_formats: List[str] = ['gexf'], - overlapping_match: int = 2, - set_size: int = 3, exact_match: int = 1, force: bool = False, disable_bar: bool = False): +def predict_hotspots( + pangenome: Pangenome, + output: Path, + spot_graph: bool = False, + graph_formats: List[str] = ["gexf"], + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, + force: bool = False, + disable_bar: bool = False, +): """ Main function to predict hotspot @@ -186,16 +237,26 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals """ # check that given parameters for hotspot computation make sense if overlapping_match >= set_size: - raise Exception(f'--overlapping_match_hotspot ({overlapping_match}) cannot be bigger than (or equal to) ' - f'--set_size_hotspot ({set_size})') + raise Exception( + f"--overlapping_match_hotspot ({overlapping_match}) cannot be bigger than (or equal to) " + f"--set_size_hotspot ({set_size})" + ) if exact_match > set_size: - raise Exception(f'--exact_match_size_hotspot ({exact_match}) cannot be bigger than ' - f'--set_size_hotspot ({set_size})') + raise Exception( + f"--exact_match_size_hotspot ({exact_match}) cannot be bigger than " + f"--set_size_hotspot ({set_size})" + ) # check for formerly computed stuff, and erase if allowed check_pangenome_former_spots(pangenome, force) # check statuses and load info - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=True, - need_rgp=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_partitions=True, + need_rgp=True, + disable_bar=disable_bar, + ) # get multigenic gene families logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") @@ -204,11 +265,14 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals logging.getLogger("PPanGGOLiN").info("Detecting hotspots in the pangenome...") # make spots - graph_spot = make_spot_graph(pangenome.regions, multigenics, overlapping_match, set_size, - exact_match) + graph_spot = make_spot_graph( + pangenome.regions, multigenics, overlapping_match, set_size, exact_match + ) spots = [] - for spot_id, comp in enumerate(nx.algorithms.components.connected_components(graph_spot)): + for spot_id, comp in enumerate( + nx.algorithms.components.connected_components(graph_spot) + ): curr_spot = Spot(spot_id) spots.append(curr_spot) @@ -244,11 +308,20 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.spot_graph: mk_outdir(args.output, args.force) - predict_hotspots(pangenome, args.output, force=args.force, - spot_graph=args.spot_graph, graph_formats=args.graph_formats, - overlapping_match=args.overlapping_match, set_size=args.set_size, - exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar, ) - write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) + predict_hotspots( + pangenome, + args.output, + force=args.force, + spot_graph=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=args.overlapping_match, + set_size=args.set_size, + exact_match=args.exact_match_size, + disable_bar=args.disable_prog_bar, + ) + write_pangenome( + pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -259,7 +332,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("spot", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "spot", formatter_class=argparse.RawTextHelpFormatter + ) parser_spot(parser) return parser @@ -270,39 +345,76 @@ def parser_spot(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path( - f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" - f"_PID{str(os.getpid())}"), - help="Output directory") - optional.add_argument("--spot_graph", required=False, action="store_true", - help="Writes a graph of pairs of blocks of single copy markers flanking RGPs," - " supposedly belonging to the same hotspot") - optional.add_argument("--overlapping_match", required=False, type=int, default=2, - help="The number of 'missing' persistent genes allowed when comparing flanking genes during " - "hotspot computations") - optional.add_argument("--set_size", required=False, type=int, default=3, - help="Number of single copy markers to use as flanking genes for a RGP during " - "hotspot computation") - optional.add_argument("--exact_match_size", required=False, type=int, default=1, - help="Number of perfectly matching flanking single copy markers required to associate RGPs " - "during hotspot computation (Ex: If set to 1, two RGPs are in the same hotspot " - "if both their 1st flanking genes are the same)") - optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", - default=['gexf'], help="Format of the output graph.") - - -if __name__ == '__main__': + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}" + ), + help="Output directory", + ) + optional.add_argument( + "--spot_graph", + required=False, + action="store_true", + help="Writes a graph of pairs of blocks of single copy markers flanking RGPs," + " supposedly belonging to the same hotspot", + ) + optional.add_argument( + "--overlapping_match", + required=False, + type=int, + default=2, + help="The number of 'missing' persistent genes allowed when comparing flanking genes during " + "hotspot computations", + ) + optional.add_argument( + "--set_size", + required=False, + type=int, + default=3, + help="Number of single copy markers to use as flanking genes for a RGP during " + "hotspot computation", + ) + optional.add_argument( + "--exact_match_size", + required=False, + type=int, + default=1, + help="Number of perfectly matching flanking single copy markers required to associate RGPs " + "during hotspot computation (Ex: If set to 1, two RGPs are in the same hotspot " + "if both their 1st flanking genes are the same)", + ) + optional.add_argument( + "--graph_formats", + required=False, + type=str, + choices=["gexf", "graphml"], + nargs="+", + default=["gexf"], + help="Format of the output graph.", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_spot(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py index badec1d4..9b163bdf 100755 --- a/ppanggolin/__init__.py +++ b/ppanggolin/__init__.py @@ -39,7 +39,7 @@ "context": ppanggolin.context.subparser, "projection": ppanggolin.projection.subparser, "rgp_cluster": ppanggolin.RGP.rgp_cluster.subparser, - "metadata": ppanggolin.meta.subparser + "metadata": ppanggolin.meta.subparser, } diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 3671ab6a..69efe798 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -16,7 +16,12 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import mk_outdir, read_compressed_or_not, create_tmpdir, run_subprocess +from ppanggolin.utils import ( + mk_outdir, + read_compressed_or_not, + create_tmpdir, + run_subprocess, +) from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph @@ -24,12 +29,22 @@ from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], - tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow", - is_target_slf: bool = False, translation_table: int = None) -> Path: +def align_seq_to_pang( + target_seq_file: Union[Path, Iterable[Path]], + query_seq_files: Union[Path, Iterable[Path]], + tmpdir: Path, + cpu: int = 1, + no_defrag: bool = False, + identity: float = 0.8, + coverage: float = 0.8, + query_type: str = "unknow", + is_query_slf: bool = False, + target_type: str = "unknow", + is_target_slf: bool = False, + translation_table: int = None, +) -> Path: """ - Align fasta sequence to pangenome sequences. + Align fasta sequence to pangenome sequences. :param target_seq_file: File with sequences of pangenome (target) :param query_seq_files: Iterable of files with sequences from input file (query) @@ -48,26 +63,46 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi """ if target_type == "nucleotide": - logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " - f"translation table {translation_table}") - with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir: + logging.getLogger("PPanGGOLiN").debug( + "Target sequences will be translated by mmseqs with " + f"translation table {translation_table}" + ) + with create_tmpdir( + tmpdir, basename="target_db", keep_tmp=True + ) as target_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same - target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table) + target_db = translate_genes( + target_seq_file, target_db_dir, cpu, is_target_slf, translation_table + ) else: db_type = 1 if target_type == "protein" else 0 - target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, - 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type) + target_db = create_mmseqs_db( + [target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, + "target_db", + tmpdir, + db_mode=1 if is_target_slf else 0, + db_type=db_type, + ) if query_type == "nucleotide": - logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " - f"with translation table {translation_table}") + logging.getLogger("PPanGGOLiN").debug( + "Query sequences will be translated by mmseqs " + f"with translation table {translation_table}" + ) with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same - query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table) + query_db = translate_genes( + query_seq_files, query_db_dir, cpu, is_query_slf, translation_table + ) else: db_type = 1 if query_type == "protein" else 0 - query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, - 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type) + query_db = create_mmseqs_db( + [query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, + "query_db", + tmpdir, + db_mode=1 if is_query_slf else 0, + db_type=db_type, + ) cov_mode = "2" # coverage of query if no_defrag: @@ -76,33 +111,80 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi # mmseqs search command # see https://github.com/soedinglab/MMseqs2/issues/373 Using a combination of param to no miss short proteins - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", - delete=False) as aln_db: + with tempfile.NamedTemporaryFile( + mode="w", + dir=tmpdir.as_posix(), + prefix="aln_result_db_file", + suffix=".aln.DB", + delete=False, + ) as aln_db: logging.getLogger("PPanGGOLiN").info("Aligning sequences") - cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", - "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), - "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] + cmd = [ + "mmseqs", + "search", + query_db.as_posix(), + target_db.as_posix(), + aln_db.name, + tmpdir.as_posix(), + "-a", + "--min-seq-id", + str(identity), + "-c", + str(coverage), + "--cov-mode", + cov_mode, + "--threads", + str(cpu), + "--seed-sub-mat", + "VTML40.out", + "-s", + "2", + "--comp-bias-corr", + "0", + "--mask", + "0", + "-e", + "1", + ] start = time.time() run_subprocess(cmd, msg="MMSeqs search failed with the following error:\n") align_time = time.time() - start - logging.getLogger("PPanGGOLiN").info(f"Done aligning sequences in {round(align_time, 2)} seconds") - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", - delete=False) as outfile: + logging.getLogger("PPanGGOLiN").info( + f"Done aligning sequences in {round(align_time, 2)} seconds" + ) + + with tempfile.NamedTemporaryFile( + mode="w", + dir=tmpdir, + prefix="aln_result_db_file", + suffix=".tsv", + delete=False, + ) as outfile: logging.getLogger("PPanGGOLiN").info("Extracting alignments...") - cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, - "--format-mode", "2"] - - run_subprocess(cmd, msg="MMSeqs convertalis failed with the following error:\n") + cmd = [ + "mmseqs", + "convertalis", + query_db.as_posix(), + target_db.as_posix(), + aln_db.name, + outfile.name, + "--format-mode", + "2", + ] + + run_subprocess( + cmd, msg="MMSeqs convertalis failed with the following error:\n" + ) return Path(outfile.name) -def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, - pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: +def map_input_gene_to_family_all_aln( + aln_res: Path, outdir: Path, pangenome: Pangenome +) -> Tuple[Dict[str, GeneFamily], Path]: """ - Read alignment result to link input sequences to pangenome gene family. + Read alignment result to link input sequences to pangenome gene family. Alignment have been made against all genes of the pangenome. :param aln_res: Alignment result file @@ -113,14 +195,18 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, """ seq2pang = {} - aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file - logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') + aln_file_clean = ( + outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" + ) # write the actual result file + logging.getLogger("PPanGGOLiN").debug(f"Writing alignment file in {aln_file_clean}") with open(aln_res) as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id + line_splitted[1] = line_splitted[1].replace( + "ppanggolin_", "" + ) # remove the 'ppanggolin_' bit of the id line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") input_seq_id, gene_id = line_splitted[0:2] @@ -129,13 +215,16 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, if seq2pang.get(input_seq_id) is None: # if no results were found yet family = pangenome.get_gene(gene_id).family - seq2pang[input_seq_id] = family # then the best hit is the first one we see. + seq2pang[input_seq_id] = ( + family # then the best hit is the first one we see. + ) return seq2pang, aln_file_clean -def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, - pangenome: Pangenome) -> Tuple[Dict[Any, GeneFamily], Path]: +def map_input_gene_to_family_rep_aln( + aln_res: Path, outdir: Path, pangenome: Pangenome +) -> Tuple[Dict[Any, GeneFamily], Path]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. @@ -147,15 +236,19 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} - aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + aln_file_clean = ( + outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" + ) # write the actual result file - logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger("PPanGGOLiN").debug(f"Writing alignment file in {aln_file_clean}") with open(aln_res) as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id + line_splitted[1] = line_splitted[1].replace( + "ppanggolin_", "" + ) # remove the 'ppanggolin_' bit of the id line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") aln_outfl.write("\t".join(line_splitted) + "\n") @@ -163,7 +256,9 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, input_seq_id, gene_family_id = line_splitted[0:2] if seq2pang.get(input_seq_id) is None: # if no results were found yet - family = pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see. + family = pangenome.get_gene_family( + gene_family_id + ) # then the best hit is the first one we see. seq2pang[input_seq_id] = family return seq2pang, aln_file_clean @@ -177,7 +272,7 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: :return: A tuple containing a set of sequence IDs and a boolean indicating if the sequences are nucleotide sequences. """ - dna_expected_char = {'A', 'T', 'G', 'C', 'N'} + dna_expected_char = {"A", "T", "G", "C", "N"} seq_set = set() seq_count = 0 first_seq_concat = "" @@ -187,7 +282,9 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) seq_count += 1 - if count_fasta_line > 1: # Allow to know if we can use soft link with createdb from MMSeqs2 + if ( + count_fasta_line > 1 + ): # Allow to know if we can use soft link with createdb from MMSeqs2 single_line_fasta = False count_fasta_line = 0 else: @@ -201,7 +298,9 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: return seq_set, is_nucleotide, single_line_fasta -def write_gene_fam_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): +def write_gene_fam_sequences( + pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False +): """ Export the sequence of gene families @@ -211,13 +310,19 @@ def write_gene_fam_sequences(pangenome: Pangenome, output: Path, add: str = "", :param disable_bar: disable progress bar """ with open(output, "w") as file_obj: - for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, - total=pangenome.number_of_gene_families): + for fam in tqdm( + pangenome.gene_families, + unit="families", + disable=disable_bar, + total=pangenome.number_of_gene_families, + ): file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") -def write_all_gene_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): +def write_all_gene_sequences( + pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False +): """ Export the sequence of pangenome genes @@ -228,13 +333,17 @@ def write_all_gene_sequences(pangenome: Pangenome, output: Path, add: str = "", """ if pangenome.status["geneSequences"] == "inFile": - get_non_redundant_gene_sequences_from_file(pangenome.file, output, add=add, disable_bar=disable_bar) + get_non_redundant_gene_sequences_from_file( + pangenome.file, output, add=add, disable_bar=disable_bar + ) else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") -def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: +def project_and_write_partition( + seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path +) -> Path: """ Project the partition of each sequence from the input file and write them in a file @@ -250,11 +359,15 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq for input_seq, gene_fam in seqid_to_gene_family.items(): partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n") for remaining_seq in seq_set - seqid_to_gene_family.keys(): - partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. + partProjFile.write( + remaining_seq + "\tcloud\n" + ) # if there is no hit, it's going to be cloud genes. return partition_proj -def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: +def write_gene_to_gene_family( + seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path +) -> Path: """ Write input gene to pangenome gene family. @@ -292,14 +405,20 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: for rgp in pangenome.regions: for fam in rgp.families: fam2rgp[fam].append(rgp.name) - for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], - multigenics) for gene in border]: + for fam in [ + gene.family + for border in rgp.get_bordering_genes( + pangenome.parameters["spot"]["set_size"], multigenics + ) + for gene in border + ]: fam2rgp[fam].append(rgp.name) return fam2rgp -def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ - -> Tuple[Dict[str, List[Spot]], Dict[str, List[Spot]]]: +def get_fam_to_spot( + pangenome: Pangenome, multigenics: Set[GeneFamily] +) -> Tuple[Dict[str, List[Spot]], Dict[str, List[Spot]]]: """ Reads a pangenome object to link families and spots and indicate where each family is. @@ -316,9 +435,15 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ fams_border = set() for rgp in spot.regions: fams |= set(rgp.families) - fams_border |= set([gene.family for border in # Set of families in border of spot - rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], multigenics) - for gene in border]) + fams_border |= set( + [ + gene.family + for border in rgp.get_bordering_genes( # Set of families in border of spot + pangenome.parameters["spot"]["set_size"], multigenics + ) + for gene in border + ] + ) for fam in fams: fam2spot[fam].append(spot) for fam in fams_border: @@ -326,7 +451,9 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ return fam2spot, fam2border -def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3): +def draw_spot_gexf( + spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3 +): """ Draw a gexf graph of the spot @@ -338,10 +465,22 @@ def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, """ for spot in spots: fname = output / f"spot_{str(spot.ID)}.gexf" - subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam_to_mod=fam_to_mod) - - -def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_related: bool = False, disable_bar=False): + subgraph( + spot, + fname, + set_size=set_size, + multigenics=multigenics, + fam_to_mod=fam_to_mod, + ) + + +def get_seq_info( + seq_to_pang: dict, + pangenome: Pangenome, + output: Path, + draw_related: bool = False, + disable_bar=False, +): """ Get sequences information after alignment @@ -352,18 +491,33 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel :param disable_bar: disable progress bar :return: """ - logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome") + logging.getLogger("PPanGGOLiN").info( + "Writing RGP and spot information related to hits in the pangenome" + ) multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) with open(output / "info_input_seq.tsv", "w") as finfo: - finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") + finfo.write( + "input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n" + ) fam2rgp = get_fam_to_rgp(pangenome, multigenics) fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) spot_list = set() for seq, panfam in seq_to_pang.items(): - finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( - map(str, fam2spot[panfam])) + "\t" + ",".join( - map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") + finfo.write( + seq + + "\t" + + panfam.name + + "\t" + + panfam.named_partition + + "\t" + + ",".join(map(str, fam2spot[panfam])) + + "\t" + + ",".join(map(str, fam2border[panfam])) + + "\t" + + ",".join(fam2rgp[panfam]) + + "\n" + ) spot_list |= set(fam2spot[panfam]) spot_list |= set(fam2border[panfam]) @@ -372,11 +526,19 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel for spot in spot_list: if len(spot.get_uniq_ordered_set()) > 1: drawn_spots.add(spot) - logging.getLogger("PPanGGOLiN").info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " - f"related to hits of the input sequences...") - draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spot"]["overlapping_match"], - pangenome.parameters["spot"]["exact_match_size"], pangenome.parameters["spot"]["set_size"], - disable_bar=disable_bar) + logging.getLogger("PPanGGOLiN").info( + f"Drawing the {len(drawn_spots)} spots with more than 1 organization " + f"related to hits of the input sequences..." + ) + draw_selected_spots( + drawn_spots, + pangenome, + output, + pangenome.parameters["spot"]["overlapping_match"], + pangenome.parameters["spot"]["exact_match_size"], + pangenome.parameters["spot"]["set_size"], + disable_bar=disable_bar, + ) fam2mod = {} # fam2module if pangenome.status["modules"] != "No": @@ -386,15 +548,26 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) - logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " - f"{output / 'info_input_seq.tsv'}") - - -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, - no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11, disable_bar: bool = False - ) -> Tuple[Path, Dict[str, GeneFamily]]: + logging.getLogger("PPanGGOLiN").info( + f"File listing RGP and spots where sequences of interest are located : " + f"{output / 'info_input_seq.tsv'}" + ) + + +def get_input_seq_to_family_with_rep( + pangenome: Pangenome, + sequence_files: Union[Path, Iterable[Path]], + output: Path, + tmpdir: Path, + input_type: str = "unknow", + is_input_slf: bool = False, + cpu: int = 1, + no_defrag: bool = False, + identity: float = 0.8, + coverage: float = 0.8, + translation_table: int = 11, + disable_bar: bool = False, +) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -413,30 +586,55 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union :param coverage: Minimum coverage threshold for the alignment (default: 0.8). :param translation_table: Translation table to use if sequences need to be translated (default: 11). :param disable_bar: If True, disable the progress bar. - - :return: A tuple containing the path to the alignment result file, + + :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. """ pangenome_sequences = tmpdir / "proteins_families.faa" - logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}') - write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, - target_type="protein", translation_table=translation_table) - - seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) + logging.getLogger("PPanGGOLiN").debug( + f"Write gene family sequences in {pangenome_sequences.absolute()}" + ) + write_gene_fam_sequences( + pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar + ) + + align_file = align_seq_to_pang( + target_seq_file=pangenome_sequences, + query_seq_files=sequence_files, + tmpdir=tmpdir, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + query_type=input_type, + is_query_slf=is_input_slf, + is_target_slf=True, + target_type="protein", + translation_table=translation_table, + ) + + seq2pang, align_file = map_input_gene_to_family_rep_aln( + align_file, output, pangenome + ) return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, - no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11, disable_bar: bool = False - ) -> Tuple[Path, Dict[str, GeneFamily]]: +def get_input_seq_to_family_with_all( + pangenome: Pangenome, + sequence_files: Union[Path, Iterable[Path]], + output: Path, + tmpdir: Path, + input_type: str = "unknow", + is_input_slf: bool = False, + cpu: int = 1, + no_defrag: bool = False, + identity: float = 0.8, + coverage: float = 0.8, + translation_table: int = 11, + disable_bar: bool = False, +) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -456,27 +654,55 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio :param translation_table: Translation table to use if sequences need to be translated (default: 11). :param disable_bar: If True, disable the progress bar. - :return: A tuple containing the path to the alignment result file, + :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. """ pangenome_sequences = tmpdir / "nucleotide_genes.fna" - logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {pangenome_sequences.absolute()}') - write_all_gene_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, - target_type="nucleotide", translation_table=translation_table) - - seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) + logging.getLogger("PPanGGOLiN").debug( + f"Write all pangenome gene sequences in {pangenome_sequences.absolute()}" + ) + write_all_gene_sequences( + pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar + ) + + align_file = align_seq_to_pang( + target_seq_file=pangenome_sequences, + query_seq_files=sequence_files, + tmpdir=tmpdir, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + query_type=input_type, + is_query_slf=is_input_slf, + is_target_slf=True, + target_type="nucleotide", + translation_table=translation_table, + ) + + seq2pang, align_file = map_input_gene_to_family_all_aln( + align_file, output, pangenome + ) return align_file, seq2pang -def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, - coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, - tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): +def align( + pangenome: Pangenome, + sequence_file: Path, + output: Path, + identity: float = 0.8, + coverage: float = 0.8, + no_defrag: bool = False, + cpu: int = 1, + getinfo: bool = False, + use_representatives: bool = False, + draw_related: bool = False, + translation_table: int = 11, + tmpdir: Path = None, + disable_bar: bool = False, + keep_tmp=False, +): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. @@ -498,8 +724,10 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: - raise Exception("Cannot use this function as your pangenome does not have gene families representatives " - "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.") + raise Exception( + "Cannot use this function as your pangenome does not have gene families representatives " + "associated to it. For now this works only if the clustering is realised by PPanGGOLiN." + ) # could be possible either by picking a representative somehow, or by aligning on genes rather than on # families, if they are in the pangenome. @@ -508,42 +736,72 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo if pangenome.status["modules"] != "No": # modules are not required to be loaded, but if they have been computed we load them. need_mod = True - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=True, need_rgp=True, - need_spots=True, need_modules=need_mod, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_partitions=True, + need_rgp=True, + need_spots=True, + need_modules=need_mod, + disable_bar=disable_bar, + ) else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj) - with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + with create_tmpdir( + main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp + ) as new_tmpdir: input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - input_type=input_type, - is_input_slf=single_line_fasta, cpu=cpu, - no_defrag=no_defrag, identity=identity, - coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) + align_file, seq2pang = get_input_seq_to_family_with_rep( + pangenome, + sequence_file, + output, + new_tmpdir, + input_type=input_type, + is_input_slf=single_line_fasta, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) else: - align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, - output=output, tmpdir=new_tmpdir, - input_type=input_type, - is_input_slf=single_line_fasta, - cpu=cpu, no_defrag=no_defrag, identity=identity, - coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) + align_file, seq2pang = get_input_seq_to_family_with_all( + pangenome=pangenome, + sequence_files=sequence_file, + output=output, + tmpdir=new_tmpdir, + input_type=input_type, + is_input_slf=single_line_fasta, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) - part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only - logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'") + part_proj = project_and_write_partition( + seq2pang, seq_set, output + ) # write the partition assignation only + logging.getLogger("PPanGGOLiN").info( + f"sequences partition projection : '{part_proj}'" + ) logging.getLogger("PPanGGOLiN").info( - f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'") + f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome." + ) + logging.getLogger("PPanGGOLiN").info( + f"Blast-tab file of the alignment : '{align_file}'" + ) def launch(args: argparse.Namespace): @@ -555,12 +813,22 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, - tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage, - no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, - use_representatives=args.fast, draw_related=args.draw_related, - translation_table=args.translation_table, - disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) + align( + pangenome=pangenome, + sequence_file=args.sequences, + output=args.output, + tmpdir=args.tmpdir, + identity=args.identity, + coverage=args.coverage, + no_defrag=args.no_defrag, + cpu=args.cpu, + getinfo=args.getinfo, + use_representatives=args.fast, + draw_related=args.draw_related, + translation_table=args.translation_table, + disable_bar=args.disable_prog_bar, + keep_tmp=args.keep_tmp, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -571,7 +839,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("align", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "align", formatter_class=argparse.RawTextHelpFormatter + ) parser_align(parser) return parser @@ -582,52 +852,118 @@ def parser_align(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="All of the following arguments are required :") - required.add_argument('-S', '--sequences', required=False, type=Path, - help="sequences (nucleotides or amino acids) to align on the pangenome gene families") - - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") + required = parser.add_argument_group( + title="Required arguments", + description="All of the following arguments are required :", + ) + required.add_argument( + "-S", + "--sequences", + required=False, + type=Path, + help="sequences (nucleotides or amino acids) to align on the pangenome gene families", + ) + + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('--no_defrag', required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with" - "their non-fragmented gene family. (default: False)") - optional.add_argument('--identity', required=False, type=float, default=0.5, - help="min identity percentage threshold") - optional.add_argument('--coverage', required=False, type=float, default=0.8, - help="min coverage percentage threshold") - optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is faster but may be less sensitive. By default, all pangenome genes are used.") - optional.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - optional.add_argument("--getinfo", required=False, action="store_true", - help="Use this option to extract info related to the best hit of each query, " - "such as the RGP it is in, or the spots.") - optional.add_argument("--draw_related", required=False, action="store_true", - help="Draw figures and provide graphs in a gexf format of the eventual spots" - " associated to the input sequences") + optional.add_argument( + "--no_defrag", + required=False, + action="store_true", + help="DO NOT Realign gene families to link fragments with" + "their non-fragmented gene family. (default: False)", + ) + optional.add_argument( + "--identity", + required=False, + type=float, + default=0.5, + help="min identity percentage threshold", + ) + optional.add_argument( + "--coverage", + required=False, + type=float, + default=0.8, + help="min coverage percentage threshold", + ) + optional.add_argument( + "--fast", + required=False, + action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. By default, all pangenome genes are used.", + ) + optional.add_argument( + "--translation_table", + required=False, + default="11", + help="Translation table (genetic code) to use.", + ) + optional.add_argument( + "--getinfo", + required=False, + action="store_true", + help="Use this option to extract info related to the best hit of each query, " + "such as the RGP it is in, or the spots.", + ) + optional.add_argument( + "--draw_related", + required=False, + action="store_true", + help="Draw figures and provide graphs in a gexf format of the eventual spots" + " associated to the input sequences", + ) # but does not use the option - optional.add_argument("--use_pseudo", required=False, action="store_true", - help="In the context of provided annotation, use this option to read pseudogenes. " - "(Default behavior is to ignore them)") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") - - -if __name__ == '__main__': + optional.add_argument( + "--use_pseudo", + required=False, + action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. " + "(Default behavior is to ignore them)", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=Path, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + optional.add_argument( + "--keep_tmp", + required=False, + default=False, + action="store_true", + help="Keeping temporary files (useful for debugging).", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_align(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 05607d11..c4359e34 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -20,16 +20,28 @@ from tables.path import check_name_validity, NaturalNameWarning # local libraries -from ppanggolin.annotate.synta import (annotate_organism, read_fasta, get_dna_sequence, - init_contig_counter, contig_counter) +from ppanggolin.annotate.synta import ( + annotate_organism, + get_contigs_from_fasta_file, + get_dna_sequence, + init_contig_counter, + contig_counter, +) from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii +from ppanggolin.utils import ( + read_compressed_or_not, + mk_file_name, + detect_filetype, + check_input_files, + has_non_ascii, + replace_non_ascii, +) from ppanggolin.formats import write_pangenome from ppanggolin.metadata import Metadata -# ignore NaturalNameWarning -warnings.filterwarnings('ignore', category=NaturalNameWarning) +# ignore NaturalNameWarning +warnings.filterwarnings("ignore", category=NaturalNameWarning) ctg_counter = contig_counter @@ -42,9 +54,11 @@ def check_annotate_args(args: argparse.Namespace): :raise Exception: """ if args.fasta is None and args.anno is None: - raise argparse.ArgumentError(argument=None, - message="You must provide at least a file with the --fasta option to annotate " - "from sequences, or a file with the --anno option to load annotations from.") + raise argparse.ArgumentError( + argument=None, + message="You must provide at least a file with the --fasta option to annotate " + "from sequences, or a file with the --anno option to load annotations from.", + ) if hasattr(args, "fasta") and args.fasta is not None: check_input_files(args.fasta, True) @@ -53,11 +67,22 @@ def check_annotate_args(args: argparse.Namespace): check_input_files(args.anno, True) - - -def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str], - coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None, - gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene: +def create_gene( + org: Organism, + contig: Contig, + gene_counter: int, + rna_counter: int, + gene_id: str, + dbxrefs: Set[str], + coordinates: List[Tuple[int, int]], + strand: str, + gene_type: str, + position: int = None, + gene_name: str = "", + product: str = "", + genetic_code: int = 11, + protein_id: str = "", +) -> Gene: """ Create a Gene object and associate to contig and Organism @@ -78,26 +103,27 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i """ # check for non ascii character in product field if has_non_ascii(product): - + logging.getLogger("PPanGGOLiN").warning( - f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " - "These characters cannot be stored in the HDF5 file and will be replaced by underscores." - ) + f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " + "These characters cannot be stored in the HDF5 file and will be replaced by underscores." + ) product = replace_non_ascii(product) - start, stop = coordinates[0][0], coordinates[-1][1] - if any(dbxref.startswith('MaGe:') or dbxref.startswith('SEED:') for dbxref in dbxrefs): + if any( + dbxref.startswith("MaGe:") or dbxref.startswith("SEED:") for dbxref in dbxrefs + ): if gene_name == "": gene_name = gene_id for dbxref in dbxrefs: - if dbxref.startswith('MaGe:'): - gene_id = dbxref.split(':')[1] + if dbxref.startswith("MaGe:"): + gene_id = dbxref.split(":")[1] break - if dbxref.startswith('SEED:'): - gene_id = dbxref.split(':')[1] + if dbxref.startswith("SEED:"): + gene_id = dbxref.split(":")[1] break if gene_type == "CDS": @@ -108,16 +134,30 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i # but was when cases like this were encountered) new_gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) - new_gene.fill_annotations(start=start, stop=stop, strand=strand, coordinates=coordinates, - gene_type=gene_type, name=gene_name, - position=position, product=product, local_identifier=gene_id, - genetic_code=genetic_code) + new_gene.fill_annotations( + start=start, + stop=stop, + strand=strand, + coordinates=coordinates, + gene_type=gene_type, + name=gene_name, + position=position, + product=product, + local_identifier=gene_id, + genetic_code=genetic_code, + ) contig.add(new_gene) else: # if not CDS, it is RNA new_gene = RNA(org.name + f"_{gene_type}_" + str(rna_counter).zfill(4)) - new_gene.fill_annotations(start=start, stop=stop, strand=strand, coordinates=coordinates, gene_type=gene_type, - name=gene_name, - product=product) + new_gene.fill_annotations( + start=start, + stop=stop, + strand=strand, + coordinates=coordinates, + gene_type=gene_type, + name=gene_name, + product=product, + ) contig.add_rna(new_gene) new_gene.fill_parents(org, contig) return new_gene @@ -126,8 +166,8 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i def extract_positions(string: str) -> Tuple[List[Tuple[int, int]], bool, bool, bool]: """ Extracts start and stop positions from a string and determines whether it is complement and pseudogene. - - Example of strings that the function is able to process: + + Example of strings that the function is able to process: "join(190..7695,7695..12071)", "complement(join(4359800..4360707,4360707..4360962))", @@ -137,7 +177,7 @@ def extract_positions(string: str) -> Tuple[List[Tuple[int, int]], bool, bool, b "6811501..6812109", "complement(6792573..>6795461)", "join(1038313,1..1016)" - + :param string: The input string containing position information. @@ -154,40 +194,46 @@ def extract_positions(string: str) -> Tuple[List[Tuple[int, int]], bool, bool, b has_partial_end = False # Check if 'complement' exists in the string - if 'complement' in string: + if "complement" in string: complement = True if "(" in string: # Extract positions found inside the parenthesis - inner_parentheses_regex = r'\(([^()]+)\)' + inner_parentheses_regex = r"\(([^()]+)\)" inner_matches = re.findall(inner_parentheses_regex, string) try: positions = inner_matches[-1] except IndexError: - raise ValueError(f'Gene position {string} is not formatted as expected.') + raise ValueError(f"Gene position {string} is not formatted as expected.") else: positions = string.rstrip() # Check if '>' or '<' exists in the positions to identify partial genes - if '>' in positions or '<' in positions: - if '<' in positions.split(',')[0]: + if ">" in positions or "<" in positions: + if "<" in positions.split(",")[0]: has_partial_start = True - if ">" in positions.split(',')[-1]: + if ">" in positions.split(",")[-1]: has_partial_end = True - inner_positions = ','.join(positions.split(',')[1:-1]) + inner_positions = ",".join(positions.split(",")[1:-1]) - if '>' in inner_positions or '<' in inner_positions or (not has_partial_end and not has_partial_start): - raise ValueError(f"Error parsing positions '{positions}' extracted from GBFF string '{string}'. " - f"Chevrons are found in the inner position. This case is unexpected and not handle.") + if ( + ">" in inner_positions + or "<" in inner_positions + or (not has_partial_end and not has_partial_start) + ): + raise ValueError( + f"Error parsing positions '{positions}' extracted from GBFF string '{string}'. " + f"Chevrons are found in the inner position. This case is unexpected and not handle." + ) - for position in positions.split(','): + for position in positions.split(","): try: - start, stop = position.replace(">", "").replace("<", "").split('..') + start, stop = position.replace(">", "").replace("<", "").split("..") except ValueError: # in some case there is only one position meaning that the gene is long of only one nt in this piece. # for instance : join(1038313,1..1016) @@ -196,17 +242,23 @@ def extract_positions(string: str) -> Tuple[List[Tuple[int, int]], bool, bool, b try: start, stop = int(start), int(stop) except ValueError: - raise ValueError(f"Error parsing position '{position}' extracted from GBFF string '{string}'. " - f"Start position ({start}) and/or stop position ({stop}) are not valid integers.") + raise ValueError( + f"Error parsing position '{position}' extracted from GBFF string '{string}'. " + f"Start position ({start}) and/or stop position ({stop}) are not valid integers." + ) coordinates.append((start, stop)) return coordinates, complement, has_partial_start, has_partial_end -def parse_gbff_by_contig(gbff_file_path: Path - ) -> Generator[Tuple[Dict[str, str], Generator[Dict[str, Union[str, Set[str]]], None, None], str], - None, None]: +def parse_gbff_by_contig( + gbff_file_path: Path, +) -> Generator[ + Tuple[Dict[str, str], Generator[Dict[str, Union[str, Set[str]]], None, None], str], + None, + None, +]: """ Parse a GBFF file by contig and yield tuples containing header, feature, and sequence info for each contig. @@ -225,19 +277,19 @@ def parse_gbff_by_contig(gbff_file_path: Path if not line.strip(): continue - if line.startswith('LOCUS') or line.startswith('CONTIG'): + if line.startswith("LOCUS") or line.startswith("CONTIG"): # CONTIG line are found between FEATURES and ORIGIN and are put in header section here for simplicity current_section = "header" - elif line.startswith('FEATURES'): + elif line.startswith("FEATURES"): current_section = "feature" continue - elif line.startswith('ORIGIN'): + elif line.startswith("ORIGIN"): current_section = "sequence" continue - if line.strip() == '//': + if line.strip() == "//": # Check that each section has some lines assert header_lines and feature_lines and sequence_lines, ( "Missing section in GBFF file. " @@ -246,8 +298,11 @@ def parse_gbff_by_contig(gbff_file_path: Path f"{len(header_lines)} feature lines, " f"and {len(sequence_lines)} sequence lines." ) - yield (parse_contig_header_lines(header_lines), parse_feature_lines(feature_lines), - parse_dna_seq_lines(sequence_lines)) + yield ( + parse_contig_header_lines(header_lines), + parse_feature_lines(feature_lines), + parse_dna_seq_lines(sequence_lines), + ) header_lines = [] feature_lines = [] @@ -265,12 +320,17 @@ def parse_gbff_by_contig(gbff_file_path: Path sequence_lines.append(line) else: - raise ValueError(f'Unexpected structure in GBFF file: {gbff_file_path}. {line}') + raise ValueError( + f"Unexpected structure in GBFF file: {gbff_file_path}. {line}" + ) # In case the last // is missing, return the last contig if header_lines or feature_lines or sequence_lines: - yield (parse_contig_header_lines(header_lines), parse_feature_lines(feature_lines), - parse_dna_seq_lines(sequence_lines)) + yield ( + parse_contig_header_lines(header_lines), + parse_feature_lines(feature_lines), + parse_dna_seq_lines(sequence_lines), + ) def parse_contig_header_lines(header_lines: List[str]) -> Dict[str, str]: @@ -281,7 +341,9 @@ def parse_contig_header_lines(header_lines: List[str]) -> Dict[str, str]: :return: A dict with keys representing different fields and values representing their corresponding values joined by new line. """ field = "" # Initialize field - field_to_value = defaultdict(list) # Initialize defaultdict to store field-value pairs + field_to_value = defaultdict( + list + ) # Initialize defaultdict to store field-value pairs for line in header_lines: field_of_line = line[:12].strip() # Extract field from the first 12 characters @@ -292,10 +354,12 @@ def parse_contig_header_lines(header_lines: List[str]) -> Dict[str, str]: # Append value to the current field in the defaultdict field_to_value[field].append(line[12:].strip()) - return {field: '\n'.join(value) for field, value in field_to_value.items()} + return {field: "\n".join(value) for field, value in field_to_value.items()} -def parse_feature_lines(feature_lines: List[str]) -> Generator[Dict[str, Union[str, Set[str]]], None, None]: +def parse_feature_lines( + feature_lines: List[str], +) -> Generator[Dict[str, Union[str, Set[str]]], None, None]: """ Parse feature lines from a GBFF file and yield dictionaries representing each feature. @@ -303,10 +367,12 @@ def parse_feature_lines(feature_lines: List[str]) -> Generator[Dict[str, Union[s :return: A generator that yields dictionaries, each representing a feature with its type, location, and qualifiers. """ - def stringify_feature_values(feature: Dict[str, List[str]]) -> Dict[str, Union[str, Set[str]]]: + def stringify_feature_values( + feature: Dict[str, List[str]] + ) -> Dict[str, Union[str, Set[str]]]: """ - All value of the returned dict are str except for db_xref that is a list. - When multiple values exist for the same tag only the first one is kept. + All value of the returned dict are str except for db_xref that is a list. + When multiple values exist for the same tag only the first one is kept. """ stringify_feature = {} for tag, val in feature.items(): @@ -334,11 +400,11 @@ def stringify_feature_values(feature: Dict[str, List[str]]) -> Dict[str, Union[s } current_qualifier = "location" - elif line.strip().startswith('/'): + elif line.strip().startswith("/"): qualifier_line = line.strip()[1:] # [1:] used to remove / if "=" in qualifier_line: - current_qualifier, value = qualifier_line.split('=', 1) + current_qualifier, value = qualifier_line.split("=", 1) else: current_qualifier, value = qualifier_line, qualifier_line # clean value from quote @@ -369,14 +435,15 @@ def parse_dna_seq_lines(sequence_lines: List[str]) -> str: :param sequence_lines: List of strings representing sequence lines from a GBFF file. :return: a string in upper case of the DNA sequences that have been cleaned """ - sequence = '' + sequence = "" for line in sequence_lines: sequence += line[10:].replace(" ", "").strip().upper() return sequence -def combine_contigs_metadata(contig_to_metadata: Dict[Contig, Dict[str, str]] - ) -> Tuple[Dict[str, str], Dict[Contig, Dict[str, str]]]: +def combine_contigs_metadata( + contig_to_metadata: Dict[Contig, Dict[str, str]] +) -> Tuple[Dict[str, str], Dict[Contig, Dict[str, str]]]: """ Combine contig metadata to identify shared and unique metadata tags and values. @@ -386,8 +453,12 @@ def combine_contigs_metadata(contig_to_metadata: Dict[Contig, Dict[str, str]] - A dictionary mapping each contig to its unique metadata tags and values. """ # Flatten all metadata items and count their occurrences - all_tag_to_value = [(tag, value) for source_info in contig_to_metadata.values() for (tag, value) in - source_info.items() if isinstance(value, str)] + all_tag_to_value = [ + (tag, value) + for source_info in contig_to_metadata.values() + for (tag, value) in source_info.items() + if isinstance(value, str) + ] # Filter tags that would have a / as it is forbidden when writing the table in HDF5. Such tag can appear with db_xref formatting invalid_tag_names = [] @@ -397,20 +468,29 @@ def combine_contigs_metadata(contig_to_metadata: Dict[Contig, Dict[str, str]] warnings.simplefilter("ignore") check_name_validity(tag) except ValueError as err: - logging.getLogger("PPanGGOLiN").debug(f"{err}. The tag {tag} is ignored for metadata.") + logging.getLogger("PPanGGOLiN").debug( + f"{err}. The tag {tag} is ignored for metadata." + ) invalid_tag_names.append(tag) if value == "": - logging.getLogger("PPanGGOLiN").debug(f"Ignoring tag '{tag}' for metadata due to an empty value.") + logging.getLogger("PPanGGOLiN").debug( + f"Ignoring tag '{tag}' for metadata due to an empty value." + ) invalid_tag_names.append(tag) - all_tag_to_value = [(tag, value) for tag, value in all_tag_to_value if tag not in invalid_tag_names] + all_tag_to_value = [ + (tag, value) for tag, value in all_tag_to_value if tag not in invalid_tag_names + ] contig_count = len(contig_to_metadata) # Identify tags and values shared by all contigs - shared_tag_and_values = {tag_and_value for tag_and_value in all_tag_to_value if - all_tag_to_value.count(tag_and_value) == contig_count} + shared_tag_and_values = { + tag_and_value + for tag_and_value in all_tag_to_value + if all_tag_to_value.count(tag_and_value) == contig_count + } # Create a dictionary for shared metadata genome_metadata = dict(shared_tag_and_values) @@ -418,15 +498,21 @@ def combine_contigs_metadata(contig_to_metadata: Dict[Contig, Dict[str, str]] contig_to_uniq_metadata = {} for contig, tag_to_value in contig_to_metadata.items(): # Identify unique metadata for each contig - uniq_tag_to_value = {tag: value for tag, value in tag_to_value.items() if - tag not in list(genome_metadata) + invalid_tag_names and isinstance(value, str)} + uniq_tag_to_value = { + tag: value + for tag, value in tag_to_value.items() + if tag not in list(genome_metadata) + invalid_tag_names + and isinstance(value, str) + } if uniq_tag_to_value: contig_to_uniq_metadata[contig] = uniq_tag_to_value return genome_metadata, contig_to_uniq_metadata -def reverse_complement_coordinates(coordinates: List[Tuple[int, int]]) -> List[Tuple[int, int]]: +def reverse_complement_coordinates( + coordinates: List[Tuple[int, int]] +) -> List[Tuple[int, int]]: """ Reverses and inverts the given list of coordinates. Each coordinate pair (start, end) is transformed into (-end, -start) and the order of the coordinates is reversed. @@ -438,7 +524,9 @@ def reverse_complement_coordinates(coordinates: List[Tuple[int, int]]) -> List[T return [(-end, -start) for start, end in coordinates[::-1]] -def shift_end_coordinates(coordinates: List[Tuple[int, int]], shift: int) -> List[Tuple[int, int]]: +def shift_end_coordinates( + coordinates: List[Tuple[int, int]], shift: int +) -> List[Tuple[int, int]]: """ Shifts the end of a set of coordinates by a specified amount and then returns the final shifted coordinates. This involves reversing the coordinates twice, shifting the start, and then returning the original orientation. @@ -455,7 +543,9 @@ def shift_end_coordinates(coordinates: List[Tuple[int, int]], shift: int) -> Lis return final_coordinates -def shift_start_coordinates(coordinates: List[Tuple[int, int]], shift: int) -> List[Tuple[int, int]]: +def shift_start_coordinates( + coordinates: List[Tuple[int, int]], shift: int +) -> List[Tuple[int, int]]: """ Shifts the start of the first coordinate in the list by the specified amount. If the shift results in a negative or zero-length interval for the first coordinate, this interval is removed, and the shift is @@ -479,11 +569,14 @@ def shift_start_coordinates(coordinates: List[Tuple[int, int]], shift: int) -> L if adjusted_part_length <= 0: # If the shift results in a zero or negative length, handle accordingly if len(new_coordinates) <= 1: - raise ValueError(f'Shifting the start resulted in a gene with null or negative size. ' - f'Coordinates: {coordinates}, Shift: {shift}') + raise ValueError( + f"Shifting the start resulted in a gene with null or negative size. " + f"Coordinates: {coordinates}, Shift: {shift}" + ) else: logging.getLogger("PPanGGOLiN").warning( - f'Coordinate part {new_coordinates[0]} resulted in a non-positive length after shift. Removing it.') + f"Coordinate part {new_coordinates[0]} resulted in a non-positive length after shift. Removing it." + ) new_coordinates = new_coordinates[1:] # If length is negative, propagate the shift to the next coordinate @@ -495,14 +588,14 @@ def shift_start_coordinates(coordinates: List[Tuple[int, int]], shift: int) -> L def fix_partial_gene_coordinates( - coordinates: List[Tuple[int, int]], - is_complement: bool, - start_shift: int, - ensure_codon_multiple: bool = True + coordinates: List[Tuple[int, int]], + is_complement: bool, + start_shift: int, + ensure_codon_multiple: bool = True, ) -> List[Tuple[int, int]]: """ Adjusts gene coordinates if they have partial starts or ends, ensuring the gene length is a multiple of 3. - + If the gene is on the complement strand, the adjustments will be reversed (i.e., applied to the opposite ends). :param coordinates: List of coordinate tuples (start, stop) for the gene. @@ -514,7 +607,9 @@ def fix_partial_gene_coordinates( """ if not coordinates: - raise ValueError('No coordinates provided. Cannot fix partial gene coordinates.') + raise ValueError( + "No coordinates provided. Cannot fix partial gene coordinates." + ) # Non-complement strand adjustments if not is_complement: @@ -524,7 +619,7 @@ def fix_partial_gene_coordinates( if ensure_codon_multiple: # Ensure the gene length is a multiple of 3 by adjusting the last end gene_length = sum([(stop - start + 1) for start, stop in coordinates]) - end_shift = (gene_length % 3) + end_shift = gene_length % 3 if end_shift != 0: coordinates = shift_end_coordinates(coordinates, end_shift) @@ -536,7 +631,7 @@ def fix_partial_gene_coordinates( if ensure_codon_multiple: # Adjust first start for complement strand gene_length = sum([(stop - start + 1) for start, stop in coordinates]) - start_shift = (gene_length % 3) + start_shift = gene_length % 3 if start_shift != 0: coordinates = shift_start_coordinates(coordinates, start_shift) @@ -545,13 +640,19 @@ def fix_partial_gene_coordinates( if gene_length % 3 != 0: logging.getLogger("PPanGGOLiN").warning( - f'Gene with coordinates: {coordinates} has a length that is not a multiple of 3 after adjusting for partiality with new cordinates ({coordinates}): {gene_length}') + f"Gene with coordinates: {coordinates} has a length that is not a multiple of 3 after adjusting for partiality with new cordinates ({coordinates}): {gene_length}" + ) return coordinates -def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: List[str], - use_pseudogenes: bool = False, translation_table: int = 11) -> Tuple[Organism, bool]: +def read_org_gbff( + organism_name: str, + gbff_file_path: Path, + circular_contigs: List[str], + use_pseudogenes: bool = False, + translation_table: int = 11, +) -> Tuple[Organism, bool]: """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file @@ -569,73 +670,89 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li organism = Organism(organism_name) - logging.getLogger("PPanGGOLiN").debug(f"Extracting genes information from the given gbff {gbff_file_path.name}") + logging.getLogger("PPanGGOLiN").debug( + f"Extracting genes information from the given gbff {gbff_file_path.name}" + ) gene_counter = 0 rna_counter = 0 contig_to_metadata = {} for header, features, sequence in parse_gbff_by_contig(gbff_file_path): if "LOCUS" not in header: - raise ValueError('Missing LOCUS line in GBFF header.') + raise ValueError("Missing LOCUS line in GBFF header.") - if "VERSION" in header and header['VERSION'] != "": - contig_id = header['VERSION'] + if "VERSION" in header and header["VERSION"] != "": + contig_id = header["VERSION"] else: # If contig_id is not specified in VERSION field like with Prokka, in that case we use the one in LOCUS - contig_id = header['LOCUS'].split()[0] + contig_id = header["LOCUS"].split()[0] - contig_len = int(header['LOCUS'].split()[1]) + contig_len = int(header["LOCUS"].split()[1]) if contig_len != len(sequence): - logging.getLogger("PPanGGOLiN").warning("Unable to determine if the contig is circular or linear in file " - f"'{gbff_file_path}' from the LOCUS header information: {header['LOCUS']}. " - "By default, the contig will be considered linear.") + logging.getLogger("PPanGGOLiN").warning( + "Unable to determine if the contig is circular or linear in file " + f"'{gbff_file_path}' from the LOCUS header information: {header['LOCUS']}. " + "By default, the contig will be considered linear." + ) - if "CIRCULAR" in header['LOCUS'].upper(): + if "CIRCULAR" in header["LOCUS"].upper(): # this line contains linear/circular word telling if the dna sequence is circularized or not is_circ = True - elif "LINEAR" in header['LOCUS'].upper(): + elif "LINEAR" in header["LOCUS"].upper(): is_circ = False else: is_circ = False logging.getLogger("PPanGGOLiN").warning( f"It's impossible to identify if contig {contig_id} is circular or linear." - f"in file {gbff_file_path}.") + f"in file {gbff_file_path}." + ) try: contig = organism.get(contig_id) except KeyError: with contig_counter.get_lock(): - contig = Contig(contig_counter.value, contig_id, - True if contig_id in circular_contigs or is_circ else False) + contig = Contig( + contig_counter.value, + contig_id, + True if contig_id in circular_contigs or is_circ else False, + ) contig_counter.value += 1 organism.add(contig) contig.length = contig_len for feature in features: - if feature['feature_type'] == "source": - contig_to_metadata[contig] = {tag: value for tag, value in feature.items() if - tag not in ['feature_type', "location"] and isinstance(value, str)} + if feature["feature_type"] == "source": + contig_to_metadata[contig] = { + tag: value + for tag, value in feature.items() + if tag not in ["feature_type", "location"] + and isinstance(value, str) + } if "db_xref" in feature: try: - db_xref_for_metadata = {f"db_xref_{database}": identifier for database_identifier in - feature["db_xref"] for database, identifier in - [database_identifier.split(':')]} + db_xref_for_metadata = { + f"db_xref_{database}": identifier + for database_identifier in feature["db_xref"] + for database, identifier in [database_identifier.split(":")] + } contig_to_metadata[contig].update(db_xref_for_metadata) except ValueError: logging.getLogger("PPanGGOLiN").warning( f"db_xref values does not have the expected format. Expect 'db_xref=:' " f"but got {feature['db_xref']} in file {gbff_file_path}. " - "db_xref tags is therefore not retrieved in contig/genomes metadata.") + "db_xref tags is therefore not retrieved in contig/genomes metadata." + ) else: contig_to_metadata[contig].update(db_xref_for_metadata) - genetic_code = '' - if feature['feature_type'] not in ['CDS', 'rRNA', 'tRNA']: + genetic_code = "" + if feature["feature_type"] not in ["CDS", "rRNA", "tRNA"]: continue - coordinates, is_complement, has_partial_start, has_partial_end = extract_positions( - ''.join(feature['location'])) + coordinates, is_complement, has_partial_start, has_partial_end = ( + extract_positions("".join(feature["location"])) + ) if "pseudo" in feature and not use_pseudogenes: continue @@ -649,19 +766,25 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li "will likely contain an internal stop codon when translated with PPanGGOLiN." ) - if feature['feature_type'] == 'CDS': - if feature['transl_table'] == "": + if feature["feature_type"] == "CDS": + if feature["transl_table"] == "": used_transl_table_arg += 1 genetic_code = translation_table else: genetic_code = int(feature["transl_table"]) if has_partial_start or has_partial_end: - start_shift = 0 if 'codon_start' not in feature else int( - feature['codon_start']) - 1 # -1 is to be in zero based index. - - coordinates = fix_partial_gene_coordinates(coordinates, is_complement=is_complement, - start_shift=start_shift) + start_shift = ( + 0 + if "codon_start" not in feature + else int(feature["codon_start"]) - 1 + ) # -1 is to be in zero based index. + + coordinates = fix_partial_gene_coordinates( + coordinates, + is_complement=is_complement, + start_shift=start_shift, + ) strand = "-" if is_complement else "+" @@ -670,16 +793,16 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li contig=contig, gene_counter=gene_counter, rna_counter=rna_counter, - gene_id=feature['locus_tag'], + gene_id=feature["locus_tag"], dbxrefs=feature["db_xref"], coordinates=coordinates, strand=strand, gene_type=feature["feature_type"], position=contig.number_of_genes, gene_name=feature["gene"], - product=feature['product'], + product=feature["product"], genetic_code=genetic_code, - protein_id=feature["protein_id"] + protein_id=feature["protein_id"], ) gene.add_sequence(get_dna_sequence(sequence, gene)) @@ -689,14 +812,18 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li else: rna_counter += 1 - genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_metadata) - organism.add_metadata(metadata=Metadata(source='annotation_file', **genome_metadata)) + genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata( + contig_to_metadata + ) + organism.add_metadata( + metadata=Metadata(source="annotation_file", **genome_metadata) + ) for contig, metadata_dict in contig_to_uniq_metadata.items(): - contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) + contig.add_metadata(Metadata(source="annotation_file", **metadata_dict)) if used_transl_table_arg: - logging.getLogger("PPanGGOLiN").info( + logging.getLogger("PPanGGOLiN").debug( f"transl_table tag was not found for {used_transl_table_arg} CDS " f"in {gbff_file_path}. Provided translation_table argument value was used instead: {translation_table}." ) @@ -704,7 +831,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li return organism, True -def parse_db_xref_metadata(db_xref_values: List[str], annot_file_path: Path = "") -> Dict[str, str]: +def parse_db_xref_metadata( + db_xref_values: List[str], annot_file_path: Path = "" +) -> Dict[str, str]: """ Parses a list of db_xref values and returns a dictionary with formatted keys and identifiers. @@ -718,7 +847,7 @@ def parse_db_xref_metadata(db_xref_values: List[str], annot_file_path: Path = "" db_xref_for_metadata = { f"db_xref_{database}": identifier for database_identifier in db_xref_values - for database, identifier in [database_identifier.split(':')] + for database, identifier in [database_identifier.split(":")] } except ValueError: logging.getLogger("PPanGGOLiN").warning( @@ -729,8 +858,13 @@ def parse_db_xref_metadata(db_xref_values: List[str], annot_file_path: Path = "" return db_xref_for_metadata -def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str], - pseudo: bool = False, translation_table: int = 11) -> Tuple[Organism, bool]: +def read_org_gff( + organism: str, + gff_file_path: Path, + circular_contigs: List[str], + pseudo: bool = False, + translation_table: int = 11, +) -> Tuple[Organism, bool]: """ Read annotation from GFF file @@ -747,7 +881,17 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str] used_transl_table_arg = 0 global ctg_counter - (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, frame, gff_attribute) = range(0, 9) + ( + gff_seqname, + _, + gff_type, + gff_start, + gff_end, + _, + gff_strand, + frame, + gff_attribute, + ) = range(0, 9) # Missing values: source, score. They are unused. def get_gff_attributes(gff_fields: list) -> dict: @@ -757,11 +901,13 @@ def get_gff_attributes(gff_fields: list) -> dict: :return: Attributes get """ - attributes_field = [f for f in gff_fields[gff_attribute].strip().split(';') if len(f) > 0] + attributes_field = [ + f for f in gff_fields[gff_attribute].strip().split(";") if len(f) > 0 + ] attributes_get = {} for att in attributes_field: try: - (key, value) = att.strip().split('=') + (key, value) = att.strip().split("=") attributes_get[key.upper()] = value except ValueError: pass # we assume that it is a strange, but useless field for our analysis @@ -778,11 +924,15 @@ def get_id_attribute(attributes_dict: dict) -> str: """ element_id = attributes_dict.get("ID") if not element_id: - raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. " - f"Not the case for file: {gff_file_path}") + raise Exception( + f"Each CDS type of the gff files must own a unique ID attribute. " + f"Not the case for file: {gff_file_path}" + ) return element_id - def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, bool]: + def check_chevrons_in_start_and_stop( + start: str, stop: str + ) -> Tuple[int, int, bool]: """ Checks for the presence of chevrons ('<' or '>') in the start and stop strings, removes them if present, and converts the remaining parts to integers. @@ -792,11 +942,11 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b :return: A tuple containing the integer values of start and stop, and a boolean indicating if chevrons were present in either string. """ - chevrons_present = '>' in start or '<' in start or '>' in stop or '<' in stop + chevrons_present = ">" in start or "<" in start or ">" in stop or "<" in stop if chevrons_present: - start = int(start.replace('<', '').replace('>', '')) - stop = int(stop.replace('<', '').replace('>', '')) + start = int(start.replace("<", "").replace(">", "")) + stop = int(stop.replace("<", "").replace(">", "")) else: start = int(start) stop = int(stop) @@ -819,62 +969,89 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b if has_fasta: fasta_string += line - elif line.startswith('##', 0, 2): - if line.startswith('FASTA', 2, 7): + elif line.startswith("##", 0, 2): + if line.startswith("FASTA", 2, 7): has_fasta = True - elif line.startswith('sequence-region', 2, 17): + elif line.startswith("sequence-region", 2, 17): fields = [el.strip() for el in line.split()] with contig_counter.get_lock(): - contig = Contig(contig_counter.value, fields[1], - True if fields[1] in circular_contigs else False) + contig = Contig( + contig_counter.value, + fields[1], + True if fields[1] in circular_contigs else False, + ) contig_counter.value += 1 org.add(contig) contig.length = int(fields[-1]) - int(fields[2]) + 1 else: continue - elif line.startswith('#'): - if line.startswith('Sequence Data', 2, 15): # GFF from prodigal - fields_prodigal = [el.strip() for el in line.split(': ')[1].split(";")] - attr_prodigal = {field.split("=")[0]: field.split("=")[1] for field in fields_prodigal} + elif line.startswith("#"): + if line.startswith("Sequence Data", 2, 15): # GFF from prodigal + fields_prodigal = [ + el.strip() for el in line.split(": ")[1].split(";") + ] + attr_prodigal = { + field.split("=")[0]: field.split("=")[1] + for field in fields_prodigal + } else: # comment lines to be ignores by parsers continue - elif line.rstrip() == "": # empty lines are not expected, but they do not carry information, so we'll ignore them + elif ( + line.rstrip() == "" + ): # empty lines are not expected, but they do not carry information, so we'll ignore them continue else: - fields_gff = [el.strip() for el in line.split('\t')] + fields_gff = [el.strip() for el in line.split("\t")] attributes = get_gff_attributes(fields_gff) pseudogene = False - gene_start, gene_stop, has_chevron = check_chevrons_in_start_and_stop(start=fields_gff[gff_start], - stop=fields_gff[gff_end]) + gene_start, gene_stop, has_chevron = check_chevrons_in_start_and_stop( + start=fields_gff[gff_start], stop=fields_gff[gff_end] + ) - if fields_gff[gff_type] == 'region': + if fields_gff[gff_type] == "region": # keep region attributes to add them as metadata of genome and contigs # excluding some info as they are already contained in contig object. - contig_name_to_region_info[fields_gff[gff_seqname]] = {tag.lower(): value for tag, value in - attributes.items() if - tag not in ['ID', "NAME", "IS_CIRCULAR", - "DB_XREF", "DBXREF"]} + contig_name_to_region_info[fields_gff[gff_seqname]] = { + tag.lower(): value + for tag, value in attributes.items() + if tag not in ["ID", "NAME", "IS_CIRCULAR", "DB_XREF", "DBXREF"] + } - if "DB_XREF" in attributes or "DBXREF" in attributes: # db_xref can be written Dbxref and db_ref + if ( + "DB_XREF" in attributes or "DBXREF" in attributes + ): # db_xref can be written Dbxref and db_ref dbxref_tag = "DB_XREF" if "DB_XREF" in attributes else "DBXREF" - dbxref_metadata = parse_db_xref_metadata(attributes[dbxref_tag].split(','), gff_file_path) - contig_name_to_region_info[fields_gff[gff_seqname]].update(dbxref_metadata) - - if fields_gff[gff_seqname] in circular_contigs or ('IS_CIRCULAR' in attributes and - attributes['IS_CIRCULAR'] == "true"): - # WARNING: In case we have prodigal gff with is_circular attributes. - # This would fail as contig is not defined. However, is_circular should not be found in prodigal gff - logging.getLogger("PPanGGOLiN").debug(f"Contig {contig.name} is circular.") - contig.is_circular = True - assert contig.name == fields_gff[gff_seqname] + dbxref_metadata = parse_db_xref_metadata( + attributes[dbxref_tag].split(","), gff_file_path + ) + contig_name_to_region_info[fields_gff[gff_seqname]].update( + dbxref_metadata + ) + + if ( + "IS_CIRCULAR" in attributes + and attributes["IS_CIRCULAR"] == "true" + ): + contig_name = fields_gff[gff_seqname] + + if contig is not None: + logging.getLogger("PPanGGOLiN").debug( + f"Contig {contig.name} is circular." + ) + contig.is_circular = True + assert contig.name == contig_name + else: + # contig object has not been initialized yet. + # let's keep the circularity info in the circular_contigs list + circular_contigs.append(contig_name) - elif fields_gff[gff_type] == 'CDS' or "RNA" in fields_gff[gff_type]: + elif fields_gff[gff_type] == "CDS" or "RNA" in fields_gff[gff_type]: id_attribute = get_id_attribute(attributes) locus_tag = attributes.get("LOCUS_TAG") @@ -889,41 +1066,52 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b else: gene_id = id_attribute - name = attributes.pop('NAME', attributes.pop('GENE', "")) + name = attributes.pop("NAME", attributes.pop("GENE", "")) if "PSEUDO" in attributes or "PSEUDOGENE" in attributes: pseudogene = True - if ("PARTIAL" in attributes and attributes["PARTIAL"].upper() == "TRUE") or has_chevron: + if ( + "PARTIAL" in attributes + and attributes["PARTIAL"].upper() == "TRUE" + ) or has_chevron: is_partial = True else: is_partial = False - product = attributes.pop('PRODUCT', "") - + product = attributes.pop("PRODUCT", "") + if has_non_ascii(product): - + logging.getLogger("PPanGGOLiN").warning( - f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " - "These characters cannot be stored in the HDF5 file and will be replaced by underscores." - ) + f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " + "These characters cannot be stored in the HDF5 file and will be replaced by underscores." + ) product = replace_non_ascii(product) - if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig try: contig = org.get(fields_gff[gff_seqname]) except KeyError: with contig_counter.get_lock(): - contig = Contig(contig_counter.value, fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) + contig = Contig( + contig_counter.value, + fields_gff[gff_seqname], + ( + True + if fields_gff[gff_seqname] in circular_contigs + else False + ), + ) contig_counter.value += 1 org.add(contig) if attr_prodigal is not None: contig.length = int(attr_prodigal["seqlen"]) - if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): + if fields_gff[gff_type] == "CDS" and ( + not pseudogene or (pseudogene and pseudo) + ): if "TRANSL_TABLE" in attributes: genetic_code = int(attributes["TRANSL_TABLE"]) @@ -936,17 +1124,21 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b if fields_gff[frame] in ["1", "2", "0"]: gene_frame = int(fields_gff[frame]) - if gene_id in id_attr_to_gene_id: # the ID has already been seen at least once in this genome - existing_gene = id_attr_to_gene_id[gene_id] - new_gene_info = {"strand": fields_gff[gff_strand], - "type": fields_gff[gff_type], - "name": name, - "position": contig.number_of_genes, - "product": product, - "local_identifier": gene_id, - "start": gene_start, - "stop": gene_stop, - "frame": gene_frame} + if ( + id_attribute in id_attr_to_gene_id + ): # the ID has already been seen at least once in this genome + existing_gene = id_attr_to_gene_id[id_attribute] + new_gene_info = { + "strand": fields_gff[gff_strand], + "type": fields_gff[gff_type], + "name": name, + "position": contig.number_of_genes, + "product": product, + "local_identifier": gene_id, + "start": gene_start, + "stop": gene_stop, + "frame": gene_frame, + } check_and_add_extra_gene_part(existing_gene, new_gene_info) @@ -954,13 +1146,22 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) - id_attr_to_gene_id[gene_id] = gene + id_attr_to_gene_id[id_attribute] = gene # here contig is filled in order, so position is the number of genes already stored in the contig. - gene.fill_annotations(start=gene_start, stop=gene_stop, strand=fields_gff[gff_strand], - gene_type=fields_gff[gff_type], name=name, product=product, - position=contig.number_of_genes, local_identifier=gene_id, - genetic_code=genetic_code, is_partial=is_partial, frame=gene_frame) + gene.fill_annotations( + start=gene_start, + stop=gene_stop, + strand=fields_gff[gff_strand], + gene_type=fields_gff[gff_type], + name=name, + product=product, + position=contig.number_of_genes, + local_identifier=gene_id, + genetic_code=genetic_code, + is_partial=is_partial, + frame=gene_frame, + ) gene.fill_parents(org, contig) gene_counter += 1 @@ -969,35 +1170,44 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b elif "RNA" in fields_gff[gff_type]: rna_type = fields_gff[gff_type] - rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4)) - - rna.fill_annotations(start=gene_start, stop=gene_stop, strand=fields_gff[gff_strand], - gene_type=fields_gff[gff_type], name=name, product=product, - local_identifier=gene_id) + rna = RNA( + org.name + f"_{rna_type}_" + str(rna_counter).zfill(4) + ) + + rna.fill_annotations( + start=gene_start, + stop=gene_stop, + strand=fields_gff[gff_strand], + gene_type=fields_gff[gff_type], + name=name, + product=product, + local_identifier=gene_id, + ) rna.fill_parents(org, contig) rna_counter += 1 contig.add_rna(rna) - # Correct coordinates of genes that overlap the edge of circulars contig - correct_putative_overlaps(org.contigs) - # Fix partial genes coordinates for contig in org.contigs: for gene in contig.genes: if gene.is_partial: - is_complement = gene.strand == '-' - gene.coordinates = fix_partial_gene_coordinates(gene.coordinates, is_complement=is_complement, - start_shift=gene.frame) + is_complement = gene.strand == "-" + gene.coordinates = fix_partial_gene_coordinates( + gene.coordinates, + is_complement=is_complement, + start_shift=gene.frame, + ) # GET THE FASTA SEQUENCES OF THE GENES if fasta_string == "": has_fasta = False if has_fasta: - contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length + contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n")) + + correct_putative_overlaps(org.contigs) + for contig in org.contigs: - if contig.length != len(contig_sequences[contig.name]): - raise ValueError("The contig length defined is different than the sequence length") for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -1009,15 +1219,18 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b add_metadata_from_gff_file(contig_name_to_region_info, org, gff_file_path) if used_transl_table_arg: - logging.getLogger("PPanGGOLiN").info( + logging.getLogger("PPanGGOLiN").debug( f"transl_table tag was not found for {used_transl_table_arg} CDS " f"in {gff_file_path}. Provided translation_table argument value was used instead: {translation_table}." ) return org, has_fasta -def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, Dict[str, str]], - org: Organism, gff_file_path: Path): +def add_metadata_from_gff_file( + contig_name_to_region_info: Dict[str, Dict[str, str]], + org: Organism, + gff_file_path: Path, +): """ Add metadata to the organism object from a GFF file. @@ -1028,10 +1241,15 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, Dict[str, s # Check if the number of contigs matches the expected number in the organism if len(contig_name_to_region_info) == org.number_of_contigs: - contig_to_region_info = {org.get(name): region_info for name, region_info in contig_name_to_region_info.items()} - genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_region_info) + contig_to_region_info = { + org.get(name): region_info + for name, region_info in contig_name_to_region_info.items() + } + genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata( + contig_to_region_info + ) if genome_metadata: - org.add_metadata(Metadata(source='annotation_file', **genome_metadata)) + org.add_metadata(Metadata(source="annotation_file", **genome_metadata)) else: logging.getLogger("PPanGGOLiN").warning( f"Inconsistent data in GFF file {gff_file_path}: " @@ -1040,10 +1258,12 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, Dict[str, s contig_to_uniq_metadata = {} for contig, metadata_dict in contig_to_uniq_metadata.items(): - contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) + contig.add_metadata(Metadata(source="annotation_file", **metadata_dict)) -def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separation: int = 10): +def check_and_add_extra_gene_part( + gene: Gene, new_gene_info: Dict, max_separation: int = 10 +): """ Checks and potentially adds extra gene parts based on new gene information. This is done before checking for potential overlapping edge genes. Gene coordinates are expected to be in ascending order, and no circularity is taken into account here. @@ -1059,26 +1279,31 @@ def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separatio # Compare attributes of the existing gene with new_gene_info comparison = [ - gene.strand == new_gene_info['strand'], + gene.strand == new_gene_info["strand"], gene.type == new_gene_info["type"], - gene.product == new_gene_info['product'], - gene.name == new_gene_info['name'], - gene.local_identifier == new_gene_info['local_identifier'] + gene.product == new_gene_info["product"], + gene.name == new_gene_info["name"], + gene.local_identifier == new_gene_info["local_identifier"], ] if all(comparison): # The new gene info seems concordant with the gene object. We can try to merge them - assert new_gene_info['start'] <= new_gene_info['stop'], "Start is greater than stop. Incorrect coordinates." + assert ( + new_gene_info["start"] <= new_gene_info["stop"] + ), "Start is greater than stop. Incorrect coordinates." - new_gene_is_before = (gene.strand == "+" and new_gene_info['start'] < gene.start) or ( - gene.strand == "-" and new_gene_info['start'] > gene.start) + new_gene_is_before = ( + gene.strand == "+" and new_gene_info["start"] < gene.start + ) or (gene.strand == "-" and new_gene_info["start"] > gene.start) if new_gene_is_before: - # new gene start if before the current gene - # so its frame if used - gene.frame = new_gene_info['frame'] + # new gene start if before the current gene + # so its frame if used + gene.frame = new_gene_info["frame"] # Add new coordinates to gene's coordinates - gene.coordinates = sorted(gene.coordinates + [(new_gene_info['start'], new_gene_info['stop'])]) + gene.coordinates = sorted( + gene.coordinates + [(new_gene_info["start"], new_gene_info["stop"])] + ) # Check if the coordinates are within the allowed maximum separation first_stop = gene.coordinates[0][1] @@ -1087,7 +1312,8 @@ def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separatio # This is maybe to restrictive but let's go with that first. raise ValueError( f"The coordinates of genes are too far apart ({abs(start - first_stop)}nt). This is unexpected. " - f"Gene coordinates : {gene.coordinates}") + f"Gene coordinates : {gene.coordinates}" + ) # Update start and stop positions based on new coordinates gene.start, gene.stop = gene.coordinates[0][0], gene.coordinates[-1][1] @@ -1095,7 +1321,8 @@ def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separatio logging.getLogger("PPanGGOLiN").debug( f"Gene {new_gene_info['local_identifier']} is found in multiple parts. " "These parts are merged into one gene. " - f"New gene coordinates: {gene.coordinates}") + f"New gene coordinates: {gene.coordinates}" + ) else: detailed_comparison = { @@ -1107,7 +1334,8 @@ def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separatio } raise ValueError( - f"Two genes have the same id attributes but different info in some key attribute. {detailed_comparison}") + f"Two genes have the same id attributes but different info in some key attribute. {detailed_comparison}" + ) def correct_putative_overlaps(contigs: Iterable[Contig]): @@ -1130,16 +1358,20 @@ def correct_putative_overlaps(contigs: Iterable[Contig]): if start > len(contig): if len(new_coordinates) == 0: - raise ValueError(f"First gene start position ({start}) is higher than contig " - f"length ({len(contig)}). This case is not handled.") + raise ValueError( + f"First gene start position ({start}) is higher than contig " + f"length ({len(contig)}). This case is not handled." + ) new_start = start - len(contig) new_stop = stop - len(contig) new_coordinates.append((new_start, new_stop)) - warn_msg = (f"Start position ({start}) for gene {gene.name} is higher than contig {contig.name}" - f" length ({len(contig)}). New coordinate are {new_coordinates}") + warn_msg = ( + f"Start position ({start}) for gene {gene.name} is higher than contig {contig.name}" + f" length ({len(contig)}). New coordinate are {new_coordinates}" + ) logging.getLogger("PPanGGOLiN").warning(warn_msg) elif stop > len(contig): # Handle overlapping gene @@ -1162,8 +1394,13 @@ def correct_putative_overlaps(contigs: Iterable[Contig]): gene.coordinates = new_coordinates -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, - pseudo: bool = False, translation_table: int = 11) -> Tuple[Organism, bool]: +def read_anno_file( + organism_name: str, + filename: Path, + circular_contigs: list, + pseudo: bool = False, + translation_table: int = 11, +) -> Tuple[Organism, bool]: """ Read a GBFF file for one organism @@ -1179,26 +1416,38 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, filetype = detect_filetype(filename) if filetype == "gff": try: - org, has_fasta = read_org_gff(organism_name, filename, circular_contigs, pseudo, translation_table) + org, has_fasta = read_org_gff( + organism_name, filename, circular_contigs, pseudo, translation_table + ) except Exception as err: - raise Exception(f"Reading the gff3 file '{filename}' raised an error. {err}") + raise Exception( + f"Reading the gff3 file '{filename}' raised an error. {err}" + ) else: return org, has_fasta elif filetype == "gbff": try: - org, has_fasta = read_org_gbff(organism_name, filename, circular_contigs, pseudo, translation_table) + org, has_fasta = read_org_gbff( + organism_name, filename, circular_contigs, pseudo, translation_table + ) except Exception as err: - raise Exception(f"Reading the gbff file '{filename}' raised an error. {err}") + raise Exception( + f"Reading the gbff file '{filename}' raised an error. {err}" + ) else: return org, has_fasta elif filetype == "fasta": - raise ValueError(f"Invalid file type provided for parameter '--anno'. " - f"The file '{filename}' looks like a fasta file. " - "Please use a .gff or .gbff file. You may be able to use --fasta instead of --anno.") + raise ValueError( + f"Invalid file type provided for parameter '--anno'. " + f"The file '{filename}' looks like a fasta file. " + "Please use a .gff or .gbff file. You may be able to use --fasta instead of --anno." + ) else: - raise ValueError(f"Invalid file type provided for parameter '--anno'. " - f"The file '{filename}' appears to be of type '{filetype}'. Please use .gff or .gbff files.") + raise ValueError( + f"Invalid file type provided for parameter '--anno'. " + f"The file '{filename}' appears to be of type '{filetype}'. Please use .gff or .gbff files." + ) def chose_gene_identifiers(pangenome: Pangenome) -> bool: @@ -1214,8 +1463,12 @@ def chose_gene_identifiers(pangenome: Pangenome) -> bool: if local_identifiers_are_unique(pangenome.genes): for gene in pangenome.genes: - gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers - gene.local_identifier = "" # this is now useless, setting it to default value + gene.ID = ( + gene.local_identifier + ) # Erase ppanggolin generated gene ids and replace with local identifiers + gene.local_identifier = ( + "" # this is now useless, setting it to default value + ) pangenome._mk_gene_getter() # re-build the gene getter return True @@ -1243,9 +1496,14 @@ def local_identifiers_are_unique(genes: Iterable[Gene]) -> bool: return True -def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, pseudo: bool = False, - translation_table: int = 11, - disable_bar: bool = False): +def read_annotations( + pangenome: Pangenome, + organisms_file: Path, + cpu: int = 1, + pseudo: bool = False, + translation_table: int = 11, + disable_bar: bool = False, +): """ Read the annotation from GBFF file @@ -1257,26 +1515,34 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p :param disable_bar: Disable the progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of genome files ...") + logging.getLogger("PPanGGOLiN").info( + f"Reading {organisms_file.name} the list of genome files ..." + ) pangenome.status["geneSequences"] = "Computed" # we assume there are gene sequences in the annotation files, # unless a gff file without fasta is met (which is the only case where sequences can be absent) args = [] for line in read_compressed_or_not(organisms_file): - if not line.strip() or line.strip().startswith('#'): + if not line.strip() or line.strip().startswith("#"): continue elements = [el.strip() for el in line.split("\t")] org_path = Path(elements[1]) name = elements[0] circular_contigs = elements[2:] - if not org_path.exists(): # Check tsv sanity test if it's not one it's the other + if ( + not org_path.exists() + ): # Check tsv sanity test if it's not one it's the other org_path = organisms_file.parent.joinpath(org_path) args.append((name, org_path, circular_contigs, pseudo, translation_table)) - with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, - initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + with ProcessPoolExecutor( + mp_context=get_context("fork"), + max_workers=cpu, + initializer=init_contig_counter, + initargs=(contig_counter,), + ) as executor: with tqdm(total=len(args), unit="file", disable=disable_bar) as progress: futures = [] @@ -1295,15 +1561,21 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p # decide whether we use local ids or ppanggolin ids. used_local_identifiers = chose_gene_identifiers(pangenome) if used_local_identifiers: - logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were unique, " - "PPanGGOLiN will use them.") + logging.getLogger("PPanGGOLiN").info( + "gene identifiers used in the provided annotation files were unique, " + "PPanGGOLiN will use them." + ) else: - logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were not unique, " - "PPanGGOLiN will use self-generated identifiers.") + logging.getLogger("PPanGGOLiN").info( + "gene identifiers used in the provided annotation files were not unique, " + "PPanGGOLiN will use self-generated identifiers." + ) pangenome.status["genomesAnnotated"] = "Computed" pangenome.parameters["annotate"] = {} - pangenome.parameters["annotate"]["# used_local_identifiers"] = used_local_identifiers + pangenome.parameters["annotate"][ + "# used_local_identifiers" + ] = used_local_identifiers pangenome.parameters["annotate"]["use_pseudo"] = pseudo pangenome.parameters["annotate"]["# read_annotations_from_file"] = True @@ -1316,7 +1588,9 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.status["metasources"]["contigs"].append("annotation_file") -def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path, disable_bar: bool = False): +def get_gene_sequences_from_fastas( + pangenome: Pangenome, fasta_files: Path, disable_bar: bool = False +): """ Get gene sequences from fastas @@ -1328,51 +1602,88 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path, disa for line in read_compressed_or_not(fasta_files): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.getLogger("PPanGGOLiN").error("No tabulation separator found in genome file") + logging.getLogger("PPanGGOLiN").error( + "No tabulation separator found in genome file" + ) exit(1) try: org = pangenome.get_organism(elements[0]) except KeyError: - raise KeyError(f"One of the genome in your '{fasta_files}' was not found in the pan." - f" This might mean that the genome names between your annotation file and " - f"your fasta file are different.") + raise KeyError( + f"One of the genome in your '{fasta_files}' was not found in the pan." + f" This might mean that the genome names between your annotation file and " + f"your fasta file are different." + ) with read_compressed_or_not(Path(elements[1])) as currFastaFile: - fasta_dict[org] = read_fasta(org, currFastaFile) + fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile) + + # When dealing with GFF files, some genes may have coordinates extending beyond the actual + # length of contigs, especially when they overlap the edges. This usually needs to be split + # into two parts to handle the circular genome wrapping. + # If the GFF file lacks associated FASTA sequences and it was not possible to determine the + # contig length from the GFF file, we must apply this correction while parsing the external FASTA file. + + correct_putative_overlaps(org.contigs) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) - raise KeyError(f"Not all of your pangenome genomes are present within the provided fasta file. " - f"{missing} are missing (out of {pangenome.number_of_organisms}).") + missing = pangenome.number_of_organisms - len( + set(pangenome.organisms) & set(fasta_dict.keys()) + ) + raise KeyError( + f"Not all of your pangenome genomes are present within the provided fasta file. " + f"{missing} are missing (out of {pangenome.number_of_organisms})." + ) elif pangenome.number_of_organisms < len(fasta_dict): # Indicates that all organisms in the pangenome are present in the provided FASTA file, # but additional genomes are also detected in the file. diff_genomes = len(fasta_dict) - pangenome.number_of_organisms - logging.getLogger("PPanGGOLiN").warning(f"The provided fasta file contains {diff_genomes} " - "additional genomes compared to the pangenome.") + logging.getLogger("PPanGGOLiN").warning( + f"The provided fasta file contains {diff_genomes} " + "additional genomes compared to the pangenome." + ) - with tqdm(total=pangenome.number_of_genes, unit="gene", disable=disable_bar, - desc="Add sequences to genes") as bar: + with tqdm( + total=pangenome.number_of_genes, + unit="gene", + disable=disable_bar, + desc="Add sequences to genes", + ) as bar: for org in pangenome.organisms: for contig in org.contigs: try: for gene in contig.genes: - gene.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], gene)) + gene.add_sequence( + get_dna_sequence(fasta_dict[org][contig.name], gene) + ) bar.update() # for rna in contig.RNAs: # rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna)) except KeyError: - msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \ - f"that was read from the annotation file. " - msg += f"The provided contigs in the fasta were : " \ - f"{', '.join(fasta_dict[org].keys())}." + msg = ( + f"Fasta file for genome {org.name} did not have the contig {contig.name} " + f"that was read from the annotation file. " + ) + msg += ( + f"The provided contigs in the fasta were : " + f"{', '.join(fasta_dict[org].keys())}." + ) raise KeyError(msg) pangenome.status["geneSequences"] = "Computed" -def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, - procedure: str = None, disable_bar: bool = False): +def annotate_pangenome( + pangenome: Pangenome, + fasta_list: Path, + tmpdir: str, + cpu: int = 1, + translation_table: int = 11, + kingdom: str = "bacteria", + norna: bool = False, + allow_overlap: bool = False, + procedure: str = None, + disable_bar: bool = False, +): """ Main function to annotate a pangenome @@ -1388,7 +1699,9 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param disable_bar: Disable the progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of genome files") + logging.getLogger("PPanGGOLiN").info( + f"Reading {fasta_list} the list of genome files" + ) arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): @@ -1396,18 +1709,37 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: elements = [el.strip() for el in line.split("\t")] org_path = Path(elements[1]) - if not org_path.exists(): # Check tsv sanity test if it's not one it's the other + if ( + not org_path.exists() + ): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) - arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, - norna, kingdom, allow_overlap, procedure)) + arguments.append( + ( + elements[0], + org_path, + elements[2:], + tmpdir, + translation_table, + norna, + kingdom, + allow_overlap, + procedure, + ) + ) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") - with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, - initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + logging.getLogger("PPanGGOLiN").info( + f"Annotating {len(arguments)} genomes using {cpu} cpus..." + ) + with ProcessPoolExecutor( + mp_context=get_context("fork"), + max_workers=cpu, + initializer=init_contig_counter, + initargs=(contig_counter,), + ) as executor: with tqdm(total=len(arguments), unit="file", disable=disable_bar) as progress: futures = [] @@ -1421,12 +1753,16 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: logging.getLogger("PPanGGOLiN").info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. - pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. + pangenome.status["geneSequences"] = ( + "Computed" # the gene objects have their respective gene sequences. + ) pangenome.parameters["annotate"] = {} pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["kingdom"] = kingdom pangenome.parameters["annotate"]["translation_table"] = translation_table - pangenome.parameters["annotate"]["prodigal_procedure"] = None if procedure is None else procedure + pangenome.parameters["annotate"]["prodigal_procedure"] = ( + None if procedure is None else procedure + ) pangenome.parameters["annotate"]["allow_overlap"] = allow_overlap pangenome.parameters["annotate"]["# read_annotations_from_file"] = False @@ -1441,28 +1777,53 @@ def launch(args: argparse.Namespace): filename = mk_file_name(args.basename, args.output, args.force) pangenome = Pangenome() if args.fasta is not None and args.anno is None: - annotate_pangenome(pangenome, args.fasta, tmpdir=args.tmpdir, cpu=args.cpu, procedure=args.prodigal_procedure, - translation_table=args.translation_table, kingdom=args.kingdom, norna=args.norna, - allow_overlap=args.allow_overlap, disable_bar=args.disable_prog_bar) + annotate_pangenome( + pangenome, + args.fasta, + tmpdir=args.tmpdir, + cpu=args.cpu, + procedure=args.prodigal_procedure, + translation_table=args.translation_table, + kingdom=args.kingdom, + norna=args.norna, + allow_overlap=args.allow_overlap, + disable_bar=args.disable_prog_bar, + ) elif args.anno is not None: # TODO add warning for option not compatible with read_annotations - read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, - translation_table=args.translation_table, disable_bar=args.disable_prog_bar) + read_annotations( + pangenome, + args.anno, + cpu=args.cpu, + pseudo=args.use_pseudo, + translation_table=args.translation_table, + disable_bar=args.disable_prog_bar, + ) if pangenome.status["geneSequences"] == "No": if args.fasta: - logging.getLogger("PPanGGOLiN").info(f"Get sequences from FASTA file: {args.fasta}") - get_gene_sequences_from_fastas(pangenome, args.fasta, disable_bar=args.disable_prog_bar) + logging.getLogger("PPanGGOLiN").info( + f"Get sequences from FASTA file: {args.fasta}" + ) + get_gene_sequences_from_fastas( + pangenome, args.fasta, disable_bar=args.disable_prog_bar + ) else: - logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, " - "and you did not provide fasta sequences. " - "Thus it was not possible to get the gene sequences.") - logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis " - "ONLY if you provide the clustering results in the next step.") + logging.getLogger("PPanGGOLiN").warning( + "You provided gff files without sequences, " + "and you did not provide fasta sequences. " + "Thus it was not possible to get the gene sequences." + ) + logging.getLogger("PPanGGOLiN").warning( + "You will be able to proceed with your analysis " + "ONLY if you provide the clustering results in the next step." + ) else: if args.fasta: - logging.getLogger("PPanGGOLiN").warning("You provided fasta sequences " - "but your gff files were already with sequences." - "PPanGGOLiN will use sequences in GFF and not from your fasta.") + logging.getLogger("PPanGGOLiN").warning( + "You provided fasta sequences " + "but your gff files were already with sequences." + "PPanGGOLiN will use sequences in GFF and not from your fasta." + ) write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) @@ -1474,7 +1835,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("annotate", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "annotate", formatter_class=argparse.RawTextHelpFormatter + ) parser_annot(parser) return parser @@ -1486,49 +1849,113 @@ def parser_annot(parser: argparse.ArgumentParser): :param parser: parser for annotate argument """ date = time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per genome.") - required.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per genome. " - "If this is provided, those annotations will be used.") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "--fasta", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per genome.", + ) + required.add_argument( + "--anno", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " + "If this is provided, those annotations will be used.", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path(f'ppanggolin_output{date}_PID{str(os.getpid())}'), - help="Output directory") - optional.add_argument('--allow_overlap', required=False, action='store_true', default=False, - help="Use to not remove genes overlapping with RNA features.") - optional.add_argument("--norna", required=False, action="store_true", default=False, - help="Use to avoid annotating RNA features.") - optional.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", - choices=["bacteria", "archaea"], - help="Kingdom to which the prokaryota belongs to, " - "to know which models to use for rRNA annotation.") - optional.add_argument("--translation_table", required=False, type=int, default=11, - help="Translation table (genetic code) to use.") - optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") - optional.add_argument("--use_pseudo", required=False, action="store_true", - help="In the context of provided annotation, use this option to read pseudogenes. " - "(Default behavior is to ignore them)") - optional.add_argument("-p", "--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], - default=None, help="Allow to force the prodigal procedure. " - "If nothing given, PPanGGOLiN will decide in function of contig length") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - - -if __name__ == '__main__': + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path(f"ppanggolin_output{date}_PID{str(os.getpid())}"), + help="Output directory", + ) + optional.add_argument( + "--allow_overlap", + required=False, + action="store_true", + default=False, + help="Use to not remove genes overlapping with RNA features.", + ) + optional.add_argument( + "--norna", + required=False, + action="store_true", + default=False, + help="Use to avoid annotating RNA features.", + ) + optional.add_argument( + "--kingdom", + required=False, + type=str.lower, + default="bacteria", + choices=["bacteria", "archaea"], + help="Kingdom to which the prokaryota belongs to, " + "to know which models to use for rRNA annotation.", + ) + optional.add_argument( + "--translation_table", + required=False, + type=int, + default=11, + help="Translation table (genetic code) to use.", + ) + optional.add_argument( + "--basename", + required=False, + default="pangenome", + help="basename for the output file", + ) + optional.add_argument( + "--use_pseudo", + required=False, + action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. " + "(Default behavior is to ignore them)", + ) + optional.add_argument( + "-p", + "--prodigal_procedure", + required=False, + type=str.lower, + choices=["single", "meta"], + default=None, + help="Allow to force the prodigal procedure. " + "If nothing given, PPanGGOLiN will decide in function of contig length", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_annot(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 6ad10365..f3c74a92 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -9,7 +9,7 @@ from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Generator, Tuple from pathlib import Path # install libraries @@ -19,7 +19,7 @@ from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import is_compressed, read_compressed_or_not -contig_counter: Value = Value('i', 0) +contig_counter: Value = Value("i", 0) def init_contig_counter(value: Value): @@ -36,8 +36,23 @@ def reverse_complement(seq: str): :return: reverse sequence """ - complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'R': 'Y', 'Y': 'R', - 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K', 'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D'} + complement = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "N": "N", + "R": "Y", + "Y": "R", + "S": "S", + "W": "W", + "K": "M", + "M": "K", + "B": "V", + "V": "B", + "D": "H", + "H": "D", + } # see https://www.bioinformatics.org/sms/iupac.html for the code. rcseq = "" for i in reversed(seq): @@ -45,7 +60,9 @@ def reverse_complement(seq: str): return rcseq -def launch_aragorn(fna_file: str, org: Organism, contig_to_length: Dict[str, int]) -> defaultdict: +def launch_aragorn( + fna_file: str, org: Organism, contig_to_length: Dict[str, int] +) -> defaultdict: """ Launches Aragorn to annotate tRNAs. @@ -59,7 +76,7 @@ def launch_aragorn(fna_file: str, org: Organism, contig_to_length: Dict[str, int logging.getLogger("PPanGGOLiN").debug(f"aragorn command : {' '.join(cmd)}") p = Popen(cmd, stdout=PIPE) # loading the whole thing, reverting it to 'pop' in order. - file_data = p.communicate()[0].decode().split("\n")[:: -1] + file_data = p.communicate()[0].decode().split("\n")[::-1] gene_objs = defaultdict(set) c = 0 contig_name = "" @@ -73,24 +90,40 @@ def launch_aragorn(fna_file: str, org: Organism, contig_to_length: Dict[str, int start, stop = map(int, ast.literal_eval(line_data[2].replace("c", ""))) if start < 1 or stop < 1: # In some case aragorn gives negative coordinates. This case is just ignored. - logging.warning(f'Aragorn gives non valid coordiates for a RNA gene in contig {contig_name}: {line_data}. This RNA is ignored.') + logging.warning( + f"Aragorn gives non valid coordiates for a RNA gene in contig {contig_name}: {line_data}. This RNA is ignored." + ) continue - if start > contig_to_length[contig_name] or stop > contig_to_length[contig_name]: - logging.warning(f'Aragorn gives non valide coordiates for a RNA gene in contig {contig_name}. ' - f'Gene coordinates exceed contig length ({contig_to_length[contig_name]}): ' - f'{line_data}. This RNA is ignored.') + if ( + start > contig_to_length[contig_name] + or stop > contig_to_length[contig_name] + ): + logging.warning( + f"Aragorn gives non valide coordiates for a RNA gene in contig {contig_name}. " + f"Gene coordinates exceed contig length ({contig_to_length[contig_name]}): " + f"{line_data}. This RNA is ignored." + ) continue c += 1 - gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4)) - gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+", - gene_type="tRNA", product=line_data[1] + line_data[4]) + gene = RNA(rna_id=locustag + "_tRNA_" + str(c).zfill(4)) + gene.fill_annotations( + start=start, + stop=stop, + strand="-" if line_data[2].startswith("c") else "+", + gene_type="tRNA", + product=line_data[1] + line_data[4], + ) gene_objs[contig_name].add(gene) return gene_objs -def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int = 11, - use_meta: bool = False) -> defaultdict: +def launch_prodigal( + contig_sequences: Dict[str, str], + org: Organism, + code: int = 11, + use_meta: bool = False, +) -> defaultdict: """ Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the pred genes. @@ -102,29 +135,40 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int = :return: Annotated genes in a list of gene objects """ gene_objs = defaultdict(set) - sequences = {contig_name: Sequence(sequence) for contig_name, sequence in contig_sequences.items()} + sequences = { + contig_name: Sequence(sequence) + for contig_name, sequence in contig_sequences.items() + } gene_finder = GeneFinder( meta=use_meta, # '-p meta' if meta is true else '-p single' closed=True, # -c: Closed ends. Do not allow genes to run off edges. mask=True, # -m: Treat runs of N as masked sequence; don't build genes across them. - min_gene=120 # This is to prevent error with mmseqs translatenucs that cut too short sequences + min_gene=120, # This is to prevent error with mmseqs translatenucs that cut too short sequences ) if not use_meta: - gene_finder.train(*contig_sequences.values(), force_nonsd=False, - translation_table=code) # -g: Specify a translation table to use (default 11). + gene_finder.train( + *contig_sequences.values(), force_nonsd=False, translation_table=code + ) # -g: Specify a translation table to use (default 11). gene_counter = 1 for contig_name, sequence in sequences.items(): for pred in gene_finder.find_genes(sequence): gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}") - gene.fill_annotations(start=pred.begin, stop=pred.end, strand='-' if pred.strand == -1 else '+', - gene_type="CDS", genetic_code=code) + gene.fill_annotations( + start=pred.begin, + stop=pred.end, + strand="-" if pred.strand == -1 else "+", + gene_type="CDS", + genetic_code=code, + ) gene_counter += 1 gene_objs[contig_name].add(gene) return gene_objs -def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict: +def launch_infernal( + fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria" +) -> defaultdict: """ Launches Infernal in hmmer-only mode to annotate rRNAs. @@ -138,21 +182,39 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "b locustag = org.name modelfile = "" if kingdom == "bacteria": - modelfile = os.path.dirname(os.path.realpath(__file__)) + "/rRNA_DB/rRNA_bact.cm" + modelfile = ( + os.path.dirname(os.path.realpath(__file__)) + "/rRNA_DB/rRNA_bact.cm" + ) elif kingdom == "archaea": - modelfile = os.path.dirname(os.path.realpath(__file__)) + "/rRNA_DB/rRNA_arch.cm" + modelfile = ( + os.path.dirname(os.path.realpath(__file__)) + "/rRNA_DB/rRNA_arch.cm" + ) tmp_file = tempfile.NamedTemporaryFile(mode="r", dir=tmpdir) - cmd = ["cmscan", "--tblout", tmp_file.name, "--hmmonly", "--cpu", str(1), "--noali", modelfile, fna_file] + cmd = [ + "cmscan", + "--tblout", + tmp_file.name, + "--hmmonly", + "--cpu", + str(1), + "--noali", + modelfile, + fna_file, + ] logging.getLogger("PPanGGOLiN").debug(f"infernal command : {' '.join(cmd)}") p = Popen(cmd, stdout=open(os.devnull, "w"), stderr=PIPE) err = p.communicate()[1].decode().split() if err: - if err[0] == 'Error: ': - raise Exception(f"Infernal (cmscan) failed with error: '{' '.join(err)}'. If you never used this script," - f" you should press the .cm file using cmpress executable from Infernal. " - f"You should find the file in '{os.path.dirname(os.path.realpath(__file__))}/rRNA_DB/'.") - raise Exception(f"An error occurred with Infernal. Error is: '{' '.join(err)}'.") + if err[0] == "Error: ": + raise Exception( + f"Infernal (cmscan) failed with error: '{' '.join(err)}'. If you never used this script," + f" you should press the .cm file using cmpress executable from Infernal. " + f"You should find the file in '{os.path.dirname(os.path.realpath(__file__))}/rRNA_DB/'." + ) + raise Exception( + f"An error occurred with Infernal. Error is: '{' '.join(err)}'." + ) # never managed to test what happens if the .cm files are compressed with a 'bad' version of infernal, # so if that happens you are on your own. @@ -163,53 +225,123 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "b c += 1 line_data = line.split() strand = line_data[9] - start, stop = map(int, (line_data[8], line_data[7]) if strand == "-" else (line_data[7], line_data[8])) + start, stop = map( + int, + ( + (line_data[8], line_data[7]) + if strand == "-" + else (line_data[7], line_data[8]) + ), + ) gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(4)) - gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA", - product=" ".join(line_data[17:])) + gene.fill_annotations( + start=start, + stop=stop, + strand=strand, + gene_type="rRNA", + product=" ".join(line_data[17:]), + ) gene_objs[line_data[2]].add(gene) return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: - """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. +def check_sequence_tuple(name: str, sequence: str): + """ + Checks and validates a sequence name and its corresponding sequence. + + :param name: The name (header) of the sequence, typically extracted from the FASTA file header. + :param sequence: The sequence string corresponding to the name, containing the nucleotide or protein sequence. + + :return: A tuple containing the validated name and sequence. + + :raises ValueError: + - If the sequence is empty, a ValueError is raised with a message containing the header name. + - If the name is empty, a ValueError is raised with a message containing a preview of the sequence. + """ + if not sequence: + raise ValueError(f"Found an empty sequence with header '{name}'") + + if not name: + raise ValueError( + f"Found a sequence with empty name (sequence starts as '{sequence[:60]}')" + ) + + return name, sequence + - :param org: Organism corresponding to fasta file - :param fna_file: Input fasta file with sequences or list of each line as sequence +def parse_fasta( + fna_file: Union[TextIOWrapper, list] +) -> Generator[Tuple[str, str], None, None]: + """Yields each sequence name and sequence from a FASTA file or stream as a tuple. - :return: Dictionary with contig_name as keys and contig sequence in values + :param fna_file: Input FASTA file or list of lines as sequences. + :yield: Tuple with contig header (without '>') and sequence. + :raises ValueError: If the file does not contain valid FASTA format. """ + name = None + sequence = "" + + for line in fna_file: + line = line.strip() + + if line.startswith(">"): # New header + if name: # Yield previous header and sequence if available + yield check_sequence_tuple(name, sequence) + + name = line[1:].split()[ + 0 + ] # Strip '>' and extract the first word as the name + sequence = "" + + elif line: # Only append non-empty lines + sequence += line + + else: + # You can skip or handle empty lines here if required + pass + + # Yield the final contig if exists + if name: + yield check_sequence_tuple(name, sequence) + + # Check if there was any valid data (at least one header and sequence) + if not name: + raise ValueError("The file does not contain any valid FASTA content.") + + +def get_contigs_from_fasta_file( + org: Organism, fna_file: Union[TextIOWrapper, list] +) -> Dict[str, str]: + """Processes contigs from a parsed FASTA generator and stores in a dictionary. + + :param org: Organism instance to update with contig info. + :param fna_file: Input FASTA file or list of lines as sequences. + :return: Dictionary with contig names as keys and sequences as values. + """ + global contig_counter - try: - contigs = {} - contig_seq = "" - contig = None - for line in fna_file: - if line.startswith('>'): - if len(contig_seq) >= 1: # contig filter = 1 - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - contig_seq = "" - try: - contig = org.get(line.split()[0][1:]) - except KeyError: - with contig_counter.get_lock(): - contig = Contig(contig_counter.value, line.split()[0][1:]) - contig_counter.value += 1 - org.add(contig) - else: - contig_seq += line.strip() - if len(contig_seq) >= 1: # processing the last contig - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - - except AttributeError as e: - raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " - f"One possibility for this error is that the file did not start with a '>' " - f"as it would be expected from a fna file.") - except Exception as err: # To manage other exception which can occur - raise Exception(f"{err}: Please check your input file and if everything looks fine, " - "please post an issue on our github") + contigs = {} + + for contig_name, sequence in parse_fasta(fna_file): + + # Retrieve or create the contig + try: + contig = org.get(contig_name) + except KeyError: + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, contig_name) + contig_counter.value += 1 + org.add(contig) + + # Update contig information + if contig.length is not None and contig.length != len(sequence): + raise ValueError( + f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence." + ) + + contig.length = len(sequence) + contigs[contig_name] = sequence.upper() + return contigs @@ -229,15 +361,22 @@ def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrappe tmp_file.write(f">{header}\n") j = 0 while j < len(contigs[header]): - tmp_file.write(contigs[header][j: j + 60] + "\n") + tmp_file.write(contigs[header][j : j + 60] + "\n") j += 60 tmp_file.flush() # force write what remains in the buffer. return tmp_file -def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, contig_sequences: Dict[str, str], - tmpdir: str, norna: bool = False, kingdom: str = "bacteria", - code: int = 11, use_meta: bool = False) -> defaultdict: +def syntaxic_annotation( + org: Organism, + fasta_file: TextIOWrapper, + contig_sequences: Dict[str, str], + tmpdir: str, + norna: bool = False, + kingdom: str = "bacteria", + code: int = 11, + use_meta: bool = False, +) -> defaultdict: """ Runs the different software for the syntaxic annotation. @@ -255,14 +394,23 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, contig_sequenc # launching tools for syntaxic annotation genes = defaultdict(list) - for contig_name, genes_from_contig in launch_prodigal(contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta).items(): + for contig_name, genes_from_contig in launch_prodigal( + contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta + ).items(): genes[contig_name].extend(genes_from_contig) if not norna: - contig_to_length = {contig_name:len(contig_seq) for contig_name, contig_seq in contig_sequences.items()} - - for contig_name, genes_from_contig in launch_aragorn(fna_file=fasta_file.name, org=org, contig_to_length= contig_to_length).items(): + contig_to_length = { + contig_name: len(contig_seq) + for contig_name, contig_seq in contig_sequences.items() + } + + for contig_name, genes_from_contig in launch_aragorn( + fna_file=fasta_file.name, org=org, contig_to_length=contig_to_length + ).items(): genes[contig_name].extend(genes_from_contig) - for contig_name, genes_from_contig in launch_infernal(fna_file=fasta_file.name, org=org, kingdom=kingdom, tmpdir=tmpdir).items(): + for contig_name, genes_from_contig in launch_infernal( + fna_file=fasta_file.name, org=org, kingdom=kingdom, tmpdir=tmpdir + ).items(): genes[contig_name].extend(genes_from_contig) fasta_file.close() # closing either tmp file or original fasta file. return genes @@ -286,9 +434,17 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau for i, gene_i in enumerate(tmp_genes): if i + 1 < len(tmp_genes): gene_j = tmp_genes[i + 1] - if gene_i.type != "CDS" and gene_j.type == "CDS" and gene_i.stop > gene_j.start: + if ( + gene_i.type != "CDS" + and gene_j.type == "CDS" + and gene_i.stop > gene_j.start + ): rm_genes.add(gene_j) - elif gene_i.type == "CDS" and gene_j.type != "CDS" and gene_i.stop > gene_j.start: + elif ( + gene_i.type == "CDS" + and gene_j.type != "CDS" + and gene_i.stop > gene_j.start + ): rm_genes.add(gene_i) for gene in rm_genes: @@ -314,14 +470,17 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: # check contig coordinate is in scope of contig seq length highest_position = max((stop for _, stop in gene.coordinates)) assert highest_position <= len( - contig_seq), f"Coordinates of gene {gene} exceed length of the contig. Gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" + contig_seq + ), f"Coordinates of gene {gene} exceed length of the contig. Gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" # Extract gene seq - seq = ''.join([contig_seq[start - 1:stop] for start, stop in gene.coordinates]) + seq = "".join([contig_seq[start - 1 : stop] for start, stop in gene.coordinates]) # check length of extracted seq - assert len(seq) == len(gene), (f"The gene sequence of {gene} extracted from the contig does not have the expected length: " - f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ") + assert len(seq) == len(gene), ( + f"The gene sequence of {gene} extracted from the contig does not have the expected length: " + f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt " + ) if gene.strand == "+": return seq @@ -329,9 +488,17 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: return reverse_complement(seq) -def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, - code: int = 11, norna: bool = False, kingdom: str = "bacteria", - allow_overlap: bool = False, procedure: Optional[str] = None) -> Organism: +def annotate_organism( + org_name: str, + file_name: Path, + circular_contigs: List[str], + tmpdir: str, + code: int = 11, + norna: bool = False, + kingdom: str = "bacteria", + allow_overlap: bool = False, + procedure: Optional[str] = None, +) -> Organism: """ Function to annotate a single organism @@ -351,7 +518,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str fasta_file = read_compressed_or_not(file_name) - contig_sequences = read_fasta(org, fasta_file) + contig_sequences = get_contigs_from_fasta_file(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user @@ -359,13 +526,16 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str if max_contig_len < 20000: # case of short sequence use_meta = True logging.getLogger("PPanGGOLiN").info( - f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.") + f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size." + ) else: use_meta = False else: use_meta = True if procedure == "meta" else False - genes = syntaxic_annotation(org, fasta_file, contig_sequences, tmpdir, norna, kingdom, code, use_meta) + genes = syntaxic_annotation( + org, fasta_file, contig_sequences, tmpdir, norna, kingdom, code, use_meta + ) genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index a740890d..eec599ad 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -15,14 +15,27 @@ from networkx import Graph from tqdm import tqdm import pandas as pd + # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import is_compressed, restricted_float, run_subprocess, create_tmpdir +from ppanggolin.utils import ( + is_compressed, + restricted_float, + run_subprocess, + create_tmpdir, +) from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome -from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file -from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db +from ppanggolin.formats.readBinaries import ( + check_pangenome_info, + write_gene_sequences_from_pangenome_file, +) +from ppanggolin.formats.writeSequences import ( + write_gene_sequences_from_annotations, + translate_genes, + create_mmseqs_db, +) # Global functions @@ -34,15 +47,21 @@ def check_pangenome_former_clustering(pangenome: Pangenome, force: bool = False) :param force: Force to write on existing pangenome information """ if pangenome.status["genesClustered"] == "inFile" and not force: - raise Exception("You are trying to cluster genes that are already clustered together. If you REALLY want to " - "do that, use --force (it will erase everything except annotation data in your HDF5 file!)") + raise Exception( + "You are trying to cluster genes that are already clustered together. If you REALLY want to " + "do that, use --force (it will erase everything except annotation data in your HDF5 file!)" + ) elif pangenome.status["genesClustered"] == "inFile" and force: erase_pangenome(pangenome, gene_families=True) # Clustering functions -def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: bool = False, - disable_bar: bool = False): +def check_pangenome_for_clustering( + pangenome: Pangenome, + sequences: Path, + force: bool = False, + disable_bar: bool = False, +): """ Check the pangenome statuses and write the gene sequences in the provided tmpFile. (whether they are written in the .h5 file or currently in memory) @@ -54,24 +73,44 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: """ check_pangenome_former_clustering(pangenome, force) if pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - logging.getLogger("PPanGGOLiN").debug("Write sequences from annotation loaded in pangenome") + logging.getLogger("PPanGGOLiN").debug( + "Write sequences from annotation loaded in pangenome" + ) # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. - write_gene_sequences_from_annotations(pangenome.genes, sequences, add="ppanggolin_", - compress=False, disable_bar=disable_bar) + write_gene_sequences_from_annotations( + pangenome.genes, + sequences, + add="ppanggolin_", + compress=False, + disable_bar=disable_bar, + ) elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") - write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", - compress=False, - disable_bar=disable_bar) # write CDS sequences to the tmpFile + write_gene_sequences_from_pangenome_file( + pangenome.file, + sequences, + add="ppanggolin_", + compress=False, + disable_bar=disable_bar, + ) # write CDS sequences to the tmpFile else: - raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " - "the genes in gene families. Either provide clustering results (see --clusters), " - "or provide a way to access the gene sequence during the annotation step " - "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") + raise Exception( + "The pangenome does not include gene sequences, thus it is impossible to cluster " + "the genes in gene families. Either provide clustering results (see --clusters), " + "or provide a way to access the gene sequence during the annotation step " + "(having the fasta in the gff files, or providing the fasta files through the --fasta option)" + ) -def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, - identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: +def first_clustering( + sequences: Path, + tmpdir: Path, + cpu: int = 1, + code: int = 11, + coverage: float = 0.8, + identity: float = 0.8, + mode: int = 1, +) -> Tuple[Path, Path]: """ Make a first clustering of all sequences in pangenome @@ -86,23 +125,80 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11 :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences=sequences, tmpdir=tmpdir, cpu=cpu, - is_single_line_fasta=True, code=code) + seqdb = translate_genes( + sequences=sequences, + tmpdir=tmpdir, + cpu=cpu, + is_single_line_fasta=True, + code=code, + ) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") - cludb = tmpdir / 'cluster_db' - cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", - identity, "-c", coverage, "--threads", cpu, "--kmer-per-seq", 80, "--max-seqs", 300])) + cludb = tmpdir / "cluster_db" + cmd = list( + map( + str, + [ + "mmseqs", + "cluster", + seqdb, + cludb, + tmpdir, + "--cluster-mode", + mode, + "--min-seq-id", + identity, + "-c", + coverage, + "--threads", + cpu, + "--kmer-per-seq", + 80, + "--max-seqs", + 300, + ], + ) + ) run_subprocess(cmd, msg="MMSeqs2 cluster failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Extracting cluster representatives...") - repdb = tmpdir / 'representative_db' - cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb, "--threads", cpu])) + repdb = tmpdir / "representative_db" + cmd = list( + map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb, "--threads", cpu]) + ) run_subprocess(cmd, msg="MMSeqs2 result2repseq failed with the following error:\n") - reprfa = tmpdir / 'representative_sequences.fasta' - cmd = list(map(str, ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"])) + reprfa = tmpdir / "representative_sequences.fasta" + cmd = list( + map( + str, + [ + "mmseqs", + "result2flat", + seqdb, + seqdb, + repdb, + reprfa, + "--use-fasta-header", + ], + ) + ) run_subprocess(cmd, msg="MMSeqs2 result2flat failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Writing gene to family information") - outtsv = tmpdir / 'families_tsv' - cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) + outtsv = tmpdir / "families_tsv" + cmd = list( + map( + str, + [ + "mmseqs", + "createtsv", + seqdb, + seqdb, + cludb, + outtsv, + "--threads", + cpu, + "--full-header", + ], + ) + ) run_subprocess(cmd, msg="MMSeqs2 createtsv failed with the following error:\n") return reprfa, outtsv @@ -119,14 +215,22 @@ def read_faa(faa_file_name: Path) -> Dict[str, str]: head = "" with open(faa_file_name) as faaFile: for line in faaFile: - if line.startswith('>'): - head = line[1:].strip().replace("ppanggolin_", "") # remove the eventual addition + if line.startswith(">"): + head = ( + line[1:].strip().replace("ppanggolin_", "") + ) # remove the eventual addition else: fam2seq[head] = line.strip() return fam2seq -def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, identity: float = 0.8) -> Path: +def align_rep( + faa_file: Path, + tmpdir: Path, + cpu: int = 1, + coverage: float = 0.8, + identity: float = 0.8, +) -> Path: """ Align representative sequence @@ -138,21 +242,58 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, :return: Result of alignment """ - seqdb = create_mmseqs_db([faa_file], 'rep_sequence_db', tmpdir, db_mode=1, db_type=1) + seqdb = create_mmseqs_db( + [faa_file], "rep_sequence_db", tmpdir, db_mode=1, db_type=1 + ) logging.getLogger("PPanGGOLiN").info("Aligning cluster representatives...") - alndb = tmpdir / 'rep_alignment_db' - cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir, "-a", "--min-seq-id", identity, - "-c", coverage, "--cov-mode", 1, "--threads", cpu])) + alndb = tmpdir / "rep_alignment_db" + cmd = list( + map( + str, + [ + "mmseqs", + "search", + seqdb, + seqdb, + alndb, + tmpdir, + "-a", + "--min-seq-id", + identity, + "-c", + coverage, + "--cov-mode", + 1, + "--threads", + cpu, + ], + ) + ) run_subprocess(cmd, msg="MMSeqs2 search failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Extracting alignments...") - outfile = tmpdir / 'rep_families.tsv' - cmd = list(map(str, ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, - "--format-output", "query,target,qlen,tlen,bits"])) + outfile = tmpdir / "rep_families.tsv" + cmd = list( + map( + str, + [ + "mmseqs", + "convertalis", + seqdb, + seqdb, + alndb, + outfile, + "--format-output", + "query,target,qlen,tlen,bits", + ], + ) + ) run_subprocess(cmd, msg="MMSeqs2 convertalis failed with the following error:\n") return outfile -def read_tsv(tsv_file_name: Path) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str, Set[str]]]: +def read_tsv( + tsv_file_name: Path, +) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str, Set[str]]]: """Reading tsv file :param tsv_file_name: path to the tsv @@ -163,15 +304,19 @@ def read_tsv(tsv_file_name: Path) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str fam2genes = defaultdict(set) with open(tsv_file_name) as tsvfile: for line in tsvfile: - line = line.replace('"', '').replace("ppanggolin_", "").split() + line = line.replace('"', "").replace("ppanggolin_", "").split() # remove the '"' char which protects the fields, and the eventual addition - genes2fam[line[1]] = (line[0], False) # fam id, and it's a gene (and not a fragment) + genes2fam[line[1]] = ( + line[0], + False, + ) # fam id, and it's a gene (and not a fragment) fam2genes[line[0]].add(line[1]) return genes2fam, fam2genes -def refine_clustering(tsv: Path, aln_file: Path, - fam_to_seq: dict) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str, str]]: +def refine_clustering( + tsv: Path, aln_file: Path, fam_to_seq: dict +) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str, str]]: """ Refine clustering by removing fragment @@ -191,7 +336,9 @@ def refine_clustering(tsv: Path, aln_file: Path, # add the edges with open(aln_file) as alnfile: for line in alnfile: - line = line.replace('"', '').replace("ppanggolin_", "").split() # remove the eventual addition + line = ( + line.replace('"', "").replace("ppanggolin_", "").split() + ) # remove the eventual addition if line[0] != line[1]: simgraph.add_edge(line[0], line[1], score=float(line[4])) @@ -203,7 +350,11 @@ def refine_clustering(tsv: Path, aln_file: Path, for neighbor in sorted(simgraph.neighbors(node)): nei = simgraph.nodes[neighbor] score = simgraph[neighbor][node]["score"] - if nei["length"] > nodedata["length"] and nei["nbgenes"] >= nodedata["nbgenes"] and choice[3] < score: + if ( + nei["length"] > nodedata["length"] + and nei["nbgenes"] >= nodedata["nbgenes"] + and choice[3] < score + ): choice = (genes2fam[neighbor][0], nei["length"], nei["nbgenes"], score) # `genes2fam[neighbor]` instead of just neighbor in case that family has been assigned already # (this is for smaller fragments that are closer to other fragments than the actual gene family) @@ -217,7 +368,9 @@ def refine_clustering(tsv: Path, aln_file: Path, new_fam_to_seq = {} for fam in fam2genes: new_fam_to_seq[fam] = fam_to_seq[fam] - logging.getLogger("PPanGGOLiN").info(f"Ending with {len(new_fam_to_seq)} gene families") + logging.getLogger("PPanGGOLiN").info( + f"Ending with {len(new_fam_to_seq)} gene families" + ) return genes2fam, new_fam_to_seq @@ -228,7 +381,9 @@ def read_fam2seq(pangenome: Pangenome, fam_to_seq: Dict[str, str]): :param pangenome: Annotated pangenome :param fam_to_seq: Dictionary which link families and sequences """ - logging.getLogger("PPanGGOLiN").info("Adding protein sequences to the gene families") + logging.getLogger("PPanGGOLiN").info( + "Adding protein sequences to the gene families" + ) for family, protein in fam_to_seq.items(): fam = GeneFamily(pangenome.max_fam_id, family) fam.add_sequence(protein) @@ -243,17 +398,31 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F :param gene_to_fam: Dictionary which link gene to families :param disable_bar: Allow to disable progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Adding {len(gene_to_fam)} genes to the gene families") - - link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - if link and len(gene_to_fam) != pangenome.number_of_genes: # then maybe there are genes with identical IDs - logging.getLogger("PPanGGOLiN").debug(f"gene_to_fam size: {len(gene_to_fam)}, " - f"Pangenome nb genes: {pangenome.number_of_genes}") - raise Exception("Something unexpected happened during clustering (have less genes clustered than genes " - "in the pangenome). A probable reason is that two genes in two different genomes have " - "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " - "issue at https://github.com/labgem/PPanGGOLiN/") - for gene, (family, is_frag) in tqdm(gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar): + logging.getLogger("PPanGGOLiN").info( + f"Adding {len(gene_to_fam)} genes to the gene families" + ) + + link = ( + True + if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] + else False + ) + if ( + link and len(gene_to_fam) != pangenome.number_of_genes + ): # then maybe there are genes with identical IDs + logging.getLogger("PPanGGOLiN").debug( + f"gene_to_fam size: {len(gene_to_fam)}, " + f"Pangenome nb genes: {pangenome.number_of_genes}" + ) + raise Exception( + "Something unexpected happened during clustering (have less genes clustered than genes " + "in the pangenome). A probable reason is that two genes in two different genomes have " + "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " + "issue at https://github.com/labgem/PPanGGOLiN/" + ) + for gene, (family, is_frag) in tqdm( + gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar + ): try: fam = pangenome.get_gene_family(family) except KeyError: # Family not found so create and add @@ -267,9 +436,19 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F fam.add(gene_obj) -def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = True, code: int = 11, - coverage: float = 0.8, identity: float = 0.8, mode: int = 1, force: bool = False, - disable_bar: bool = False, keep_tmp_files: bool = True): +def clustering( + pangenome: Pangenome, + tmpdir: Path, + cpu: int = 1, + defrag: bool = True, + code: int = 11, + coverage: float = 0.8, + identity: float = 0.8, + mode: int = 1, + force: bool = False, + disable_bar: bool = False, + keep_tmp_files: bool = True, +): """ Cluster gene sequences from an annotated pangenome into families. @@ -286,19 +465,25 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = :param keep_tmp_files: Keep temporary files (useful for debugging). """ date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) - dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' + dir_name = f"clustering_tmpdir_{date}_PID{os.getpid()}" with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: - sequence_path = tmp_path / 'nucleotide_sequences.fna' - check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) + sequence_path = tmp_path / "nucleotide_sequences.fna" + check_pangenome_for_clustering( + pangenome, sequence_path, force, disable_bar=disable_bar + ) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") - rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) + rep, tsv = first_clustering( + sequence_path, tmp_path, cpu, code, coverage, identity, mode + ) fam2seq = read_faa(rep) if not defrag: logging.getLogger("PPanGGOLiN").debug("No defragmentation") genes2fam, _ = read_tsv(tsv) else: - logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...") + logging.getLogger("PPanGGOLiN").info( + "Associating fragments to their original gene family..." + ) aln = align_rep(rep, tmp_path, cpu, coverage, identity) genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) pangenome.status["defragmented"] = "Computed" @@ -332,16 +517,21 @@ def mk_local_to_gene(pangenome: Pangenome) -> dict: old_len = len(local_dict) local_dict[gene.local_identifier] = gene if len(local_dict) == old_len: - if pangenome.parameters["annotate"]["# read_annotations_from_file"] and not \ - pangenome.parameters["annotate"]["# used_local_identifiers"]: - raise Exception(f"'{gene.local_identifier}' was found multiple times used as an identifier. " - f"The identifier of the genes (locus_tag, protein_id in gbff, ID in gff) were not " - f"unique throughout all of the files. It is thus impossible to differentiate the genes." - f" To use this function while importing annotate, all identifiers MUST be unique " - f"throughout all of your genomes") + if ( + pangenome.parameters["annotate"]["# read_annotations_from_file"] + and not pangenome.parameters["annotate"]["# used_local_identifiers"] + ): + raise Exception( + f"'{gene.local_identifier}' was found multiple times used as an identifier. " + f"The identifier of the genes (locus_tag, protein_id in gbff, ID in gff) were not " + f"unique throughout all of the files. It is thus impossible to differentiate the genes." + f" To use this function while importing annotate, all identifiers MUST be unique " + f"throughout all of your genomes" + ) return {} # local identifiers are not unique. return local_dict + def infer_singletons(pangenome: Pangenome): """ Creates a new family for each gene with no associated family. @@ -356,8 +546,6 @@ def infer_singletons(pangenome: Pangenome): fam.representative = gene fam.add(gene) - - # Try to add the new family try: pangenome.add_gene_family(fam) @@ -369,13 +557,22 @@ def infer_singletons(pangenome: Pangenome): singleton_counter += 1 - logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") + logging.getLogger("PPanGGOLiN").info( + f"Inferred {singleton_counter} singleton families" + ) -def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1, - tmpdir: Path = None, keep_tmp: bool = False): +def get_family_representative_sequences( + pangenome: Pangenome, + code: int = 11, + cpu: int = 1, + tmpdir: Path = None, + keep_tmp: bool = False, +): - logging.getLogger("PPanGGOLiN").info("Retrieving protein sequences of family representatives.") + logging.getLogger("PPanGGOLiN").info( + "Retrieving protein sequences of family representatives." + ) tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp: @@ -387,18 +584,27 @@ def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cp for family in pangenome.gene_families: if family.representative.dna is None: - raise ValueError(f'DNA sequence of representative gene {family.representative} is None. ' - 'Sequence may not have been loaded correctly from the pangenome file or the pangenome has no gene sequences.') + raise ValueError( + f"DNA sequence of representative gene {family.representative} is None. " + "Sequence may not have been loaded correctly from the pangenome file or the pangenome has no gene sequences." + ) repres_seq.write(f">{family.name}\n") repres_seq.write(f"{family.representative.dna}\n") - translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu, - is_single_line_fasta=True, code=code) + translate_db = translate_genes( + sequences=repres_path, + tmpdir=tmp, + cpu=cpu, + is_single_line_fasta=True, + code=code, + ) outpath = tmp / "representative_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) - run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + run_subprocess( + cmd, msg="MMSeqs convert2fasta failed with the following error:\n" + ) with open(outpath) as repres_prot: lines = repres_prot.readlines() @@ -441,22 +647,28 @@ def read_clustering_file(families_tsv_path: Path) -> Tuple[pd.DataFrame, bool]: families_tsv_path, sep="\t", header=None, - compression=compress_type if compress_type is not None else 'infer', - dtype=str + compression=compress_type if compress_type is not None else "infer", + dtype=str, ) # Process DataFrame based on the number of columns if families_df.shape[1] == 2: families_df.columns = ["family", "gene"] - families_df["representative"] = families_df.groupby('family')['gene'].transform('first') + families_df["representative"] = families_df.groupby("family")["gene"].transform( + "first" + ) families_df["is_frag"] = False elif families_df.shape[1] == 3: # Check if the third column is 'is_frag' - if families_df[2].dropna().eq('F').all(): + if families_df[2].dropna().eq("F").all(): families_df.columns = ["family", "gene", "is_frag"] - families_df["is_frag"] = families_df["is_frag"].replace('F', True).fillna(False) - families_df["representative"] = families_df.groupby('family')['gene'].transform('first') + families_df["is_frag"] = ( + families_df["is_frag"].replace("F", True).fillna(False) + ) + families_df["representative"] = families_df.groupby("family")[ + "gene" + ].transform("first") else: families_df.columns = ["family", "gene", "representative"] families_df["is_frag"] = False @@ -478,15 +690,27 @@ def read_clustering_file(families_tsv_path: Path) -> Tuple[pd.DataFrame, bool]: duplicates = families_df[families_df["gene"].duplicated()]["gene"].unique() if len(duplicates) > 0: - raise ValueError(f"Duplicate gene IDs found in your clustering: {', '.join(duplicates)}") - + raise ValueError( + f"Duplicate gene IDs found in your clustering: {', '.join(duplicates)}" + ) - return families_df[["family", "representative", "gene", "is_frag"]], families_df["is_frag"].any() + return ( + families_df[["family", "representative", "gene", "is_frag"]], + families_df["is_frag"].any(), + ) -def read_clustering(pangenome: Pangenome, families_tsv_path: Path, infer_singleton: bool = False, - code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False, - force: bool = False, disable_bar: bool = False): +def read_clustering( + pangenome: Pangenome, + families_tsv_path: Path, + infer_singleton: bool = False, + code: int = 11, + cpu: int = 1, + tmpdir: Path = None, + keep_tmp: bool = False, + force: bool = False, + disable_bar: bool = False, +): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -504,11 +728,16 @@ def read_clustering(pangenome: Pangenome, families_tsv_path: Path, infer_singlet check_pangenome_former_clustering(pangenome, force) if pangenome.status["geneSequences"] == "No": - need_gene_sequences=False + need_gene_sequences = False else: need_gene_sequences = True - check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=need_gene_sequences, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_gene_sequences=need_gene_sequences, + disable_bar=disable_bar, + ) families_df, frag = read_clustering_file(families_tsv_path) @@ -522,9 +751,19 @@ def get_gene_obj(identifier): gene_obj = local_dict.get(identifier) return gene_obj - for _, row in tqdm(families_df.iterrows(), total=families_df.shape[0], unit="line", disable=disable_bar): - - fam_id, reprez_id, gene_id, is_frag = str(row['family']), str(row['representative']), str(row['gene']), bool(row['is_frag']) + for _, row in tqdm( + families_df.iterrows(), + total=families_df.shape[0], + unit="line", + disable=disable_bar, + ): + + fam_id, reprez_id, gene_id, is_frag = ( + str(row["family"]), + str(row["representative"]), + str(row["gene"]), + bool(row["is_frag"]), + ) gene = get_gene_obj(gene_id) @@ -538,7 +777,9 @@ def get_gene_obj(identifier): fam = GeneFamily(pangenome.max_fam_id, fam_id) representative_gene = get_gene_obj(reprez_id) if representative_gene is None: - raise KeyError(f"The gene {reprez_id} associated to family {fam_id} from the clustering file is not found in pangenome.") + raise KeyError( + f"The gene {reprez_id} associated to family {fam_id} from the clustering file is not found in pangenome." + ) fam.representative = representative_gene @@ -546,15 +787,20 @@ def get_gene_obj(identifier): gene.is_fragment = is_frag fam.add(gene) else: - raise KeyError(f"The gene {gene_id} associated to family {fam_id} from the clustering file is not found in pangenome.") + raise KeyError( + f"The gene {gene_id} associated to family {fam_id} from the clustering file is not found in pangenome." + ) - - if nb_gene_with_fam < pangenome.number_of_genes: # not all genes have an associated cluster + if ( + nb_gene_with_fam < pangenome.number_of_genes + ): # not all genes have an associated cluster if nb_gene_with_fam == 0: - raise Exception("No gene ID in the cluster file matched any gene ID from the annotation step." - " Please ensure that the annotations that you loaded previously and the clustering results " - "that you have used the same gene IDs. If you use .gff files it is the identifier stored in" - " the field 'ID'. If you use .gbff files it is the identifier stored in 'locus_tag'.") + raise Exception( + "No gene ID in the cluster file matched any gene ID from the annotation step." + " Please ensure that the annotations that you loaded previously and the clustering results " + "that you have used the same gene IDs. If you use .gff files it is the identifier stored in" + " the field 'ID'. If you use .gbff files it is the identifier stored in 'locus_tag'." + ) else: if infer_singleton: infer_singletons(pangenome) @@ -565,7 +811,9 @@ def get_gene_obj(identifier): f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) if pangenome.status["geneSequences"] == "No": - logging.getLogger("PPanGGOLiN").info("The pangenome has no gene sequences so it is not possible to extract sequence of family representatives.") + logging.getLogger("PPanGGOLiN").info( + "The pangenome has no gene sequences so it is not possible to extract sequence of family representatives." + ) else: get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) @@ -588,20 +836,52 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.clusters is None: if args.infer_singletons is True: - logging.getLogger("PPanGGOLiN").warning("--infer_singletons option is not compatible with clustering " - "creation. To infer singleton you should give a clustering") - clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, - coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, - disable_bar=args.disable_prog_bar, keep_tmp_files=args.keep_tmp) + logging.getLogger("PPanGGOLiN").warning( + "--infer_singletons option is not compatible with clustering " + "creation. To infer singleton you should give a clustering" + ) + clustering( + pangenome, + args.tmpdir, + args.cpu, + defrag=not args.no_defrag, + code=args.translation_table, + coverage=args.coverage, + identity=args.identity, + mode=args.mode, + force=args.force, + disable_bar=args.disable_prog_bar, + keep_tmp_files=args.keep_tmp, + ) logging.getLogger("PPanGGOLiN").info("Done with the clustering") else: - if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, - args.coverage, args.identity, args.mode]: - logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table, - args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) + if None in [ + args.tmpdir, + args.cpu, + args.no_defrag, + args.translation_table, + args.coverage, + args.identity, + args.mode, + ]: + logging.getLogger("PPanGGOLiN").warning( + "You are using an option compatible only with clustering creation." + ) + read_clustering( + pangenome, + args.clusters, + args.infer_singletons, + args.translation_table, + args.cpu, + args.tmpdir, + args.keep_tmp, + args.force, + disable_bar=args.disable_prog_bar, + ) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") - write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) + write_pangenome( + pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -612,7 +892,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("cluster", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "cluster", formatter_class=argparse.RawTextHelpFormatter + ) parser_clust(parser) return parser @@ -623,45 +905,99 @@ def parser_clust(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) clust = parser.add_argument_group(title="Clustering arguments") - clust.add_argument("--identity", required=False, type=restricted_float, default=0.8, - help="Minimal identity percent for two proteins to be in the same cluster") - clust.add_argument("--coverage", required=False, type=restricted_float, default=0.8, - help="Minimal coverage of the alignment for two proteins to be in the same cluster") - clust.add_argument("--mode", required=False, default="1", choices=["0", "1", "2", "3"], - help="the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component)," - " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)") - clust.add_argument('--no_defrag', required=False, default=False, action="store_true", - help="DO NOT Use the defragmentation strategy to link potential fragments " - "with their original gene family.") + clust.add_argument( + "--identity", + required=False, + type=restricted_float, + default=0.8, + help="Minimal identity percent for two proteins to be in the same cluster", + ) + clust.add_argument( + "--coverage", + required=False, + type=restricted_float, + default=0.8, + help="Minimal coverage of the alignment for two proteins to be in the same cluster", + ) + clust.add_argument( + "--mode", + required=False, + default="1", + choices=["0", "1", "2", "3"], + help="the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component)," + " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)", + ) + clust.add_argument( + "--no_defrag", + required=False, + default=False, + action="store_true", + help="DO NOT Use the defragmentation strategy to link potential fragments " + "with their original gene family.", + ) read = parser.add_argument_group(title="Read clustering arguments") - read.add_argument('--clusters', required=False, type=Path, - help="A tab-separated list containing the result of a clustering. One line per gene. " - "First column is cluster ID, and second is gene ID") - read.add_argument("--infer_singletons", required=False, action="store_true", - help="When reading a clustering result with --clusters, if a gene is not in the provided file" - " it will be placed in a cluster where the gene is the only member.") + read.add_argument( + "--clusters", + required=False, + type=Path, + help="A tab-separated list containing the result of a clustering. One line per gene. " + "First column is cluster ID, and second is gene ID", + ) + read.add_argument( + "--infer_singletons", + required=False, + action="store_true", + help="When reading a clustering result with --clusters, if a gene is not in the provided file" + " it will be placed in a cluster where the gene is the only member.", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + optional.add_argument( + "--translation_table", + required=False, + default="11", + help="Translation table (genetic code) to use.", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=Path, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + optional.add_argument( + "--keep_tmp", + required=False, + default=False, + action="store_true", + help="Keeping temporary files (useful for debugging).", + ) -if __name__ == '__main__': +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_clust(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 88cd9fc8..fbd989e4 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -19,34 +19,62 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig, Organism -from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window +from ppanggolin.utils import ( + mk_outdir, + restricted_float, + create_tmpdir, + read_compressed_or_not, + extract_contig_window, +) from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \ - get_input_seq_to_family_with_all, get_seq_ids +from ppanggolin.align.alignOnPang import ( + project_and_write_partition, + get_input_seq_to_family_with_rep, + get_input_seq_to_family_with_all, + get_seq_ids, +) from ppanggolin.region import GeneContext from ppanggolin.geneFamily import GeneFamily from ppanggolin.projection.projection import write_gene_to_gene_family def check_pangenome_for_context_search(pangenome: Pangenome, sequences: bool = False): - """ Check pangenome status and information to search context + """Check pangenome status and information to search context :param pangenome: The pangenome object :param sequences: True if search contexts with sequences """ if pangenome.status["genesClustered"] not in ["inFile", "Loaded", "Computed"]: - raise AttributeError("Cannot use this function as pangenome genes has not been clustered yet. " - "See the 'ppanggolin cluster' if you want to do that.") - if sequences and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: - raise AttributeError("Your pangenome gene families does not have representatives sequences associated. " - "For now this works only if the clustering has been made by PPanGGOLiN.") - - -def align_sequences_to_families(pangenome: Pangenome, output: Path, sequence_file: Path = None, identity: float = 0.5, - coverage: float = 0.8, use_representatives: bool = False, no_defrag: bool = False, - cpu: int = 1, translation_table: int = 11, tmpdir: Path = None, keep_tmp: bool = False, - disable_bar=True) -> Tuple[Set[GeneFamily], Dict[GeneFamily, Set[str]]]: - """ Align sequences to pangenome gene families to get families of interest + raise AttributeError( + "Cannot use this function as pangenome genes has not been clustered yet. " + "See the 'ppanggolin cluster' if you want to do that." + ) + if sequences and pangenome.status["geneFamilySequences"] not in [ + "inFile", + "Loaded", + "Computed", + ]: + raise AttributeError( + "Your pangenome gene families does not have representatives sequences associated. " + "For now this works only if the clustering has been made by PPanGGOLiN." + ) + + +def align_sequences_to_families( + pangenome: Pangenome, + output: Path, + sequence_file: Path = None, + identity: float = 0.5, + coverage: float = 0.8, + use_representatives: bool = False, + no_defrag: bool = False, + cpu: int = 1, + translation_table: int = 11, + tmpdir: Path = None, + keep_tmp: bool = False, + disable_bar=True, +) -> Tuple[Set[GeneFamily], Dict[GeneFamily, Set[str]]]: + """Align sequences to pangenome gene families to get families of interest :param pangenome: Pangenome containing GeneFamilies to align with sequence set :param sequence_file: Path to file containing the sequences @@ -71,24 +99,44 @@ def align_sequences_to_families(pangenome: Pangenome, output: Path, sequence_fil with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide, is_slf = get_seq_ids(seqFileObj) - logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") + logging.debug( + f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences" + ) - with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + with create_tmpdir( + main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp + ) as new_tmpdir: input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: - _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - input_type=input_type, is_input_slf=is_slf, cpu=cpu, - no_defrag=no_defrag, identity=identity, - coverage=coverage, translation_table=translation_table, - disable_bar=disable_bar) + _, seqid2fam = get_input_seq_to_family_with_rep( + pangenome, + sequence_file, + output, + new_tmpdir, + input_type=input_type, + is_input_slf=is_slf, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) else: - _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, - output=output, tmpdir=new_tmpdir, - input_type=input_type, is_input_slf=is_slf, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) + _, seqid2fam = get_input_seq_to_family_with_all( + pangenome=pangenome, + sequence_files=sequence_file, + output=output, + tmpdir=new_tmpdir, + input_type=input_type, + is_input_slf=is_slf, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) project_and_write_partition(seqid2fam, seq_set, output) write_gene_to_gene_family(seqid2fam, seq_set, output) @@ -102,9 +150,18 @@ def align_sequences_to_families(pangenome: Pangenome, output: Path, sequence_fil return families_of_interest, family_2_input_seqid -def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, sequence_file: Path = None, - families: Path = None, transitive: int = 4, jaccard_threshold: float = 0.85, - window_size: int = 1, graph_format: str = "graphml", disable_bar=True, **kwargs): +def search_gene_context_in_pangenome( + pangenome: Pangenome, + output: Path, + sequence_file: Path = None, + families: Path = None, + transitive: int = 4, + jaccard_threshold: float = 0.85, + window_size: int = 1, + graph_format: str = "graphml", + disable_bar=True, + **kwargs, +): """ Main function to search common gene contexts between sequence set and pangenome families @@ -120,14 +177,19 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, sequenc """ # check statuses and load info - check_pangenome_for_context_search(pangenome, sequences=True if sequence_file is not None else False) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) + check_pangenome_for_context_search( + pangenome, sequences=True if sequence_file is not None else False + ) + check_pangenome_info( + pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar + ) families_of_interest = set() family_2_input_seqid = {} if sequence_file is not None: - fams_of_interest, family_2_input_seqid = align_sequences_to_families(pangenome, output, sequence_file, - disable_bar=disable_bar, **kwargs) + fams_of_interest, family_2_input_seqid = align_sequences_to_families( + pangenome, output, sequence_file, disable_bar=disable_bar, **kwargs + ) families_of_interest |= fams_of_interest if families is not None: with read_compressed_or_not(families) as f: @@ -139,27 +201,38 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, sequenc logging.getLogger().info("Building the graph...") - gene_context_graph, _ = compute_gene_context_graph(families=families_of_interest, transitive=transitive, - window_size=window_size, disable_bar=disable_bar) + gene_context_graph, _ = compute_gene_context_graph( + families=families_of_interest, + transitive=transitive, + window_size=window_size, + disable_bar=disable_bar, + ) - logging.getLogger().info(f"Took {round(time.time() - start_time, 2)} " - f"seconds to build the graph to find common gene contexts") + logging.getLogger().info( + f"Took {round(time.time() - start_time, 2)} " + f"seconds to build the graph to find common gene contexts" + ) - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and " - f"{nx.number_of_edges(gene_context_graph)} edges") + logging.getLogger().debug( + f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and " + f"{nx.number_of_edges(gene_context_graph)} edges" + ) compute_edge_metrics(gene_context_graph, jaccard_threshold) # Filter graph - filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' + filter_flag = f"is_jaccard_gene_>_{jaccard_threshold}" - edges_to_remove = [(n, v) for n, v, d in gene_context_graph.edges(data=True) if not d[filter_flag]] + edges_to_remove = [ + (n, v) for n, v, d in gene_context_graph.edges(data=True) if not d[filter_flag] + ] gene_context_graph.remove_edges_from(edges_to_remove) logging.getLogger().debug(f"Filtering context graph on {filter_flag}") logging.getLogger().debug( f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and " - f"{nx.number_of_edges(gene_context_graph)} edges") + f"{nx.number_of_edges(gene_context_graph)} edges" + ) gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest) @@ -168,24 +241,31 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, sequenc if len(gene_contexts) != 0: logging.getLogger().info( - f"There are {sum(len(gc) for gc in gene_contexts)} families among {len(gene_contexts)} gene contexts") + f"There are {sum(len(gc) for gc in gene_contexts)} families among {len(gene_contexts)} gene contexts" + ) output_file = output / "gene_contexts.tsv" - export_context_to_dataframe(gene_contexts, family_2_input_seqid, families_of_interest, output_file) + export_context_to_dataframe( + gene_contexts, family_2_input_seqid, families_of_interest, output_file + ) else: logging.getLogger("PPanGGOLiN").info("No gene contexts were found") - logging.getLogger("PPanGGOLiN").info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds" + ) return gene_context_graph, out_graph_file -def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFamily]) -> Set[GeneContext]: +def get_gene_contexts( + context_graph: nx.Graph, families_of_interest: Set[GeneFamily] +) -> Set[GeneContext]: """ Extract gene contexts from a context graph based on the provided set of gene families of interest. - - Gene contexts are extracted from a context graph by identifying connected components. + + Gene contexts are extracted from a context graph by identifying connected components. The function filters the connected components based on the following criteria: - Remove singleton families (components with only one gene family). - Remove components that do not contain any gene families of interest. @@ -202,10 +282,16 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam # Connected component graph Filtering # remove singleton families - connected_components = (component for component in connected_components if len(component) > 1) + connected_components = ( + component for component in connected_components if len(component) > 1 + ) # remove component made only of families not initially requested - connected_components = (component for component in connected_components if component & families_of_interest) + connected_components = ( + component + for component in connected_components + if component & families_of_interest + ) gene_contexts = set() families_in_context = set() @@ -213,11 +299,15 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam for i, component in enumerate(connected_components): families_in_context |= component family_of_interest_of_gc = component & families_of_interest - gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) + gene_context = GeneContext( + gc_id=i, families=component, families_of_interest=family_of_interest_of_gc + ) # add gc id to node attribute - node_attributes = {n: {"gene_context_id": i, "families_of_interest": n in families_of_interest} for n in - component} + node_attributes = { + n: {"gene_context_id": i, "families_of_interest": n in families_of_interest} + for n in component + } nx.set_node_attributes(context_graph, node_attributes) gene_contexts.add(gene_context) @@ -248,22 +338,35 @@ def filter_attribute(data: dict): writable_graph = nx.Graph() - writable_graph.add_edges_from((f1.name, f2.name, filter_attribute(d)) - for f1, f2, d in context_graph.edges(data=True)) + writable_graph.add_edges_from( + (f1.name, f2.name, filter_attribute(d)) + for f1, f2, d in context_graph.edges(data=True) + ) # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name): str(d['transitivity']) for f1, f2, d in - context_graph.edges(data=True)} + edges_with_transitivity_str = { + (f1.name, f2.name): str(d["transitivity"]) + for f1, f2, d in context_graph.edges(data=True) + } - nx.set_edge_attributes(writable_graph, edges_with_transitivity_str, name="transitivity") + nx.set_edge_attributes( + writable_graph, edges_with_transitivity_str, name="transitivity" + ) - nodes_attributes_filtered = {f.name: filter_attribute(d) for f, d in context_graph.nodes(data=True)} + nodes_attributes_filtered = { + f.name: filter_attribute(d) for f, d in context_graph.nodes(data=True) + } # on top of attributes already contained in node of context graph # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name: {"genomes": f.number_of_organisms, - "partition": f.named_partition, - "genes": f.number_of_genes} for f in context_graph.nodes()} + nodes_family_data = { + f.name: { + "genomes": f.number_of_organisms, + "partition": f.named_partition, + "genes": f.number_of_genes, + } + for f in context_graph.nodes() + } for f, d in writable_graph.nodes(data=True): d.update(nodes_family_data[f]) @@ -274,30 +377,34 @@ def filter_attribute(data: dict): def write_graph(graph: nx.Graph, output_dir: Path, graph_format: str): """ - Write a graph to file in the GraphML format or/and in GEXF format. + Write a graph to file in the GraphML format or/and in GEXF format. :param graph: Graph to write :param output_dir: The output directory where the graph file will be written. - :param graph_format: Formats of the output graph. Can be graphml or gexf + :param graph_format: Formats of the output graph. Can be graphml or gexf """ if "graphml" == graph_format: out_file = output_dir / "graph_context.graphml" - logging.info(f'Writing context graph in {out_file}') + logging.info(f"Writing context graph in {out_file}") nx.write_graphml_lxml(graph, out_file) elif "gexf" == graph_format: out_file = output_dir / "graph_context.gexf" - logging.info(f'Writing context graph in {out_file}') + logging.info(f"Writing context graph in {out_file}") nx.readwrite.gexf.write_gexf(graph, out_file) else: - raise ValueError(f'The given graph format ({graph_format}) is not correct. it should be "graphml" or gexf') + raise ValueError( + f'The given graph format ({graph_format}) is not correct. it should be "graphml" or gexf' + ) return out_file -def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: +def compute_edge_metrics( + context_graph: nx.Graph, gene_proportion_cutoff: float +) -> None: """ Compute various metrics on the edges of the context graph. @@ -306,26 +413,32 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) """ # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): - data['jaccard_genome'] = len(data['genomes']) / len(set(f1.organisms) | set(f2.organisms)) + data["jaccard_genome"] = len(data["genomes"]) / len( + set(f1.organisms) | set(f2.organisms) + ) - f1_gene_proportion = len(data['genes'][f1]) / f1.number_of_genes - f2_gene_proportion = len(data['genes'][f2]) / f2.number_of_genes + f1_gene_proportion = len(data["genes"][f1]) / f1.number_of_genes + f2_gene_proportion = len(data["genes"][f2]) / f2.number_of_genes - data['f1'] = f1.name - data['f2'] = f2.name - data['f1_jaccard_gene'] = f1_gene_proportion - data['f2_jaccard_gene'] = f2_gene_proportion + data["f1"] = f1.name + data["f2"] = f2.name + data["f1_jaccard_gene"] = f1_gene_proportion + data["f2_jaccard_gene"] = f2_gene_proportion - data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and ( - f2_gene_proportion >= gene_proportion_cutoff) + data[f"is_jaccard_gene_>_{gene_proportion_cutoff}"] = ( + f1_gene_proportion >= gene_proportion_cutoff + ) and (f2_gene_proportion >= gene_proportion_cutoff) - transitivity_counter = data['transitivity'] + transitivity_counter = data["transitivity"] mean_transitivity = sum( - (transitivity * counter for transitivity, counter in transitivity_counter.items())) / sum( - counter for counter in transitivity_counter.values()) + ( + transitivity * counter + for transitivity, counter in transitivity_counter.items() + ) + ) / sum(counter for counter in transitivity_counter.values()) - data['mean_transitivity'] = mean_transitivity + data["mean_transitivity"] = mean_transitivity # the following commented out lines are additional metrics that could be used @@ -337,10 +450,12 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) # data[f'f2_jaccard_gene_partial'] = f2_gene_proportion_partial -def add_edges_to_context_graph(context_graph: nx.Graph, - contig: Contig, - contig_windows: List[Tuple[int, int]], - transitivity: int) -> nx.Graph: +def add_edges_to_context_graph( + context_graph: nx.Graph, + contig: Contig, + contig_windows: List[Tuple[int, int]], + transitivity: int, +) -> nx.Graph: """ Add edges to the context graph based on contig genes and windows. @@ -356,13 +471,20 @@ def add_edges_to_context_graph(context_graph: nx.Graph, for window_start, window_end in contig_windows: for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] - next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity + 1, - contig_size=len(contig_genes), is_circular=contig.is_circular) + next_genes = get_n_next_genes_index( + gene_index, + next_genes_count=transitivity + 1, + contig_size=len(contig_genes), + is_circular=contig.is_circular, + ) next_genes = list(next_genes) for i, next_gene_index in enumerate(next_genes): # Check if the next gene is within the contig windows - if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): + if not any( + lower <= next_gene_index <= upper + for (lower, upper) in contig_windows + ): # next_gene_index is not in any range of genes in the context, # so it is ignored along with all following genes break @@ -376,15 +498,17 @@ def add_edges_to_context_graph(context_graph: nx.Graph, context_graph.add_edge(gene.family, next_gene.family) contig_graph.add_edge(gene.family, next_gene.family) - edge_dict = context_graph.get_edge_data(gene.family, next_gene.family, default={}) + edge_dict = context_graph.get_edge_data( + gene.family, next_gene.family, default={} + ) if i == 0: - edge_dict['adjacent_family'] = True + edge_dict["adjacent_family"] = True # Store information of the transitivity used to link the two genes: if "transitivity" not in edge_dict: - edge_dict['transitivity'] = {i: 0 for i in range(transitivity + 1)} - edge_dict['transitivity'][i] += 1 + edge_dict["transitivity"] = {i: 0 for i in range(transitivity + 1)} + edge_dict["transitivity"][i] += 1 # Add node attributes node_gene_dict = context_graph.nodes[gene.family] @@ -399,10 +523,10 @@ def add_edges_to_context_graph(context_graph: nx.Graph, # Add edge attributes edge_dict = context_graph[gene.family][next_gene.family] try: - genes_edge_dict = edge_dict['genes'] + genes_edge_dict = edge_dict["genes"] except KeyError: genes_edge_dict = {} - edge_dict['genes'] = genes_edge_dict + edge_dict["genes"] = genes_edge_dict add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) @@ -411,8 +535,10 @@ def add_edges_to_context_graph(context_graph: nx.Graph, increment_attribute_counter(edge_dict, "gene_pairs") - assert gene.organism == next_gene.organism, (f"Gene of the same contig have a different genome. " - f"{gene.organism} and {next_gene.organism}") + assert gene.organism == next_gene.organism, ( + f"Gene of the same contig have a different genome. " + f"{gene.organism} and {next_gene.organism}" + ) return contig_graph @@ -448,8 +574,12 @@ def increment_attribute_counter(edge_dict: dict, key: Hashable): edge_dict[key] = 1 -def get_n_next_genes_index(current_index: int, next_genes_count: int, - contig_size: int, is_circular: bool = False) -> Iterator[int]: +def get_n_next_genes_index( + current_index: int, + next_genes_count: int, + contig_size: int, + is_circular: bool = False, +) -> Iterator[int]: """ Generate the indices of the next genes based on the current index and contig properties. @@ -465,10 +595,14 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, # Check if the current index is out of range if current_index >= contig_size: - raise IndexError(f'current gene index is out of range. ' - f"Contig has {contig_size} genes while the given gene index is {current_index}") + raise IndexError( + f"current gene index is out of range. " + f"Contig has {contig_size} genes while the given gene index is {current_index}" + ) if is_circular: - next_genes = chain(range(current_index + 1, contig_size), range(0, current_index)) + next_genes = chain( + range(current_index + 1, contig_size), range(0, current_index) + ) else: next_genes = range(current_index + 1, contig_size) @@ -483,7 +617,7 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set Group genes from specified gene families by contig. :param gene_families: An iterable of gene families object. - + :return: A dictionary mapping contigs to sets of genes. """ @@ -495,8 +629,12 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set return contig_to_genes_of_interest -def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, - disable_bar: bool = False) -> Tuple[nx.Graph, Dict[FrozenSet[GeneFamily], Set[Organism]]]: +def compute_gene_context_graph( + families: Iterable[GeneFamily], + transitive: int = 4, + window_size: int = 0, + disable_bar: bool = False, +) -> Tuple[nx.Graph, Dict[FrozenSet[GeneFamily], Set[Organism]]]: """ Construct the graph of gene contexts between families of the pangenome. @@ -512,22 +650,34 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = contig_to_genes_of_interest = get_contig_to_genes(families) combs2orgs = defaultdict(set) - for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", - total=len(contig_to_genes_of_interest), disable=disable_bar): + for contig, genes_of_interest in tqdm( + contig_to_genes_of_interest.items(), + unit="contig", + total=len(contig_to_genes_of_interest), + disable=disable_bar, + ): genes_count = contig.number_of_genes genes_of_interest_positions = [g.position for g in genes_of_interest] - contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, - window_size=window_size, is_circular=contig.is_circular) + contig_windows = extract_contig_window( + genes_count, + genes_of_interest_positions, + window_size=window_size, + is_circular=contig.is_circular, + ) # This part is for PANORAMA - contig_graph = add_edges_to_context_graph(context_graph, contig, contig_windows, transitive) + contig_graph = add_edges_to_context_graph( + context_graph, contig, contig_windows, transitive + ) for cc in nx.connected_components(contig_graph): # If gene families are in the same connected component for the contig graph, # they exist in the same context in at least one genome - combination = list(cc.intersection({gene.family for gene in genes_of_interest})) + combination = list( + cc.intersection({gene.family for gene in genes_of_interest}) + ) # Family here are family of interest for the context and in the same connected component combs2orgs[frozenset(combination)].add(contig.organism) @@ -554,8 +704,12 @@ def fam_to_seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[GeneFamily, Set[str]], - families_of_interest: Set[GeneFamily], output: Path): +def export_context_to_dataframe( + gene_contexts: set, + fam2seq: Dict[GeneFamily, Set[str]], + families_of_interest: Set[GeneFamily], + output: Path, +): """ Export the results into dataFrame @@ -571,22 +725,24 @@ def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[GeneFamily, Se if fam2seq.get(family) is None: sequence_id = None else: - sequence_id = ','.join(fam2seq.get(family)) # Should we sort this ? + sequence_id = ",".join(fam2seq.get(family)) # Should we sort this ? - family_info = {"GeneContext_ID": gene_context.ID, - "Gene_family_name": family.name, - "Sequence_ID": sequence_id, - "Nb_Genomes": family.number_of_organisms, - "Partition": family.named_partition, - "Target_family": family in families_of_interest} + family_info = { + "GeneContext_ID": gene_context.ID, + "Gene_family_name": family.name, + "Sequence_ID": sequence_id, + "Nb_Genomes": family.number_of_organisms, + "Partition": family.named_partition, + "Target_family": family in families_of_interest, + } lines.append(family_info) df = pd.DataFrame(lines).set_index("GeneContext_ID") - df = df.sort_values(["GeneContext_ID", "Sequence_ID"], na_position='last') + df = df.sort_values(["GeneContext_ID", "Sequence_ID"], na_position="last") - df.to_csv(output, sep="\t", na_rep='NA') + df.to_csv(output, sep="\t", na_rep="NA") logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}'") @@ -612,10 +768,18 @@ def launch(args: argparse.Namespace): "keep_tmp": args.keep_tmp, "cpu": args.cpu, } - search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, sequence_file=args.sequences, - families=args.family, transitive=args.transitive, jaccard_threshold=args.jaccard, - window_size=args.window_size, graph_format=args.graph_format, - disable_bar=args.disable_prog_bar, **align_args) + search_gene_context_in_pangenome( + pangenome=pangenome, + output=args.output, + sequence_file=args.sequences, + families=args.family, + transitive=args.transitive, + jaccard_threshold=args.jaccard, + window_size=args.window_size, + graph_format=args.graph_format, + disable_bar=args.disable_prog_bar, + **align_args, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -627,7 +791,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("context", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "context", formatter_class=argparse.RawTextHelpFormatter + ) parser_context(parser) return parser @@ -639,62 +805,148 @@ def parser_context(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=False, type=Path, - default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), - help="Output directory where the file(s) will be written") - onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") - onereq.add_argument('-S', '--sequences', required=False, type=Path, - help="Fasta file with the sequences of interest") - onereq.add_argument('-F', '--family', required=False, type=Path, - help="List of family IDs of interest from the pangenome") + required = parser.add_argument_group( + title="Required arguments", + description="All of the following arguments are required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome.h5 file" + ) + required.add_argument( + "-o", + "--output", + required=False, + type=Path, + default="ppanggolin_context" + + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + + "_PID" + + str(os.getpid()), + help="Output directory where the file(s) will be written", + ) + onereq = parser.add_argument_group( + title="Input file", description="One of the following argument is required :" + ) + onereq.add_argument( + "-S", + "--sequences", + required=False, + type=Path, + help="Fasta file with the sequences of interest", + ) + onereq.add_argument( + "-F", + "--family", + required=False, + type=Path, + help="List of family IDs of interest from the pangenome", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("-t", "--transitive", required=False, type=int, default=4, - help="Size of the transitive closure used to build the graph. This indicates the number of " - "non related genes allowed in-between two related genes. Increasing it will improve " - "precision but lower sensitivity a little.") - optional.add_argument("-w", "--window_size", required=False, type=int, default=5, - help="Number of neighboring genes that are considered on each side of " - "a gene of interest when searching for conserved genomic contexts.") - - optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, - help="minimum jaccard similarity used to filter edges between gene families. Increasing it " - "will improve precision but lower sensitivity a lot.") - optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.", - default='graphml', choices=['gexf', 'graphml']) - align = parser.add_argument_group(title="Alignment arguments", - description="This argument makes sense only when --sequence is provided.") - align.add_argument('--no_defrag', required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with" - "their non-fragmented gene family.") - align.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is recommended for faster processing but may be less sensitive. " - "By default, all pangenome genes are used for alignment.") - align.add_argument('--identity', required=False, type=float, default=0.8, - help="min identity percentage threshold") - align.add_argument('--coverage', required=False, type=float, default=0.8, - help="min coverage percentage threshold") - align.add_argument("--translation_table", required=False, default="11", - help="The translation table to use when the input sequences are nucleotide sequences. ") - align.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - align.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") - align.add_argument("-c", "--cpu", required=False, default=1, type=int, - help="Number of available cpus") - - -if __name__ == '__main__': + optional.add_argument( + "-t", + "--transitive", + required=False, + type=int, + default=4, + help="Size of the transitive closure used to build the graph. This indicates the number of " + "non related genes allowed in-between two related genes. Increasing it will improve " + "precision but lower sensitivity a little.", + ) + optional.add_argument( + "-w", + "--window_size", + required=False, + type=int, + default=5, + help="Number of neighboring genes that are considered on each side of " + "a gene of interest when searching for conserved genomic contexts.", + ) + + optional.add_argument( + "-s", + "--jaccard", + required=False, + type=restricted_float, + default=0.85, + help="minimum jaccard similarity used to filter edges between gene families. Increasing it " + "will improve precision but lower sensitivity a lot.", + ) + optional.add_argument( + "--graph_format", + help="Format of the context graph. Can be gexf or graphml.", + default="graphml", + choices=["gexf", "graphml"], + ) + align = parser.add_argument_group( + title="Alignment arguments", + description="This argument makes sense only when --sequence is provided.", + ) + align.add_argument( + "--no_defrag", + required=False, + action="store_true", + help="DO NOT Realign gene families to link fragments with" + "their non-fragmented gene family.", + ) + align.add_argument( + "--fast", + required=False, + action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is recommended for faster processing but may be less sensitive. " + "By default, all pangenome genes are used for alignment.", + ) + align.add_argument( + "--identity", + required=False, + type=float, + default=0.8, + help="min identity percentage threshold", + ) + align.add_argument( + "--coverage", + required=False, + type=float, + default=0.8, + help="min coverage percentage threshold", + ) + align.add_argument( + "--translation_table", + required=False, + default="11", + help="The translation table to use when the input sequences are nucleotide sequences. ", + ) + align.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + align.add_argument( + "--keep_tmp", + required=False, + default=False, + action="store_true", + help="Keeping temporary files (useful for debugging).", + ) + align.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_context(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 6b323eb1..cfc2887b 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -31,11 +31,15 @@ def __init__(self, source_gene: Gene, target_gene: Gene): """ # TODO try to change for gene family ? if source_gene.family is None: - raise AttributeError(f"You cannot create a graph without gene families. " - f"gene {source_gene.ID} did not have a gene family.") + raise AttributeError( + f"You cannot create a graph without gene families. " + f"gene {source_gene.ID} did not have a gene family." + ) if target_gene.family is None: - raise AttributeError(f"You cannot create a graph without gene families. " - f"gene {target_gene.ID} did not have a gene family.") + raise AttributeError( + f"You cannot create a graph without gene families. " + f"gene {target_gene.ID} did not have a gene family." + ) self.source = source_gene.family self.target = target_gene.family self.source.set_edge(self.target, self) @@ -77,11 +81,15 @@ def get_organisms_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: @property def gene_pairs(self) -> List[Tuple[Gene, Gene]]: - """ Get the list of all the gene pairs in the Edge + """Get the list of all the gene pairs in the Edge :return: A list of all the gene pairs in the Edge """ - return [gene_pair for gene_list in self.get_organisms_dict().values() for gene_pair in gene_list] + return [ + gene_pair + for gene_list in self.get_organisms_dict().values() + for gene_pair in gene_list + ] def add_genes(self, source_gene: Gene, target_gene: Gene): """ @@ -96,11 +104,17 @@ def add_genes(self, source_gene: Gene, target_gene: Gene): :raises Exception: If the genes are not in the same organism. """ if not isinstance(source_gene, Gene) or not isinstance(target_gene, Gene): - raise TypeError(f"Genes are expected to be added to edge. " - f"Given type for source: {type(source_gene)} and target: {type(target_gene)}") + raise TypeError( + f"Genes are expected to be added to edge. " + f"Given type for source: {type(source_gene)} and target: {type(target_gene)}" + ) if source_gene.organism is None or target_gene.organism is None: - raise ValueError("Genes are not associated to genome. It's needed to create add genes to edge") + raise ValueError( + "Genes are not associated to genome. It's needed to create add genes to edge" + ) if source_gene.organism != target_gene.organism: - raise Exception(f"You tried to create an edge between two genes that are not even in the same genome ! " - f"(genes are '{source_gene.ID}' and '{target_gene.ID}')") + raise Exception( + f"You tried to create an edge between two genes that are not even in the same genome ! " + f"(genes are '{source_gene.ID}' and '{target_gene.ID}')" + ) self._organisms[source_gene.organism].append((source_gene, target_gene)) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index b179b266..b52278f2 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -19,7 +19,17 @@ from bokeh.plotting import ColumnDataSource, figure, save from bokeh.io import output_file from bokeh.layouts import column, row -from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, Div, Column, GlyphRenderer, RadioButtonGroup +from bokeh.models import ( + WheelZoomTool, + LabelSet, + Slider, + CustomJS, + HoverTool, + Div, + Column, + GlyphRenderer, + RadioButtonGroup, +) # local libraries from ppanggolin.pangenome import Pangenome @@ -31,10 +41,12 @@ def check_predicted_spots(pangenome): - """ checks pangenome status and .h5 files for predicted spots, raises an error if they were not predicted""" + """checks pangenome status and .h5 files for predicted spots, raises an error if they were not predicted""" if pangenome.status["spots"] == "No": - raise Exception("You are trying to draw spots for a pangenome that does not have spots predicted. " - "Please see the 'spot' subcommand.") + raise Exception( + "You are trying to draw spots for a pangenome that does not have spots predicted. " + "Please see the 'spot' subcommand." + ) def make_colors_for_iterable(it: set) -> dict: @@ -52,11 +64,13 @@ def make_colors_for_iterable(it: set) -> dict: if element == "none": famcol[element] = "#D3D3D3" else: - famcol[element] = '#%02x%02x%02x' % (col[0], col[1], col[2]) + famcol[element] = "#%02x%02x%02x" % (col[0], col[1], col[2]) return famcol -def order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: int, set_size: int): +def order_gene_lists( + gene_lists: list, overlapping_match: int, exact_match: int, set_size: int +): """ Order all rgps the same way, and order them by similarity in gene content. @@ -86,7 +100,9 @@ def row_order_gene_lists(gene_lists: list) -> list: return gene_lists if len(gene_lists) > sys.getrecursionlimit(): - sys.setrecursionlimit(len(gene_lists)) # we need the recursion limit to be higher than the number of regions. + sys.setrecursionlimit( + len(gene_lists) + ) # we need the recursion limit to be higher than the number of regions. for index, genelist in enumerate([genelist[0] for genelist in gene_lists]): for gene in genelist: @@ -100,9 +116,13 @@ def row_order_gene_lists(gene_lists: list) -> list: all_columns.extend(rgp_indexes) data.extend([1.0] * len(rgp_indexes)) - mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(fam_dict), len(gene_lists)), dtype='float') + mat_p_a = csc_matrix( + (data, (all_indexes, all_columns)), + shape=(len(fam_dict), len(gene_lists)), + dtype="float", + ) dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense()) - hc = linkage(dist, 'single') + hc = linkage(dist, "single") dendro = dendrogram(hc, no_plot=True) @@ -111,7 +131,9 @@ def row_order_gene_lists(gene_lists: list) -> list: return new_gene_lists -def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: int, set_size: int): +def line_order_gene_lists( + gene_lists: list, overlapping_match: int, exact_match: int, set_size: int +): """ Line ordering of all rgps @@ -132,12 +154,18 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: for unclass_index in list(to_classify): border1 = [gene.family for gene in gene_lists[unclass_index][1][0]] border2 = [gene.family for gene in gene_lists[unclass_index][1][1]] - if comp_border(base_border1, border1, overlapping_match, set_size, exact_match) and \ - comp_border(base_border2, border2, overlapping_match, set_size, exact_match): + if comp_border( + base_border1, border1, overlapping_match, set_size, exact_match + ) and comp_border( + base_border2, border2, overlapping_match, set_size, exact_match + ): to_classify.discard(unclass_index) new_classify.add(unclass_index) - elif comp_border(base_border2, border1, overlapping_match, set_size, exact_match) and \ - comp_border(base_border1, border2, overlapping_match, set_size, exact_match): + elif comp_border( + base_border2, border1, overlapping_match, set_size, exact_match + ) and comp_border( + base_border1, border2, overlapping_match, set_size, exact_match + ): # reverse the order of the genes to match the 'reference' gene_lists[unclass_index][0] = gene_lists[unclass_index][0][::-1] # inverse the borders @@ -149,13 +177,21 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: # specify the new 'classified' and remove from unclassified to_classify.discard(unclass_index) new_classify.add(unclass_index) - classified |= new_classify # the newly classified will help to check the unclassified, + classified |= ( + new_classify # the newly classified will help to check the unclassified, + ) # the formerly classified are not useful for what remains (if something remains) new_classify = set() -def subgraph(spot: Spot, outname: Path, with_border: bool = True, set_size: int = 3, - multigenics: set = None, fam_to_mod: dict = None): +def subgraph( + spot: Spot, + outname: Path, + with_border: bool = True, + set_size: int = 3, + multigenics: set = None, + fam_to_mod: dict = None, +): """ Write a pangeome subgraph of the gene families of a spot in gexf format @@ -212,7 +248,8 @@ def subgraph(spot: Spot, outname: Path, with_border: bool = True, set_size: int nx.write_gexf(g, outname.absolute().as_posix()) -def is_gene_list_ordered(genes:List[Feature]): + +def is_gene_list_ordered(genes: List[Feature]): """ Check if a list of genes is ordered. """ @@ -228,7 +265,9 @@ def is_gene_list_ordered(genes:List[Feature]): return False -def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnDataSource, list): +def mk_source_data( + genelists: list, fam_col: dict, fam_to_mod: dict +) -> (ColumnDataSource, list): """ :param genelists: @@ -238,10 +277,30 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD """ partition_colors = {"shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF"} - df = {'name': [], 'ordered': [], 'strand': [], "start": [], "stop": [], "length": [], 'module': [], - 'module_color': [], 'x': [], 'y': [], 'width': [], 'family_color': [], 'partition_color': [], 'partition': [], - "family": [], "product": [], "x_label": [], "y_label": [], "label": [], "gene_type": [], 'gene_ID': [], - "gene_local_ID": []} + df = { + "name": [], + "ordered": [], + "strand": [], + "start": [], + "stop": [], + "length": [], + "module": [], + "module_color": [], + "x": [], + "y": [], + "width": [], + "family_color": [], + "partition_color": [], + "partition": [], + "family": [], + "product": [], + "x_label": [], + "y_label": [], + "label": [], + "gene_type": [], + "gene_ID": [], + "gene_local_ID": [], + } for index, gene_list in enumerate(genelists): @@ -258,7 +317,9 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD ordered = False start = first_gene.stop for gene in genelist: - relative_start = gene.start_relative_to(first_gene if ordered else last_gene) + relative_start = gene.start_relative_to( + first_gene if ordered else last_gene + ) relative_stop = gene.stop_relative_to(first_gene if ordered else last_gene) df["ordered"].append(str(ordered)) df["strand"].append(gene.strand) @@ -268,7 +329,7 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD df["gene_type"].append(gene.type) df["product"].append(gene.product) df["gene_local_ID"].append(gene.local_identifier) - df['gene_ID'].append(gene.ID) + df["gene_ID"].append(gene.ID) if "RNA" in gene.type: # dedicated values for RNA genes df["name"].append(gene.product) @@ -282,12 +343,16 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD df["family"].append(gene.family.name) df["partition"].append(gene.family.named_partition) df["family_color"].append(fam_col[gene.family]) - df["partition_color"].append(partition_colors[gene.family.named_partition]) + df["partition_color"].append( + partition_colors[gene.family.named_partition] + ) df["module"].append(fam_to_mod.get(gene.family, "none")) # df["x"].append((abs(gene.start - start) + abs(gene.stop - start)) / 2) # df["width"].append(gene.stop - gene.start) - df["x"].append((abs(relative_start - start) + abs(relative_stop - start)) / 2) + df["x"].append( + (abs(relative_start - start) + abs(relative_stop - start)) / 2 + ) df["width"].append(len(gene)) df["x_label"].append(str(int(df["x"][-1]) - int(int(df["width"][-1]) / 2))) @@ -342,7 +407,7 @@ def add_gene_tools(recs: GlyphRenderer, source_data: ColumnDataSource) -> Column """ def color_str(color_element: str) -> str: - """ + """ Javascript code to switch between partition, family and module color for the given 'color_element' :param color_element: @@ -364,31 +429,52 @@ def color_str(color_element: str) -> str: source.change.emit(); """ - radio_line_color = RadioButtonGroup(labels=["partition", "family", "module"], active=0) - radio_fill_color = RadioButtonGroup(labels=["partition", "family", "module"], active=1) - - radio_line_color.js_on_event("button_click", - CustomJS(args=dict(recs=recs, source=source_data, btn=radio_line_color), - code=color_str("line_color"))) - - radio_fill_color.js_on_event("button_click", - CustomJS(args=dict(recs=recs, source=source_data, btn=radio_fill_color), - code=color_str("fill_color"))) + radio_line_color = RadioButtonGroup( + labels=["partition", "family", "module"], active=0 + ) + radio_fill_color = RadioButtonGroup( + labels=["partition", "family", "module"], active=1 + ) + + radio_line_color.js_on_event( + "button_click", + CustomJS( + args=dict(recs=recs, source=source_data, btn=radio_line_color), + code=color_str("line_color"), + ), + ) + + radio_fill_color.js_on_event( + "button_click", + CustomJS( + args=dict(recs=recs, source=source_data, btn=radio_fill_color), + code=color_str("fill_color"), + ), + ) color_header = Div(text="Genes:") line_title = Div(text="""Color to use for gene outlines:""") fill_title = Div(text="""Color to fill genes with:""") - gene_outline_size = Slider(start=0, end=10, value=5, step=0.1, title="Gene outline size:") - gene_outline_size.js_on_change('value', CustomJS(args=dict(other=recs), - code=""" + gene_outline_size = Slider( + start=0, end=10, value=5, step=0.1, title="Gene outline size:" + ) + gene_outline_size.js_on_change( + "value", + CustomJS( + args=dict(other=recs), + code=""" other.glyph.line_width = this.value; console.log('SLIDER: active=' + this.value, this.toString()) - """ - )) + """, + ), + ) - return column(color_header, row(column(line_title, radio_line_color), column(fill_title, radio_fill_color)), - gene_outline_size) + return column( + color_header, + row(column(line_title, radio_line_color), column(fill_title, radio_fill_color)), + gene_outline_size, + ) def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet): @@ -398,24 +484,39 @@ def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet): :param source_data: :return: """ - labels = LabelSet(x='x_label', y='y_label', text='label', source=source_data, text_font_size="18px") - slider_font = Slider(start=0, end=64, value=16, step=1, title="Gene label font size in px") - slider_angle = Slider(start=0, end=pi / 2, value=0, step=0.01, title="Gene label angle in radian") - - radio_label_type = RadioButtonGroup(labels=["name", "product", "family", "local identifier", "gene ID", "none"], - active=1) - - slider_angle.js_link('value', labels, 'angle') - - slider_font.js_on_change('value', - CustomJS(args=dict(other=labels), - code="other.text_font_size = this.value+'px';" - ) - ) - - radio_label_type.js_on_event("button_click", - CustomJS(args=dict(other=labels, source=source_data, btn=radio_label_type), - code=""" + labels = LabelSet( + x="x_label", + y="y_label", + text="label", + source=source_data, + text_font_size="18px", + ) + slider_font = Slider( + start=0, end=64, value=16, step=1, title="Gene label font size in px" + ) + slider_angle = Slider( + start=0, end=pi / 2, value=0, step=0.01, title="Gene label angle in radian" + ) + + radio_label_type = RadioButtonGroup( + labels=["name", "product", "family", "local identifier", "gene ID", "none"], + active=1, + ) + + slider_angle.js_link("value", labels, "angle") + + slider_font.js_on_change( + "value", + CustomJS( + args=dict(other=labels), code="other.text_font_size = this.value+'px';" + ), + ) + + radio_label_type.js_on_event( + "button_click", + CustomJS( + args=dict(other=labels, source=source_data, btn=radio_label_type), + code=""" if(btn.active == 5){ source.data['label'] = []; for(var i=0;i (Column, LabelSet): } other.source = source; source.change.emit(); - """ - )) + """, + ), + ) label_header = Div(text="Gene labels:") - radio_title = Div(text="""Gene labels to use:""", ) - labels_block = column(label_header, row(slider_font, slider_angle), column(radio_title, radio_label_type)) + radio_title = Div( + text="""Gene labels to use:""", + ) + labels_block = column( + label_header, + row(slider_font, slider_angle), + column(radio_title, radio_label_type), + ) fig.add_layout(labels) @@ -450,21 +558,21 @@ def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, lis :param ordered_counts: :return: """ - df = {"name": [], "width": [], "occurrences": [], 'x': [], 'y': [], "x_label": []} + df = {"name": [], "width": [], "occurrences": [], "x": [], "y": [], "x_label": []} for index, gene_list in enumerate(gene_lists): genelist = gene_list[0] df["occurrences"].append(ordered_counts[index]) df["y"].append(index * 10) first_gene = genelist[0] - last_gene = genelist[-1] + last_gene = genelist[-1] if is_gene_list_ordered(genelist): # if the order has been inverted, positioning elements on the figure is different - width = abs(last_gene.stop_relative_to(first_gene ) - genelist[0].start) + width = abs(last_gene.stop_relative_to(first_gene) - genelist[0].start) df["width"].append(width) else: # order has been inverted - width = abs(last_gene.stop_relative_to(last_gene ) - last_gene.start) + width = abs(last_gene.stop_relative_to(last_gene) - last_gene.start) df["width"].append(width) df["x"].append((df["width"][-1]) / 2) @@ -477,8 +585,15 @@ def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, lis return ColumnDataSource(data=df), tooltip -def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, gene_source: ColumnDataSource, - genome_source: ColumnDataSource, nb: int, gene_labels: LabelSet): +def add_genome_tools( + fig, + gene_recs: GlyphRenderer, + genome_recs: GlyphRenderer, + gene_source: ColumnDataSource, + genome_source: ColumnDataSource, + nb: int, + gene_labels: LabelSet, +): """ :param fig: @@ -491,25 +606,47 @@ def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, :return: """ # add genome labels - genome_labels = LabelSet(x='x_label', y='y', x_offset=-20, text='name', text_align="right", - source=genome_source, text_font_size="16px") + genome_labels = LabelSet( + x="x_label", + y="y", + x_offset=-20, + text="name", + text_align="right", + source=genome_source, + text_font_size="16px", + ) fig.add_layout(genome_labels) - slider_font = Slider(start=0, end=64, value=16, step=1, title="Genome label font size in px") - slider_font.js_on_change('value', - CustomJS(args=dict(other=genome_labels), - code="other.text_font_size = this.value+'px';" - ) - ) - - slider_offset = Slider(start=-400, end=0, value=-20, step=1, title="Genome label offset") - slider_offset.js_link('value', genome_labels, 'x_offset') + slider_font = Slider( + start=0, end=64, value=16, step=1, title="Genome label font size in px" + ) + slider_font.js_on_change( + "value", + CustomJS( + args=dict(other=genome_labels), + code="other.text_font_size = this.value+'px';", + ), + ) + + slider_offset = Slider( + start=-400, end=0, value=-20, step=1, title="Genome label offset" + ) + slider_offset.js_link("value", genome_labels, "x_offset") slider_spacing = Slider(start=1, end=40, value=10, step=1, title="Genomes spacing") - slider_spacing.js_on_change('value', CustomJS( - args=dict(gene_recs=gene_recs, gene_source=gene_source, genome_recs=genome_recs, genome_source=genome_source, - nb_elements=nb, genome_labels=genome_labels, gene_labels=gene_labels), - code=""" + slider_spacing.js_on_change( + "value", + CustomJS( + args=dict( + gene_recs=gene_recs, + gene_source=gene_source, + genome_recs=genome_recs, + genome_source=genome_source, + nb_elements=nb, + genome_labels=genome_labels, + gene_labels=gene_labels, + ), + code=""" var current_val = genome_source.data['y'][genome_source.data['y'].length - 1] / (nb_elements-1); for (let i=0 ; i < genome_source.data['y'].length ; i++){ genome_source.data['y'][i] = (genome_source.data['y'][i] * this.value) / current_val; @@ -530,13 +667,21 @@ def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, genome_labels.source = genome_source; gene_source.change.emit(); genome_source.change.emit(); - """)) + """, + ), + ) genome_header = Div(text="Genomes:") return column(genome_header, slider_spacing, slider_font, slider_offset) -def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, output: Path): +def draw_curr_spot( + gene_lists: list, + ordered_counts: list, + fam_to_mod: dict, + fam_col: dict, + output: Path, +): """ :param gene_lists: @@ -552,23 +697,59 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam # generate the figure and add some tools to it wheel_zoom = WheelZoomTool() - fig = figure(title="spot graphic", width=1600, height=600, - tools=["pan", "box_zoom", "reset", "save", wheel_zoom, "ywheel_zoom", "xwheel_zoom"]) + fig = figure( + title="spot graphic", + width=1600, + height=600, + tools=[ + "pan", + "box_zoom", + "reset", + "save", + wheel_zoom, + "ywheel_zoom", + "xwheel_zoom", + ], + ) fig.axis.visible = True fig.toolbar.active_scroll = wheel_zoom # genome rectangles genome_source, genome_tooltip = mk_genomes(gene_lists, ordered_counts) - genome_recs = fig.rect(x='x', y='y', fill_color="dimgray", width="width", height=0.5, source=genome_source) - genome_recs_hover = HoverTool(renderers=[genome_recs], tooltips=genome_tooltip, mode="mouse", - point_policy="follow_mouse") + genome_recs = fig.rect( + x="x", + y="y", + fill_color="dimgray", + width="width", + height=0.5, + source=genome_source, + ) + genome_recs_hover = HoverTool( + renderers=[genome_recs], + tooltips=genome_tooltip, + mode="mouse", + point_policy="follow_mouse", + ) fig.add_tools(genome_recs_hover) # gene rectangles gene_source, gene_tooltips = mk_source_data(gene_lists, fam_col, fam_to_mod) - recs = fig.rect(x='x', y='y', line_color='line_color', fill_color='fill_color', width='width', height=2, - line_width=5, source=gene_source) - recs_hover = HoverTool(renderers=[recs], tooltips=gene_tooltips, mode="mouse", point_policy="follow_mouse") + recs = fig.rect( + x="x", + y="y", + line_color="line_color", + fill_color="fill_color", + width="width", + height=2, + line_width=5, + source=gene_source, + ) + recs_hover = HoverTool( + renderers=[recs], + tooltips=gene_tooltips, + mode="mouse", + point_policy="follow_mouse", + ) fig.add_tools(recs_hover) # gene modification tools gene_tools = add_gene_tools(recs, gene_source) @@ -577,13 +758,22 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam labels_tools, labels = add_gene_labels(fig, gene_source) # genome tool - genome_tools = add_genome_tools(fig, recs, genome_recs, gene_source, genome_source, len(gene_lists), labels) + genome_tools = add_genome_tools( + fig, recs, genome_recs, gene_source, genome_source, len(gene_lists), labels + ) save(column(fig, row(labels_tools, gene_tools), row(genome_tools))) -def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: Pangenome, output: Path, - overlapping_match: int, exact_match: int, set_size: int, disable_bar: bool = False): +def draw_selected_spots( + selected_spots: Union[List[Spot], Set[Spot]], + pangenome: Pangenome, + output: Path, + overlapping_match: int, + exact_match: int, + set_size: int, + disable_bar: bool = False, +): """ Draw only the selected spot and give parameters @@ -596,7 +786,9 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: :param disable_bar: Allow preventing bar progress print """ - logging.getLogger("PPanGGOLiN").info("Ordering genes among regions, and drawing spots...") + logging.getLogger("PPanGGOLiN").info( + "Ordering genes among regions, and drawing spots..." + ) multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) @@ -605,30 +797,56 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: for fam in mod.families: fam2mod[fam] = f"module_{mod.ID}" - for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): + for spot in tqdm( + selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar + ): basename = f"spot_{str(spot.ID)}" - identical_rgp_out = output / (basename + '_identical_rgps.tsv') + identical_rgp_out = output / (basename + "_identical_rgps.tsv") # write rgps representatives and the rgps they are identical to - with open(identical_rgp_out, 'w') as out_struc: - out_struc.write('representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n') + with open(identical_rgp_out, "w") as out_struc: + out_struc.write( + "representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n" + ) for key_rgp, other_rgps in spot.get_uniq_to_rgp().items(): for rgp in other_rgps: - out_struc.write(f"{key_rgp.name}\t{key_rgp.organism.name}\t{rgp.name}\t{rgp.organism.name}\n") + out_struc.write( + f"{key_rgp.name}\t{key_rgp.organism.name}\t{rgp.name}\t{rgp.organism.name}\n" + ) fams = set() gene_lists = [] for rgp in spot.regions: contig = rgp.contig - left_border_and_in_between_genes, right_border_and_in_between_genes = rgp.get_bordering_genes(set_size, multigenics, return_only_persistents=False) + left_border_and_in_between_genes, right_border_and_in_between_genes = ( + rgp.get_bordering_genes( + set_size, multigenics, return_only_persistents=False + ) + ) # clean borders from multigenic and non persistent genes - left_border = [gene for gene in left_border_and_in_between_genes if gene.family.named_partition == "persistent" and gene.family not in multigenics] - right_border = [gene for gene in right_border_and_in_between_genes if gene.family.named_partition == "persistent" and gene.family not in multigenics] + left_border = [ + gene + for gene in left_border_and_in_between_genes + if gene.family.named_partition == "persistent" + and gene.family not in multigenics + ] + right_border = [ + gene + for gene in right_border_and_in_between_genes + if gene.family.named_partition == "persistent" + and gene.family not in multigenics + ] # in some rare case with plasmid left and right border can be made of the same genes # we use a set to only have one gene represented. - consecutive_genes_lists = contig.get_ordered_consecutive_genes(set(left_border_and_in_between_genes + right_border_and_in_between_genes + list(rgp.genes))) + consecutive_genes_lists = contig.get_ordered_consecutive_genes( + set( + left_border_and_in_between_genes + + right_border_and_in_between_genes + + list(rgp.genes) + ) + ) consecutive_genes_and_rnas_lists = [] @@ -641,10 +859,14 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: if start < rna.start < stop: rnas_toadd.append(rna) - ordered_genes_with_rnas = sorted(consecutive_genes + rnas_toadd, key=lambda x: x.start) + ordered_genes_with_rnas = sorted( + consecutive_genes + rnas_toadd, key=lambda x: x.start + ) consecutive_genes_and_rnas_lists.append(ordered_genes_with_rnas) - ordered_genes = [gene for genes in consecutive_genes_and_rnas_lists for gene in genes] + ordered_genes = [ + gene for genes in consecutive_genes_and_rnas_lists for gene in genes + ] fams |= {gene.family for gene in ordered_genes if gene.type == "CDS"} @@ -652,7 +874,9 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: famcolors = make_colors_for_iterable(fams) # order all rgps the same way, and order them by similarity in gene content - gene_lists = order_gene_lists(gene_lists, overlapping_match, exact_match, set_size) + gene_lists = order_gene_lists( + gene_lists, overlapping_match, exact_match, set_size + ) count_uniq = spot.count_uniq_ordered_set() # keep only the representative rgps for the figure @@ -666,12 +890,24 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: draw_spot_out = output / (basename + ".html") subgraph_out = output / (basename + ".gexf") - draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, draw_spot_out) - subgraph(spot, subgraph_out, set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) - logging.getLogger("PPanGGOLiN").info(f"Done drawing spot(s), they can be found in the directory: '{output}'") - - -def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: bool = False): + draw_curr_spot( + uniq_gene_lists, ordered_counts, fam2mod, famcolors, draw_spot_out + ) + subgraph( + spot, + subgraph_out, + set_size=set_size, + multigenics=multigenics, + fam_to_mod=fam2mod, + ) + logging.getLogger("PPanGGOLiN").info( + f"Done drawing spot(s), they can be found in the directory: '{output}'" + ) + + +def draw_spots( + pangenome: Pangenome, output: Path, spot_list: str, disable_bar: bool = False +): """ Main function to draw spot @@ -688,33 +924,61 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: # modules are not required to be loaded, but if they have been computed we load them. need_mod = True - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, - need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar) - - if spot_list == 'all' or any(x == 'all' for x in spot_list): - logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.") + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=False, + need_partitions=True, + need_rgp=True, + need_spots=True, + need_modules=need_mod, + disable_bar=disable_bar, + ) + + if spot_list == "all" or any(x == "all" for x in spot_list): + logging.getLogger("PPanGGOLiN").debug( + "'all' value is found in spot list, all spots are drawn." + ) selected_spots = list(pangenome.spots) - elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list): + elif spot_list == "synteny" or any(x == "synteny" for x in spot_list): logging.getLogger().debug( - "'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") - selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] + "'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn." + ) + selected_spots = [ + s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1 + ] else: - curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list} - logging.getLogger("PPanGGOLiN").debug(f'Required spots to draw: {curated_spot_list}') - selected_spots = [s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_list] + curated_spot_list = { + "spot_" + str(s) if not s.startswith("spot_") else str(s) for s in spot_list + } + logging.getLogger("PPanGGOLiN").debug( + f"Required spots to draw: {curated_spot_list}" + ) + selected_spots = [ + s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_list + ] if len(selected_spots) != len(curated_spot_list): existing_spots = {"spot_" + str(s.ID) for s in pangenome.spots} required_non_existing_spots = curated_spot_list - existing_spots logging.getLogger("PPanGGOLiN").warning( - f'{len(required_non_existing_spots)} required spots to draw do not exist: {" ".join(required_non_existing_spots)} ') + f'{len(required_non_existing_spots)} required spots to draw do not exist: {" ".join(required_non_existing_spots)} ' + ) if len(selected_spots) < 10: - logging.getLogger("PPanGGOLiN").info(f"Drawing the following spots: " - f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") + logging.getLogger("PPanGGOLiN").info( + f"Drawing the following spots: " + f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}" + ) else: logging.getLogger("PPanGGOLiN").info(f"Drawing {len(selected_spots)} spots") - draw_selected_spots(selected_spots, pangenome, output, - overlapping_match=pangenome.parameters["spot"]["overlapping_match"], - exact_match=pangenome.parameters["spot"]["exact_match_size"], - set_size=pangenome.parameters["spot"]["set_size"], disable_bar=disable_bar) + draw_selected_spots( + selected_spots, + pangenome, + output, + overlapping_match=pangenome.parameters["spot"]["overlapping_match"], + exact_match=pangenome.parameters["spot"]["exact_match_size"], + set_size=pangenome.parameters["spot"]["set_size"], + disable_bar=disable_bar, + ) diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index efe2e441..46c67447 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -24,9 +24,12 @@ def check_spot_args(args: argparse.Namespace): :type args: argparse.Namespace :raises argparse.ArgumentError: If args.spots is specified but args.draw_spots is False. """ - default_arg_spots = 'all' + default_arg_spots = "all" if not args.draw_spots and args.spots != default_arg_spots: - raise argparse.ArgumentError(None, "The --spots argument cannot be used when --draw_spots is not specified.") + raise argparse.ArgumentError( + None, + "The --spots argument cannot be used when --draw_spots is not specified.", + ) def launch(args: argparse.Namespace): @@ -43,13 +46,29 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) if args.tile_plot: - draw_tile_plot(pangenome, args.output, args.nocloud, draw_dendrogram=args.add_dendrogram, disable_bar=args.disable_prog_bar, - add_metadata=args.add_metadata, - metadata_sources=args.metadata_sources) + draw_tile_plot( + pangenome, + args.output, + args.nocloud, + draw_dendrogram=args.add_dendrogram, + disable_bar=args.disable_prog_bar, + add_metadata=args.add_metadata, + metadata_sources=args.metadata_sources, + ) if args.ucurve: - draw_ucurve(pangenome, args.output, soft_core=args.soft_core, disable_bar=args.disable_prog_bar) + draw_ucurve( + pangenome, + args.output, + soft_core=args.soft_core, + disable_bar=args.disable_prog_bar, + ) if args.draw_spots: - draw_spots(pangenome=pangenome, output=args.output, spot_list=args.spots, disable_bar=args.disable_prog_bar) + draw_spots( + pangenome=pangenome, + output=args.output, + spot_list=args.spots, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -61,7 +80,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("draw", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "draw", formatter_class=argparse.RawTextHelpFormatter + ) parser_draw(parser) return parser @@ -73,34 +94,73 @@ def parser_draw(parser: argparse.ArgumentParser): :param parser: parser for align argument """ date = time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome.h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path(f'ppanggolin_output{date}_PID{str(os.getpid())}'), - help="Output directory") - optional.add_argument("--tile_plot", required=False, default=False, action="store_true", - help="draw the tile plot of the pangenome") - - optional.add_argument("--soft_core", required=False, default=0.95, type=restricted_float, - help="Soft core threshold to use") - optional.add_argument("--ucurve", required=False, default=False, action="store_true", - help="draw the U-curve of the pangenome") - optional.add_argument("--draw_spots", required=False, default=False, action="store_true", - help="draw plots for spots of the pangenome") - optional.add_argument("--spots", required=False, default='all', nargs='+', - help="a comma-separated list of spots to draw (or 'all' to draw all spots, or 'synteny' to draw spots with different RGP syntenies).") - - optional.add_argument("--nocloud", required=False, default=False, action="store_true", - help="Do not draw the cloud genes in the tile plot") + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path(f"ppanggolin_output{date}_PID{str(os.getpid())}"), + help="Output directory", + ) + optional.add_argument( + "--tile_plot", + required=False, + default=False, + action="store_true", + help="draw the tile plot of the pangenome", + ) + + optional.add_argument( + "--soft_core", + required=False, + default=0.95, + type=restricted_float, + help="Soft core threshold to use", + ) + optional.add_argument( + "--ucurve", + required=False, + default=False, + action="store_true", + help="draw the U-curve of the pangenome", + ) + optional.add_argument( + "--draw_spots", + required=False, + default=False, + action="store_true", + help="draw plots for spots of the pangenome", + ) + optional.add_argument( + "--spots", + required=False, + default="all", + nargs="+", + help="a comma-separated list of spots to draw (or 'all' to draw all spots, or 'synteny' to draw spots with different RGP syntenies).", + ) + + optional.add_argument( + "--nocloud", + required=False, + default=False, + action="store_true", + help="Do not draw the cloud genes in the tile plot", + ) optional.add_argument( "--add_dendrogram", required=False, default=False, action="store_true", - help="Include a dendrogram for genomes in the tile plot based on the presence/absence of gene families." + help="Include a dendrogram for genomes in the tile plot based on the presence/absence of gene families.", ) optional.add_argument( @@ -108,24 +168,26 @@ def parser_draw(parser: argparse.ArgumentParser): required=False, default=False, action="store_true", - help="Display gene metadata as hover text for each cell in the tile plot." + help="Display gene metadata as hover text for each cell in the tile plot.", ) - optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written in the tile plot. " - "By default all metadata sources are included.") - + optional.add_argument( + "--metadata_sources", + default=None, + nargs="+", + help="Which source of metadata should be written in the tile plot. " + "By default all metadata sources are included.", + ) -if __name__ == '__main__': +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_draw(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index be130727..e1e40d33 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -20,13 +20,16 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.utils import jaccard_similarities -def draw_tile_plot(pangenome: Pangenome, - output: Path, - nocloud: bool = False, - draw_dendrogram: bool = False, - add_metadata:bool=False, - metadata_sources:Optional[Set[str]]=None, - disable_bar: bool = False): + +def draw_tile_plot( + pangenome: Pangenome, + output: Path, + nocloud: bool = False, + draw_dendrogram: bool = False, + add_metadata: bool = False, + metadata_sources: Optional[Set[str]] = None, + disable_bar: bool = False, +): """ Draw a tile plot from a partitioned pangenome. @@ -38,9 +41,19 @@ def draw_tile_plot(pangenome: Pangenome, """ # Check if the pangenome has the required information and is partitioned - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar, need_metadata=add_metadata, sources=metadata_sources) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=True, + disable_bar=disable_bar, + need_metadata=add_metadata, + sources=metadata_sources, + ) if pangenome.status["partitioned"] == "No": - raise Exception("Cannot draw the tile plot as the pangenome has not been partitioned.") + raise Exception( + "Cannot draw the tile plot as the pangenome has not been partitioned." + ) # Warn if there are more than 32767 genomes, as the output might be too large for browsers to handle if pangenome.number_of_organisms > 32767: @@ -55,8 +68,9 @@ def draw_tile_plot(pangenome: Pangenome, "You can use the --nocloud flag to exclude cloud families from the plot. " ) - - logging.getLogger("PPanGGOLiN").info("Starting the process of drawing the tile plot...") + logging.getLogger("PPanGGOLiN").info( + "Starting the process of drawing the tile plot..." + ) # Prepare the data structures required for generating the tile plot families, org_index = prepare_data_structures(pangenome, nocloud) @@ -66,19 +80,32 @@ def draw_tile_plot(pangenome: Pangenome, order_organisms, dendrogram_fig = generate_dendrogram(mat_p_a, org_index) # Process the data to be displayed in the tile plot - binary_data, text_data, fam_order, separators = process_tile_data(families, order_organisms) + binary_data, text_data, fam_order, separators = process_tile_data( + families, order_organisms + ) # Create the tile plot figure with or without the dendrogram - fig = create_tile_plot(binary_data, text_data, fam_order, separators, order_organisms, dendrogram_fig, draw_dendrogram) + fig = create_tile_plot( + binary_data, + text_data, + fam_order, + separators, + order_organisms, + dendrogram_fig, + draw_dendrogram, + ) # Save the plot to the specified output directory filename = output / "tile_plot.html" fig.write_html(filename) - logging.getLogger("PPanGGOLiN").info(f"Tile plot successfully created and saved to: '{filename}'.") + logging.getLogger("PPanGGOLiN").info( + f"Tile plot successfully created and saved to: '{filename}'." + ) return fig + def prepare_data_structures(pangenome: Pangenome, nocloud: bool) -> Tuple[set, dict]: """ Prepare data structures required for generating the tile plot. @@ -90,7 +117,9 @@ def prepare_data_structures(pangenome: Pangenome, nocloud: bool) -> Tuple[set, d # Exclude gene families in the cloud partition if 'nocloud' is True; otherwise, include all gene families if nocloud: - families = {fam for fam in pangenome.gene_families if not fam.partition.startswith("C")} + families = { + fam for fam in pangenome.gene_families if not fam.partition.startswith("C") + } else: families = set(pangenome.gene_families) @@ -98,6 +127,7 @@ def prepare_data_structures(pangenome: Pangenome, nocloud: bool) -> Tuple[set, d org_index = pangenome.get_org_index() return families, org_index + def build_presence_absence_matrix(families: set, org_index: dict) -> csc_matrix: """ Build the presence-absence matrix for gene families. @@ -121,10 +151,15 @@ def build_presence_absence_matrix(families: set, org_index: dict) -> csc_matrix: data.extend([1.0] * len(new_col)) # All presences are marked with 1.0 # Construct the presence-absence matrix using Compressed Sparse Column format - mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(families), len(org_index)), dtype='float') + mat_p_a = csc_matrix( + (data, (all_indexes, all_columns)), + shape=(len(families), len(org_index)), + dtype="float", + ) return mat_p_a + def generate_dendrogram(mat_p_a: csc_matrix, org_index: dict) -> Tuple[List, go.Figure]: """ Generate the order of organisms based on a dendrogram. @@ -144,20 +179,31 @@ def generate_dendrogram(mat_p_a: csc_matrix, org_index: dict) -> Tuple[List, go. distance_matrice = 1 - jaccard_similarities(mat_p_a, 0).todense() # Create a dendrogram figure using the computed distance matrix - dendrogram_fig = ff.create_dendrogram(distance_matrice, labels=genom_names, orientation='bottom') + dendrogram_fig = ff.create_dendrogram( + distance_matrice, labels=genom_names, orientation="bottom" + ) # Adjust the dendrogram figure to make it match with the heatmap later on - for i in range(len(dendrogram_fig['data'])): - dendrogram_fig['data'][i]['yaxis'] = 'y2' # Aligns dendrogram data on a secondary y-axis - dendrogram_fig['data'][i]['showlegend'] = False # Hides legends in the dendrogram + for i in range(len(dendrogram_fig["data"])): + dendrogram_fig["data"][i][ + "yaxis" + ] = "y2" # Aligns dendrogram data on a secondary y-axis + dendrogram_fig["data"][i][ + "showlegend" + ] = False # Hides legends in the dendrogram # Extract the ordered list of organisms from the dendrogram tick labels - order_organisms = [name_to_org[org_name] for org_name in dendrogram_fig['layout']['xaxis']['ticktext']] + order_organisms = [ + name_to_org[org_name] + for org_name in dendrogram_fig["layout"]["xaxis"]["ticktext"] + ] return order_organisms, dendrogram_fig -def process_tile_data(families: set, order_organisms: List) -> Tuple[List[List[float]], List[List[str]], List[str], List[Tuple[str, float]]]: +def process_tile_data( + families: set, order_organisms: List +) -> Tuple[List[List[float]], List[List[str]], List[str], List[Tuple[str, float]]]: """ Process data for each tile in the plot. @@ -181,8 +227,22 @@ def process_tile_data(families: set, order_organisms: List) -> Tuple[List[List[f for node in ordered_nodes: fam_order.append(node.name) data = set(node.organisms) - binary_data.append([len(list(node.get_genes_per_org(org))) if org in data else np.nan for org in order_organisms]) - text_data.append([("\n".join(map(str, node.get_genes_per_org(org))) if org in data else np.nan) for org in order_organisms]) + binary_data.append( + [ + len(list(node.get_genes_per_org(org))) if org in data else np.nan + for org in order_organisms + ] + ) + text_data.append( + [ + ( + "\n".join(map(str, node.get_genes_per_org(org))) + if org in data + else np.nan + ) + for org in order_organisms + ] + ) # Generate hover text for the heatmap text_data = get_heatmap_hover_text(ordered_nodes, order_organisms) @@ -190,7 +250,9 @@ def process_tile_data(families: set, order_organisms: List) -> Tuple[List[List[f return binary_data, text_data, fam_order, separators -def order_nodes(partitions_dict: dict, shell_subs: set) -> Tuple[List, List[Tuple[str, float]]]: +def order_nodes( + partitions_dict: dict, shell_subs: set +) -> Tuple[List, List[Tuple[str, float]]]: """ Order gene families based on their partitions. @@ -200,8 +262,12 @@ def order_nodes(partitions_dict: dict, shell_subs: set) -> Tuple[List, List[Tupl """ # Sort persistent and cloud partitions by the number of organisms in descending order - ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: n.number_of_organisms, reverse=True) - ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: n.number_of_organisms, reverse=True) + ordered_nodes_p = sorted( + partitions_dict["P"], key=lambda n: n.number_of_organisms, reverse=True + ) + ordered_nodes_c = sorted( + partitions_dict["C"], key=lambda n: n.number_of_organisms, reverse=True + ) partition_separators = [("Persistent", len(ordered_nodes_p) - 0.5)] ordered_nodes = ordered_nodes_p @@ -209,13 +275,21 @@ def order_nodes(partitions_dict: dict, shell_subs: set) -> Tuple[List, List[Tupl # Sort shell subpartitions and add them to the ordered nodes list for subpartition in sorted(shell_subs): partition_name = "Shell" if len(shell_subs) == 1 else f"Shell_{subpartition}" - ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: n.number_of_organisms, reverse=True) + ordered_nodes_s = sorted( + partitions_dict[subpartition], + key=lambda n: n.number_of_organisms, + reverse=True, + ) ordered_nodes += ordered_nodes_s - partition_separators.append((partition_name, partition_separators[-1][1] + len(ordered_nodes_s))) + partition_separators.append( + (partition_name, partition_separators[-1][1] + len(ordered_nodes_s)) + ) # Append cloud partition to the ordered nodes list ordered_nodes += ordered_nodes_c - partition_separators.append(("Cloud", partition_separators[-1][1] + len(ordered_nodes_c))) + partition_separators.append( + ("Cloud", partition_separators[-1][1] + len(ordered_nodes_c)) + ) return ordered_nodes, partition_separators @@ -224,7 +298,7 @@ def create_partition_shapes( separators: List[Tuple[str, float]], xval_max: float, heatmap_row: int, - partition_to_color: Dict[str, str] + partition_to_color: Dict[str, str], ) -> List[dict]: """ Create the shapes for plot separators to visually distinguish partitions in the plot. @@ -238,32 +312,62 @@ def create_partition_shapes( shapes = [] sep_prec = 0 - xref = 'x1' - yref = f'y{heatmap_row}' + xref = "x1" + yref = f"y{heatmap_row}" for partition_name, sep in separators: color = partition_to_color[partition_name] # Left vertical line for partition separator - shapes.append(dict( - type='line', x0=-1, x1=-1, y0=sep_prec, y1=sep, - line=dict(width=10, color=color), xref=xref, yref=yref, - name=partition_name, showlegend=True, legendgroup=partition_name - )) + shapes.append( + dict( + type="line", + x0=-1, + x1=-1, + y0=sep_prec, + y1=sep, + line=dict(width=10, color=color), + xref=xref, + yref=yref, + name=partition_name, + showlegend=True, + legendgroup=partition_name, + ) + ) # Right vertical line for partition separator - shapes.append(dict( - type='line', x0=xval_max, x1=xval_max, y0=sep_prec, y1=sep, - line=dict(width=10, color=color), xref=xref, yref=yref, - name=partition_name, showlegend=False, legendgroup=partition_name - )) + shapes.append( + dict( + type="line", + x0=xval_max, + x1=xval_max, + y0=sep_prec, + y1=sep, + line=dict(width=10, color=color), + xref=xref, + yref=yref, + name=partition_name, + showlegend=False, + legendgroup=partition_name, + ) + ) # Horizontal line across the partition boundary - shapes.append(dict( - type='line', x0=-1, x1=xval_max, y0=sep, y1=sep, - line=dict(width=1, color=color), xref=xref, yref=yref, - name=partition_name, showlegend=False, legendgroup=partition_name - )) + shapes.append( + dict( + type="line", + x0=-1, + x1=xval_max, + y0=sep, + y1=sep, + line=dict(width=1, color=color), + xref=xref, + yref=yref, + name=partition_name, + showlegend=False, + legendgroup=partition_name, + ) + ) sep_prec = sep @@ -277,17 +381,22 @@ def metadata_stringify(gene) -> str: :param gene: The gene object with potential metadata. :return: A formatted string containing gene metadata information. """ - metadata_str = '' + metadata_str = "" if gene.has_metadata(): - metadata_str = f'

{gene.ID} metadata' + metadata_str = f"

{gene.ID} metadata" for metadata in gene.metadata: metadata_str += f"
metadata source: {metadata.source}
" metadata_dict = metadata.to_dict() - metadata_str += '
'.join((f"{key}: {value}" for key, value in metadata_dict.items())) + metadata_str += "
".join( + (f"{key}: {value}" for key, value in metadata_dict.items()) + ) return metadata_str -def get_heatmap_hover_text(ordered_families: List, order_organisms: List) -> List[List[str]]: + +def get_heatmap_hover_text( + ordered_families: List, order_organisms: List +) -> List[List[str]]: """ Generate hover text for the heatmap cells. @@ -304,18 +413,28 @@ def get_heatmap_hover_text(ordered_families: List, order_organisms: List) -> Lis if org in family.organisms: # gene_count = len(list(family.get_genes_per_org(org))) genes = ";".join(map(str, family.get_genes_per_org(org))) - names = ";".join((gene.name for gene in family.get_genes_per_org(org) if gene.name)) + names = ";".join( + (gene.name for gene in family.get_genes_per_org(org) if gene.name) + ) # Compile additional information about genes extra_gene_info = f"genes:{genes}" if names: - extra_gene_info += f'
names:{names}' - - metadata = "
".join((metadata_stringify(gene) for gene in family.get_genes_per_org(org) if gene.has_metadata())) + extra_gene_info += f"
names:{names}" + + metadata = "
".join( + ( + metadata_stringify(gene) + for gene in family.get_genes_per_org(org) + if gene.has_metadata() + ) + ) extra_gene_info += metadata else: # gene_count = 0 - extra_gene_info = np.nan # Using np.nan instead of numpy.nan for consistency with numpy import + extra_gene_info = ( + np.nan + ) # Using np.nan instead of numpy.nan for consistency with numpy import # To get a more explicit hover. But is quite heavy on the finam html # gene_info = f"genome:{org.name}
family:{family.name}
gene_count:{gene_count}
{extra_gene_info}" @@ -327,14 +446,17 @@ def get_heatmap_hover_text(ordered_families: List, order_organisms: List) -> Lis return text_data + def create_tile_plot( binary_data: List[List[float]], text_data: List[List[str]], fam_order: List[str], partition_separator: List[tuple], - order_organisms: List[Organism], # Replace 'Any' with the appropriate type if available + order_organisms: List[ + Organism + ], # Replace 'Any' with the appropriate type if available dendrogram_fig: go.Figure, - draw_dendrogram: bool + draw_dendrogram: bool, ) -> go.Figure: """ Create the heatmap tile plot using Plotly. @@ -351,71 +473,90 @@ def create_tile_plot( xaxis_values = [org.name for org in order_organisms] - heatmap_color = {"presence":"#005AB5", # blue - "multicopy":'#DC3220' # red - } + heatmap_color = {"presence": "#005AB5", "multicopy": "#DC3220"} # blue # red - green_colors = ['rgb(35,139,69)', - 'rgb(65,171,93)', - 'rgb(116,196,118)', - 'rgb(161,217,155)', - 'rgb(199,233,192)', - 'rgb(229,245,224)'] + green_colors = [ + "rgb(35,139,69)", + "rgb(65,171,93)", + "rgb(116,196,118)", + "rgb(161,217,155)", + "rgb(199,233,192)", + "rgb(229,245,224)", + ] shell_color_generator = cycle(green_colors) - partition_to_color = {"Persistent": "#F7A507", "Cloud": "#79DEFF", "Shell_S1": "#00D860"} - partition_to_color.update({partition:next(shell_color_generator) for partition, _ in partition_separator if partition not in partition_to_color}) - - - heatmap = [go.Heatmap(z=binary_data, - x=xaxis_values, - y=fam_order, - text=text_data, - zauto=False, - zmin=0, - zmax=2, - autocolorscale=False, - hovertemplate = 'genome: %{x}
family: %{y}
gene_count: %{z}
%{text} ', - colorscale=[[0, '#ffffff'],[0.33, '#ffffff'], - [0.33, heatmap_color['presence']],[0.66, heatmap_color['presence']], - [0.66, heatmap_color['multicopy']], [1, heatmap_color['multicopy']]], - - colorbar=dict(title='Presence/Absence', - titleside='top', - tickmode='array', - tickvals=[0.33, 1, 1.66], - ticktext=['Absence', 'Presence', 'Multicopy'], - len=0.27, - outlinecolor='black', - outlinewidth=0.5, - ticks=None, orientation='v'))] - + partition_to_color = { + "Persistent": "#F7A507", + "Cloud": "#79DEFF", + "Shell_S1": "#00D860", + } + partition_to_color.update( + { + partition: next(shell_color_generator) + for partition, _ in partition_separator + if partition not in partition_to_color + } + ) + heatmap = [ + go.Heatmap( + z=binary_data, + x=xaxis_values, + y=fam_order, + text=text_data, + zauto=False, + zmin=0, + zmax=2, + autocolorscale=False, + hovertemplate="genome: %{x}
family: %{y}
gene_count: %{z}
%{text} ", + colorscale=[ + [0, "#ffffff"], + [0.33, "#ffffff"], + [0.33, heatmap_color["presence"]], + [0.66, heatmap_color["presence"]], + [0.66, heatmap_color["multicopy"]], + [1, heatmap_color["multicopy"]], + ], + colorbar=dict( + title="Presence/Absence", + titleside="top", + tickmode="array", + tickvals=[0.33, 1, 1.66], + ticktext=["Absence", "Presence", "Multicopy"], + len=0.27, + outlinecolor="black", + outlinewidth=0.5, + ticks=None, + orientation="v", + ), + ) + ] if draw_dendrogram: heatmap_row = 2 - fig = make_subplots(rows=2, cols=1, - shared_xaxes=True, - vertical_spacing=0.01, - row_heights=[0.2, 0.8]) + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.01, + row_heights=[0.2, 0.8], + ) - for data in dendrogram_fig['data']: - fig.add_trace(data, row=1, col=1) + for data in dendrogram_fig["data"]: + fig.add_trace(data, row=1, col=1) else: heatmap_row = 1 fig = make_subplots(rows=1, cols=1) - - heatmap[0]['x'] = dendrogram_fig['layout']['xaxis']['tickvals'] + heatmap[0]["x"] = dendrogram_fig["layout"]["xaxis"]["tickvals"] for data in heatmap: fig.add_trace(data, row=heatmap_row, col=1) - layout = go.Layout(title="Presence-Absence Matrix", - plot_bgcolor='#ffffff') + layout = go.Layout(title="Presence-Absence Matrix", plot_bgcolor="#ffffff") if draw_dendrogram: layout.xaxis2 = dendrogram_fig.layout.xaxis @@ -424,39 +565,47 @@ def create_tile_plot( fig.update_layout(layout) - - - fig.update_xaxes( - ticklen=0, - title="Genomes" - ) + fig.update_xaxes(ticklen=0, title="Genomes") fig.update_yaxes( ticklen=0, - title='Gene Families', + title="Gene Families", tickfont=dict(size=10), automargin=True, ) if draw_dendrogram: fig.layout.yaxis.title = None - fig.layout.yaxis2.title = dict(text='Gene Families') + fig.layout.yaxis2.title = dict(text="Gene Families") fig.layout.xaxis.title = None - xmax = dendrogram_fig['layout']['xaxis']['tickvals'][-1] + dendrogram_fig['layout']['xaxis']['tickvals'][0] + 0.5 - shapes = create_partition_shapes(partition_separator, xmax, heatmap_row, partition_to_color) + xmax = ( + dendrogram_fig["layout"]["xaxis"]["tickvals"][-1] + + dendrogram_fig["layout"]["xaxis"]["tickvals"][0] + + 0.5 + ) + shapes = create_partition_shapes( + partition_separator, xmax, heatmap_row, partition_to_color + ) - fig.update_layout(go.Layout(shapes=shapes, - showlegend=True, - )) + fig.update_layout( + go.Layout( + shapes=shapes, + showlegend=True, + ) + ) - fig.update_layout(legend=dict( - title="Family Partition", - traceorder="reversed", - )) + fig.update_layout( + legend=dict( + title="Family Partition", + traceorder="reversed", + ) + ) - fig.update_layout({'width':1000, 'height':1000, - }) + fig.update_layout( + { + "width": 1000, + "height": 1000, + } + ) return fig - - diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index 485055e1..75133162 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -12,7 +12,12 @@ from ppanggolin.pangenome import Pangenome -def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, disable_bar: bool = False): +def draw_ucurve( + pangenome: Pangenome, + output: Path, + soft_core: float = 0.95, + disable_bar: bool = False, +): """ :param pangenome: Partitioned pangenome @@ -21,7 +26,13 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di :param disable_bar: Allow to disable progress bar :return: """ - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=True, + disable_bar=disable_bar, + ) logging.getLogger("PPanGGOLiN").info("Drawing the U-shaped curve...") max_bar = 0 count = defaultdict(lambda: defaultdict(int)) @@ -35,14 +46,30 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di has_undefined = True count[nb_org][fam.named_partition] += 1 count[nb_org]["pangenome"] += 1 - max_bar = count[nb_org]["pangenome"] if count[nb_org]["pangenome"] > max_bar else max_bar + max_bar = ( + count[nb_org]["pangenome"] + if count[nb_org]["pangenome"] > max_bar + else max_bar + ) data_plot = [] chao = "NA" if count[1]["pangenome"] > 0: - chao = round(pangenome.number_of_gene_families + (pow(count[1]["pangenome"], 2) / (count[2]["pangenome"] * 2)), 2) - colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", - "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", - "undefined": "#828282"} + chao = round( + pangenome.number_of_gene_families + + (pow(count[1]["pangenome"], 2) / (count[2]["pangenome"] * 2)), + 2, + ) + colors = { + "pangenome": "black", + "exact_accessory": "#EB37ED", + "exact_core": "#FF2828", + "soft_core": "#c7c938", + "soft_accessory": "#996633", + "shell": "#00D860", + "persistent": "#F7A507", + "cloud": "#79DEFF", + "undefined": "#828282", + } if is_partitioned and not has_undefined: persistent_values = [] @@ -52,29 +79,68 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di persistent_values.append(count[nb_org]["persistent"]) shell_values.append(count[nb_org]["shell"]) cloud_values.append(count[nb_org]["cloud"]) - data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=persistent_values, name='persistent', - marker=dict(color=colors["persistent"]))) - data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=shell_values, name='shell', - marker=dict(color=colors["shell"]))) - data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=cloud_values, name='cloud', - marker=dict(color=colors["cloud"]))) + data_plot.append( + go.Bar( + x=list(range(1, pangenome.number_of_organisms + 1)), + y=persistent_values, + name="persistent", + marker=dict(color=colors["persistent"]), + ) + ) + data_plot.append( + go.Bar( + x=list(range(1, pangenome.number_of_organisms + 1)), + y=shell_values, + name="shell", + marker=dict(color=colors["shell"]), + ) + ) + data_plot.append( + go.Bar( + x=list(range(1, pangenome.number_of_organisms + 1)), + y=cloud_values, + name="cloud", + marker=dict(color=colors["cloud"]), + ) + ) else: - text = 'undefined' if has_undefined else "pangenome" - undefined_values = [count[nb_org][text] - for nb_org - in range(1, pangenome.number_of_organisms + 1)] + text = "undefined" if has_undefined else "pangenome" + undefined_values = [ + count[nb_org][text] + for nb_org in range(1, pangenome.number_of_organisms + 1) + ] - data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=undefined_values, name=text, - marker=dict(color=colors[text]))) + data_plot.append( + go.Bar( + x=list(range(1, pangenome.number_of_organisms + 1)), + y=undefined_values, + name=text, + marker=dict(color=colors[text]), + ) + ) x = pangenome.number_of_organisms * soft_core - layout = go.Layout(title="Gene families frequency distribution (U shape), chao=" + str(chao), - xaxis=dict(title='Occurring in x genomes'), - yaxis=dict(title='# of gene families (F)'), - barmode='stack', - shapes=[dict(type='line', x0=x, x1=x, y0=0, y1=max_bar, - line=dict(dict(width=5, dash='dashdot', color="grey")))], - plot_bgcolor='#ffffff') + layout = go.Layout( + title="Gene families frequency distribution (U shape), chao=" + str(chao), + xaxis=dict(title="Occurring in x genomes"), + yaxis=dict(title="# of gene families (F)"), + barmode="stack", + shapes=[ + dict( + type="line", + x0=x, + x1=x, + y0=0, + y1=max_bar, + line=dict(dict(width=5, dash="dashdot", color="grey")), + ) + ], + plot_bgcolor="#ffffff", + ) fig = go.Figure(data=data_plot, layout=layout) - out_plotly.plot(fig, filename=output.as_posix() + "/Ushaped_plot.html", auto_open=False) - logging.getLogger("PPanGGOLiN").info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") + out_plotly.plot( + fig, filename=output.as_posix() + "/Ushaped_plot.html", auto_open=False + ) + logging.getLogger("PPanGGOLiN").info( + f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'" + ) diff --git a/ppanggolin/formats/__init__.py b/ppanggolin/formats/__init__.py index accc36c2..1cdfd28e 100644 --- a/ppanggolin/formats/__init__.py +++ b/ppanggolin/formats/__init__.py @@ -4,4 +4,4 @@ from .writeSequences import subparser, launch from .writeMSA import subparser, launch from .writeFlatGenomes import subparser, launch -from .writeFlatMetadata import subparser, launch \ No newline at end of file +from .writeFlatMetadata import subparser, launch diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index da370e01..46bc61a5 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -25,8 +25,18 @@ class Genedata: genedata table """ - def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, - genetic_code: int, coordinates: List[Tuple[int]] = None): + def __init__( + self, + start: int, + stop: int, + strand: str, + gene_type: str, + position: int, + name: str, + product: str, + genetic_code: int, + coordinates: List[Tuple[int]] = None, + ): """Constructor method :param start: Gene start position @@ -50,24 +60,36 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: self.coordinates = coordinates def __eq__(self, other): - return self.start == other.start \ - and self.stop == other.stop \ - and self.strand == other.strand \ - and self.gene_type == other.gene_type \ - and self.position == other.position \ - and self.name == other.name \ - and self.product == other.product \ - and self.genetic_code == other.genetic_code \ - and self.coordinates == other.coordinates \ - + return ( + self.start == other.start + and self.stop == other.stop + and self.strand == other.strand + and self.gene_type == other.gene_type + and self.position == other.position + and self.name == other.name + and self.product == other.product + and self.genetic_code == other.genetic_code + and self.coordinates == other.coordinates + ) def __hash__(self): - return hash((self.start, self.stop, self.strand, self.gene_type, self.position, - self.name, self.product, self.genetic_code, tuple(self.coordinates))) + return hash( + ( + self.start, + self.stop, + self.strand, + self.gene_type, + self.position, + self.name, + self.product, + self.genetic_code, + tuple(self.coordinates), + ) + ) def get_number_of_organisms(pangenome: Pangenome) -> int: - """ Standalone function to get the number of organisms in a pangenome + """Standalone function to get the number of organisms in a pangenome :param pangenome: Annotated pangenome @@ -76,7 +98,9 @@ def get_number_of_organisms(pangenome: Pangenome) -> int: if hasattr(pangenome, "file"): filename = pangenome.file else: - raise FileNotFoundError("The provided pangenome does not have an associated .h5 file") + raise FileNotFoundError( + "The provided pangenome does not have an associated .h5 file" + ) h5f = tables.open_file(filename, "r") annotations = h5f.root.annotations @@ -113,14 +137,18 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): pangenome.status["ppanggolin_version"] = str(status_group._v_attrs.version) else: logging.getLogger("PPanGGOLiN").error( - f'The provided pangenome file {pangenome_file} does not have a version stored in its status.' - ' This issue may indicate that the file is corrupted.') + f"The provided pangenome file {pangenome_file} does not have a version stored in its status." + " This issue may indicate that the file is corrupted." + ) pangenome.status["ppanggolin_version"] = None if status_group._v_attrs.Partitioned: pangenome.status["partitioned"] = "inFile" - if hasattr(status_group._v_attrs, "predictedRGP") and status_group._v_attrs.predictedRGP: + if ( + hasattr(status_group._v_attrs, "predictedRGP") + and status_group._v_attrs.predictedRGP + ): pangenome.status["predictedRGP"] = "inFile" if hasattr(status_group._v_attrs, "spots") and status_group._v_attrs.spots: @@ -173,26 +201,33 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: start = int(row["start"]) stop = int(row["stop"]) - if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: + if ( + "has_joined_coordinates" in row.dtype.names + and row["has_joined_coordinates"] + ): # manage gene with joined coordinates if the info exists try: coordinates = genedata_id_to_coordinates[row["genedata_id"]] except KeyError: - raise KeyError(f'Genedata {row["genedata_id"]} is supposed to have joined ' - 'coordinates but is not found in annotations.joinCoordinates table') + raise KeyError( + f'Genedata {row["genedata_id"]} is supposed to have joined ' + "coordinates but is not found in annotations.joinCoordinates table" + ) else: coordinates = [(start, stop)] - genedata = Genedata(start=start, - stop=stop, - strand=row["strand"].decode(), - gene_type=row["gene_type"].decode(), - position=int(row["position"]), - name=row["name"].decode(), - product=row["product"].decode(), - genetic_code=int(row["genetic_code"]), - coordinates=coordinates) + genedata = Genedata( + start=start, + stop=stop, + strand=row["strand"].decode(), + gene_type=row["gene_type"].decode(), + position=int(row["position"]), + name=row["name"].decode(), + product=row["product"].decode(), + genetic_code=int(row["genetic_code"]), + coordinates=coordinates, + ) genedata_id = row["genedata_id"] genedata_id2genedata[genedata_id] = genedata @@ -220,12 +255,15 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: genedata_id = row["genedata_id"] genedata_id_to_coordinates[genedata_id].append( - (int(row["coordinate_rank"]), int(row["start"]), int(row["stop"]))) + (int(row["coordinate_rank"]), int(row["start"]), int(row["stop"])) + ) # sort coordinate by their rank genedata_id_to_sorted_coordinates = {} for genedata_id, coordinates in genedata_id_to_coordinates.items(): - sorted_coordinates = [(start, stop) for rank, start, stop in sorted(coordinates)] + sorted_coordinates = [ + (start, stop) for rank, start, stop in sorted(coordinates) + ] genedata_id_to_sorted_coordinates[genedata_id] = sorted_coordinates return genedata_id_to_sorted_coordinates @@ -240,12 +278,13 @@ def read_sequences(h5f: tables.File) -> dict: table = h5f.root.annotations.sequences seqid2seq = {} for row in read_chunks(table, chunk=20000): - seqid2seq[row["seqid"]] = row['dna'].decode() + seqid2seq[row["seqid"]] = row["dna"].decode() return seqid2seq -def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: Path, add: str = '', - disable_bar: bool = False): +def get_non_redundant_gene_sequences_from_file( + pangenome_filename: str, output: Path, add: str = "", disable_bar: bool = False +): """ Writes the non-redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. @@ -257,8 +296,10 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: """ - logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename}" - f" to {output.absolute()}") + logging.getLogger("PPanGGOLiN").info( + f"Extracting and writing non redundant CDS sequences from {pangenome_filename}" + f" to {output.absolute()}" + ) with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: @@ -272,14 +313,25 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: table = h5f.root.annotations.sequences with open(output, "w") as file_obj: - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="gene", + disable=disable_bar, + ): cds_name = seqid2cds_name[row["seqid"]] - file_obj.write(f'>{add}{cds_name}\n') + file_obj.write(f">{add}{cds_name}\n") file_obj.write(f'{row["dna"].decode()}\n') -def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: Optional[Iterator] = None, - add: str = '', compress: bool = False, disable_bar: bool = False): +def write_gene_sequences_from_pangenome_file( + pangenome_filename: str, + output: Path, + list_cds: Optional[Iterator] = None, + add: str = "", + compress: bool = False, + disable_bar: bool = False, +): """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. @@ -291,21 +343,30 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa :param compress: Compress the output file :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} " - "file to a fasta file...") + logging.getLogger("PPanGGOLiN").info( + f"Extracting and writing CDS sequences from a {pangenome_filename} " + "file to a fasta file..." + ) with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: table = h5f.root.annotations.geneSequences list_cds = set(list_cds) if list_cds is not None else None seqid2seq = read_sequences(h5f) with write_compressed_or_not(output, compress) as file_obj: - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="gene", + disable=disable_bar, + ): # Read the table chunk per chunk otherwise RAM dies on big pangenomes name_cds = row["gene"].decode() if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): - file_obj.write('>' + add + name_cds + "\n") + file_obj.write(">" + add + name_cds + "\n") file_obj.write(seqid2seq[row["seqid"]] + "\n") - logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to " - f"{output.absolute()}{'.gz' if compress else ''}") + logging.getLogger("PPanGGOLiN").debug( + "Gene sequences from pangenome file was written to " + f"{output.absolute()}{'.gz' if compress else ''}" + ) def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> Set[bytes]: @@ -328,57 +389,64 @@ def get_families_from_genes(h5f: tables.File, genes: Set[bytes]) -> Set[bytes]: :param genes: A set of gene names (as bytes) for which to retrieve the associated families. :return: A set of gene family names (as bytes) associated with the specified genes. """ - + families = set() for row in read_chunks(h5f.root.geneFamilies, chunk=20000): - if row['gene'] in genes: + if row["gene"] in genes: families.add(row["geneFam"]) return families -def read_module_families_from_pangenome_file(h5f: tables.File, module_name: str) -> Set[bytes]: + +def read_module_families_from_pangenome_file( + h5f: tables.File, module_name: str +) -> Set[bytes]: """ Retrieves gene families associated with a specified module from the pangenome file. :param h5f: The open HDF5 pangenome file containing module data. - :param module_name: The name of the module (as a string). The module ID is extracted from + :param module_name: The name of the module (as a string). The module ID is extracted from the name by removing the "module_" prefix. :return: A set of gene family names (as bytes) associated with the specified module. """ - + family_to_write = set() - module_id = int(module_name[len("module_"):]) + module_id = int(module_name[len("module_") :]) module_table = h5f.root.modules for row in read_chunks(module_table, chunk=20000): - if row['module'] == module_id: - family_to_write.add(row['geneFam']) + if row["module"] == module_id: + family_to_write.add(row["geneFam"]) return family_to_write + def get_families_matching_partition(h5f: tables.File, partition: str) -> Set[bytes]: """ Retrieves gene families that match the specified partition. :param h5f: The open HDF5 pangenome file containing gene family information. - :param partition: The partition name (as a string). If "all", all gene families are included. + :param partition: The partition name (as a string). If "all", all gene families are included. Otherwise, it filters by the first letter of the partition. :return: A set of gene family names (as bytes) that match the partition criteria. """ family_to_write = set() - + gene_fam_info_table = h5f.root.geneFamiliesInfo parition_first_letter = partition[0].upper() for row in read_chunks(gene_fam_info_table, chunk=20000): - - if partition == "all" or row['partition'].decode().startswith(parition_first_letter): - family_to_write.add(row['name']) + + if partition == "all" or row["partition"].decode().startswith( + parition_first_letter + ): + family_to_write.add(row["name"]) return family_to_write + def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[bytes]: """ Retrieves a set of genes that belong to the specified families. @@ -390,24 +458,30 @@ def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[byte :param families: A list of gene families (as bytes) to filter genes by. :return: A set of genes (as bytes) that belong to the specified families. """ - + matching_genes = set() - + gene_fam_table = h5f.root.geneFamilies for row in read_chunks(gene_fam_table, chunk=20000): if row["geneFam"] in families: - matching_genes.add(row['gene']) + matching_genes.add(row["gene"]) return matching_genes -def get_seqid_to_genes(h5f: tables.File, genes:Set[bytes], get_all_genes:bool = False, disable_bar:bool = False) -> Dict[int, List[str]]: + +def get_seqid_to_genes( + h5f: tables.File, + genes: Set[bytes], + get_all_genes: bool = False, + disable_bar: bool = False, +) -> Dict[int, List[str]]: """ Creates a mapping of sequence IDs to gene names. :param h5f: The open HDF5 pangenome file containing gene sequence data. :param genes: A list of gene names to include in the mapping (if `get_all_genes` is False). - :param get_all_genes: Boolean flag to indicate if all genes should be included in the mapping. + :param get_all_genes: Boolean flag to indicate if all genes should be included in the mapping. If set to True, all genes will be added regardless of the `genes` parameter. :param disable_bar: Boolean flag to disable the progress bar if set to True. :return: A dictionary mapping sequence IDs (integers) to lists of gene names (strings). @@ -416,21 +490,34 @@ def get_seqid_to_genes(h5f: tables.File, genes:Set[bytes], get_all_genes:bool = seq_id_to_genes = defaultdict(list) gene_seq_table = h5f.root.annotations.geneSequences match_count = 0 - for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="gene", disable=disable_bar): - - if get_all_genes or row['gene'] in genes: - seq_id_to_genes[row['seqid']].append(row['gene'].decode()) + for row in tqdm( + read_chunks(gene_seq_table, chunk=20000), + total=gene_seq_table.nrows, + unit="gene", + disable=disable_bar, + ): + + if get_all_genes or row["gene"] in genes: + seq_id_to_genes[row["seqid"]].append(row["gene"].decode()) match_count += 1 - assert get_all_genes or match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}." + assert get_all_genes or match_count == len( + genes + ), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}." return seq_id_to_genes -def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compress: bool, seq_id_to_genes: Dict[int, List[str]], disable_bar:bool): +def write_genes_seq_from_pangenome_file( + h5f: tables.File, + outpath: Path, + compress: bool, + seq_id_to_genes: Dict[int, List[str]], + disable_bar: bool, +): """ - Writes gene sequences from the pangenome file to an output file. - + Writes gene sequences from the pangenome file to an output file. + Only sequences whose IDs match the ones in `seq_id_to_genes` will be written. :param h5f: The open HDF5 pangenome file containing sequence data. @@ -439,12 +526,14 @@ def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compres :param seq_id_to_genes: A dictionary mapping sequence IDs to lists of gene names. :param disable_bar: Boolean flag to disable the progress bar if set to True. """ - + with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: seq_table = h5f.root.annotations.sequences - with tqdm(total=len(seq_id_to_genes), unit="sequence", disable=disable_bar) as pbar: + with tqdm( + total=len(seq_id_to_genes), unit="sequence", disable=disable_bar + ) as pbar: for row in read_chunks(table=seq_table, chunk=20000): if row["seqid"] in seq_id_to_genes: @@ -452,7 +541,6 @@ def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compres for seq_name in seq_id_to_genes[row["seqid"]]: file_obj.write(f">{seq_name}\n") file_obj.write(row["dna"].decode() + "\n") - pbar.update(1) @@ -464,10 +552,16 @@ def get_gene_to_genome(h5f: tables.File) -> Dict[bytes, bytes]: :param h5f: The open HDF5 pangenome file containing contig and gene annotations. :return: A dictionary mapping gene IDs to genome names. """ - - contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } - gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) } + contig_id_to_genome = { + row["ID"]: row["genome"] + for row in read_chunks(h5f.root.annotations.contigs, chunk=20000) + } + + gene_to_genome = { + row["ID"]: contig_id_to_genome[row["contig"]] + for row in read_chunks(h5f.root.annotations.genes, chunk=20000) + } return gene_to_genome @@ -479,19 +573,28 @@ def get_family_to_genome_count(h5f: tables.File) -> Dict[bytes, int]: :param h5f: The open HDF5 pangenome file containing contig, gene, and gene family data. :return: A dictionary mapping gene family names (as bytes) to the count of unique genomes. """ - - contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } - gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) } + contig_id_to_genome = { + row["ID"]: row["genome"] + for row in read_chunks(h5f.root.annotations.contigs, chunk=20000) + } + + gene_to_genome = { + row["ID"]: contig_id_to_genome[row["contig"]] + for row in read_chunks(h5f.root.annotations.genes, chunk=20000) + } family_to_genomes = defaultdict(set) - for row in read_chunks( h5f.root.geneFamilies, chunk=20000): - family_to_genomes[row['geneFam']].add(gene_to_genome[row["gene"]]) + for row in read_chunks(h5f.root.geneFamilies, chunk=20000): + family_to_genomes[row["geneFam"]].add(gene_to_genome[row["gene"]]) - family_to_genome_count = {fam: len(genomes) for fam, genomes in family_to_genomes.items()} + family_to_genome_count = { + fam: len(genomes) for fam, genomes in family_to_genomes.items() + } return family_to_genome_count + def get_soft_core_families(h5f: tables.File, soft_core: float) -> Set[bytes]: """ Identifies gene families that are present in at least a specified proportion of genomes. @@ -506,11 +609,22 @@ def get_soft_core_families(h5f: tables.File, soft_core: float) -> Set[bytes]: genome_count = pangenome_info["Content"]["Genomes"] genome_count_threshold = genome_count * soft_core - soft_core_families = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold} + soft_core_families = { + family + for family, fam_genome_count in family_to_genome_count.items() + if fam_genome_count >= genome_count_threshold + } return soft_core_families -def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float = 0.95, - compress: bool = False, disable_bar=False): + +def write_fasta_gene_fam_from_pangenome_file( + pangenome_filename: str, + output: Path, + family_filter: str, + soft_core: float = 0.95, + compress: bool = False, + disable_bar=False, +): """ Write representative nucleotide sequences of gene families @@ -526,13 +640,14 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - if family_filter in ["all", 'persistent', 'shell', 'cloud']: + if family_filter in ["all", "persistent", "shell", "cloud"]: family_to_write = get_families_matching_partition(h5f, family_filter) - elif family_filter.startswith("module_"): - family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) - + family_to_write = read_module_families_from_pangenome_file( + h5f, module_name=family_filter + ) + elif family_filter == "rgp": rgp_genes = read_rgp_genes_from_pangenome_file(h5f) family_to_write = get_families_from_genes(h5f, rgp_genes) @@ -540,23 +655,35 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa elif family_filter in ["softcore", "core"]: if family_filter == "core": soft_core = 1.0 - + family_to_write = get_soft_core_families(h5f, soft_core) if len(family_to_write) == 0: - logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") + logging.getLogger("PPanGGOLiN").warning( + f"No families matching filter {family_filter}." + ) return seq_id_to_genes = get_seqid_to_genes(h5f, set(family_to_write)) - write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar) + write_genes_seq_from_pangenome_file( + h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar + ) - logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " - f"of the gene families : '{outpath}{'.gz' if compress else ''}") + logging.getLogger("PPanGGOLiN").info( + "Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}" + ) -def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float=0.95, - compress: bool = False, disable_bar=False): +def write_fasta_prot_fam_from_pangenome_file( + pangenome_filename: str, + output: Path, + family_filter: str, + soft_core: float = 0.95, + compress: bool = False, + disable_bar=False, +): """ Write representative amino acid sequences of gene families. @@ -573,44 +700,63 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa partition_filter = False family_to_write = [] - with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f, write_compressed_or_not(outpath, compress) as fasta: + with tables.open_file( + pangenome_filename, "r", driver_core_backing_store=0 + ) as h5f, write_compressed_or_not(outpath, compress) as fasta: - if family_filter in ["all", 'persistent', 'shell', 'cloud']: + if family_filter in ["all", "persistent", "shell", "cloud"]: partition_filter = True parition_first_letter = family_filter[0].upper() elif family_filter == "rgp": rgp_genes = read_rgp_genes_from_pangenome_file(h5f) family_to_write = get_families_from_genes(h5f, rgp_genes) - + elif family_filter.startswith("module_"): - family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) - + family_to_write = read_module_families_from_pangenome_file( + h5f, module_name=family_filter + ) + elif family_filter in ["softcore", "core"]: - + soft_core_to_apply = 1.0 if family_filter == "core" else soft_core family_to_write = get_soft_core_families(h5f, soft_core=soft_core_to_apply) gene_fam_info_table = h5f.root.geneFamiliesInfo + for row in tqdm( + read_chunks(gene_fam_info_table, chunk=20000), + total=gene_fam_info_table.nrows, + unit="family", + disable=disable_bar, + ): - for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): - - partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter)) - family_match = row['name'] in family_to_write + partition_match = partition_filter and ( + family_filter == "all" + or row["partition"].decode().startswith(parition_first_letter) + ) + family_match = row["name"] in family_to_write if partition_match or family_match: - + fasta.write(f">{row['name'].decode()}\n") - fasta.write(row['protein'].decode() + "\n") + fasta.write(row["protein"].decode() + "\n") - logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:" - f"'{outpath}{'.gz' if compress else ''}'") - + logging.getLogger("PPanGGOLiN").info( + f"Done writing the representative amino acid sequences of the gene families:" + f"'{outpath}{'.gz' if compress else ''}'" + ) -def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str, soft_core:float=0.95, - compress: bool = False, disable_bar=False): + +def write_genes_from_pangenome_file( + pangenome_filename: str, + output: Path, + gene_filter: str, + soft_core: float = 0.95, + compress: bool = False, + disable_bar=False, +): """ Write representative nucleotide sequences of gene families @@ -625,19 +771,26 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ outpath = output / f"{gene_filter}_genes.fna" get_all_genes = False - with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - if gene_filter in ['persistent', 'shell', 'cloud', "softcore", "core"] or gene_filter.startswith("module_"): + if gene_filter in [ + "persistent", + "shell", + "cloud", + "softcore", + "core", + ] or gene_filter.startswith("module_"): if gene_filter.startswith("module_"): - families = read_module_families_from_pangenome_file(h5f, module_name=gene_filter) + families = read_module_families_from_pangenome_file( + h5f, module_name=gene_filter + ) elif gene_filter in ["softcore", "core"]: - + soft_core_to_apply = 1.0 if gene_filter == "core" else soft_core families = get_soft_core_families(h5f, soft_core=soft_core_to_apply) - + else: families = get_families_matching_partition(h5f, gene_filter) @@ -650,12 +803,18 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ genes_to_write = set() get_all_genes = True - seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar) + seq_id_to_genes = get_seqid_to_genes( + h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar + ) - write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar) - - logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " - f"of the gene families : '{outpath}{'.gz' if compress else ''}") + write_genes_seq_from_pangenome_file( + h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar + ) + + logging.getLogger("PPanGGOLiN").info( + "Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}" + ) def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): @@ -668,18 +827,29 @@ def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False """ table = h5f.root.edges - if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"] or \ - pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: - raise Exception("It's not possible to read the graph " - "if the annotations and the gene families have not been loaded.") - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="contig adjacency", disable=disable_bar): + if pangenome.status["genomesAnnotated"] not in [ + "Computed", + "Loaded", + ] or pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: + raise Exception( + "It's not possible to read the graph " + "if the annotations and the gene families have not been loaded." + ) + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="contig adjacency", + disable=disable_bar, + ): source = pangenome.get_gene(row["geneSource"].decode()) target = pangenome.get_gene(row["geneTarget"].decode()) pangenome.add_edge(source, target) pangenome.status["neighborsGraph"] = "Loaded" -def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def read_gene_families( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Read gene families in pangenome hdf5 file to add in pangenome object @@ -689,13 +859,24 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool """ table = h5f.root.geneFamilies - link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): + link = ( + True + if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] + else False + ) + + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="gene family", + disable=disable_bar, + ): try: fam = pangenome.get_gene_family(name=row["geneFam"].decode()) except KeyError: - fam = GeneFamily(family_id=pangenome.max_fam_id, name=row["geneFam"].decode()) + fam = GeneFamily( + family_id=pangenome.max_fam_id, name=row["geneFam"].decode() + ) pangenome.add_gene_family(fam) if link: # linking if we have loaded the annotations gene_obj = pangenome.get_gene(row["gene"].decode()) @@ -705,7 +886,9 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool pangenome.status["genesClustered"] = "Loaded" -def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def read_gene_families_info( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Read information about gene families in pangenome hdf5 file to add in pangenome object @@ -715,7 +898,12 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: """ table = h5f.root.geneFamiliesInfo - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="gene family", + disable=disable_bar, + ): fam = pangenome.get_gene_family(row["name"].decode()) fam.partition = row["partition"].decode() fam.add_sequence(row["protein"].decode()) @@ -726,7 +914,9 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: pangenome.status["geneFamilySequences"] = "Loaded" -def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def read_gene_sequences( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Read gene sequences in pangenome hdf5 file to add in pangenome object @@ -735,14 +925,21 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo :param disable_bar: Disable the progress bar """ if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: - raise Exception("It's not possible to read the pangenome gene dna sequences " - "if the annotations have not been loaded.") + raise Exception( + "It's not possible to read the pangenome gene dna sequences " + "if the annotations have not been loaded." + ) table = h5f.root.annotations.geneSequences seqid2seq = read_sequences(h5f) - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - gene = pangenome.get_gene(row['gene'].decode()) - gene.add_sequence(seqid2seq[row['seqid']]) + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="gene", + disable=disable_bar, + ): + gene = pangenome.get_gene(row["gene"].decode()) + gene.add_sequence(seqid2seq[row["seqid"]]) pangenome.status["geneSequences"] = "Loaded" @@ -754,13 +951,22 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): :param h5f: Pangenome HDF5 file with RGP computed :param disable_bar: Disable the progress bar """ - if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"] or \ - pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: - raise Exception("It's not possible to read the RGP " - "if the annotations and the gene families have not been loaded.") + if pangenome.status["genomesAnnotated"] not in [ + "Computed", + "Loaded", + ] or pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: + raise Exception( + "It's not possible to read the RGP " + "if the annotations and the gene families have not been loaded." + ) table = h5f.root.RGP - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="region", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="region", + disable=disable_bar, + ): try: region = pangenome.get_region(row["RGP"].decode()) except KeyError: @@ -807,23 +1013,34 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal :param disable_bar: Disable the progress bar """ if pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: - raise Exception("It's not possible to read the modules if the gene families have not been loaded.") + raise Exception( + "It's not possible to read the modules if the gene families have not been loaded." + ) table = h5f.root.modules modules = {} # id2mod - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="module", disable=disable_bar): - curr_module = modules.get(int(row['module'])) + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="module", + disable=disable_bar, + ): + curr_module = modules.get(int(row["module"])) if curr_module is None: - curr_module = Module(int(row['module'])) + curr_module = Module(int(row["module"])) modules[row["module"]] = curr_module - family = pangenome.get_gene_family(row['geneFam'].decode()) + family = pangenome.get_gene_family(row["geneFam"].decode()) curr_module.add(family) for module in modules.values(): pangenome.add_module(module) pangenome.status["modules"] = "Loaded" -def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, - disable_bar: bool = False): +def read_organisms( + pangenome: Pangenome, + table: tables.Table, + chunk_size: int = 20000, + disable_bar: bool = False, +): """Read organism table in pangenome file to add them to the pangenome object :param pangenome: Pangenome object @@ -831,13 +1048,22 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = :param chunk_size: Size of the chunk reading :param disable_bar: Disable progress bar """ - for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=chunk_size), + total=table.nrows, + unit="genome", + disable=disable_bar, + ): organism = Organism(row["name"].decode()) pangenome.add_organism(organism) -def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, - disable_bar: bool = False): +def read_contigs( + pangenome: Pangenome, + table: tables.Table, + chunk_size: int = 20000, + disable_bar: bool = False, +): """Read contig table in pangenome file to add them to the pangenome object :param pangenome: Pangenome object @@ -845,8 +1071,17 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20 :param chunk_size: Size of the chunk reading :param disable_bar: Disable progress bar """ - for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): - contig = Contig(identifier=int(row["ID"]), name=row["name"].decode(), is_circular=row["is_circular"]) + for row in tqdm( + read_chunks(table, chunk=chunk_size), + total=table.nrows, + unit="contig", + disable=disable_bar, + ): + contig = Contig( + identifier=int(row["ID"]), + name=row["name"].decode(), + is_circular=row["is_circular"], + ) contig.length = int(row["length"]) try: organism = pangenome.get_organism(row["genome"].decode()) @@ -856,8 +1091,14 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20 organism.add(contig) -def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): +def read_genes( + pangenome: Pangenome, + table: tables.Table, + genedata_dict: Dict[int, Genedata], + link: bool = True, + chunk_size: int = 20000, + disable_bar: bool = False, +): """Read genes in pangenome file to add them to the pangenome object :param pangenome: Pangenome object @@ -867,17 +1108,30 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in :param chunk_size: Size of the chunk reading :param disable_bar: Disable progress bar """ - for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=chunk_size), + total=table.nrows, + unit="gene", + disable=disable_bar, + ): gene = Gene(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] try: local = row["local"].decode() except ValueError: local = "" - gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, - gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, - genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, - coordinates=genedata.coordinates) + gene.fill_annotations( + start=genedata.start, + stop=genedata.stop, + strand=genedata.strand, + gene_type=genedata.gene_type, + name=genedata.name, + position=genedata.position, + genetic_code=genedata.genetic_code, + product=genedata.product, + local_identifier=local, + coordinates=genedata.coordinates, + ) gene.is_fragment = row["is_fragment"] if link: contig = pangenome.get_contig(identifier=int(row["contig"])) @@ -885,8 +1139,14 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in contig.add(gene) -def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): +def read_rnas( + pangenome: Pangenome, + table: tables.Table, + genedata_dict: Dict[int, Genedata], + link: bool = True, + chunk_size: int = 20000, + disable_bar: bool = False, +): """Read RNAs in pangenome file to add them to the pangenome object :param pangenome: Pangenome object @@ -896,28 +1156,49 @@ def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int :param chunk_size: Size of the chunk reading :param disable_bar: Disable progress bar """ - for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=chunk_size), + total=table.nrows, + unit="gene", + disable=disable_bar, + ): rna = RNA(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] if genedata.start > genedata.stop: - logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) should not be greater than stop ({genedata.stop}). This gene is ignored.") + logging.warning( + f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) should not be greater than stop ({genedata.stop}). This gene is ignored." + ) continue if genedata.start < 1 or genedata.stop < 1: - logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) and stop ({genedata.stop}) should be greater than 0. This gene is ignored.") + logging.warning( + f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) and stop ({genedata.stop}) should be greater than 0. This gene is ignored." + ) continue - rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, - gene_type=genedata.gene_type, name=genedata.name, - product=genedata.product) + rna.fill_annotations( + start=genedata.start, + stop=genedata.stop, + strand=genedata.strand, + gene_type=genedata.gene_type, + name=genedata.name, + product=genedata.product, + ) if link: contig = pangenome.get_contig(int(row["contig"])) rna.fill_parents(contig.organism, contig) contig.add_rna(rna) -def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True, - load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, - disable_bar: bool = False): +def read_annotation( + pangenome: Pangenome, + h5f: tables.File, + load_organisms: bool = True, + load_contigs: bool = True, + load_genes: bool = True, + load_rnas: bool = True, + chunk_size: int = 20000, + disable_bar: bool = False, +): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -933,18 +1214,40 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool annotations = h5f.root.annotations genedata_dict = None if load_organisms: - read_organisms(pangenome, annotations.genomes, chunk_size=chunk_size, disable_bar=disable_bar) + read_organisms( + pangenome, + annotations.genomes, + chunk_size=chunk_size, + disable_bar=disable_bar, + ) if load_contigs: - read_contigs(pangenome, annotations.contigs, chunk_size=chunk_size, disable_bar=disable_bar) + read_contigs( + pangenome, + annotations.contigs, + chunk_size=chunk_size, + disable_bar=disable_bar, + ) if load_genes: genedata_dict = read_genedata(h5f) - read_genes(pangenome, annotations.genes, genedata_dict, - all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar) + read_genes( + pangenome, + annotations.genes, + genedata_dict, + all([load_organisms, load_contigs]), + chunk_size=chunk_size, + disable_bar=disable_bar, + ) if load_rnas: - read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict, - all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar) + read_rnas( + pangenome, + annotations.RNAs, + read_genedata(h5f) if genedata_dict is None else genedata_dict, + all([load_organisms, load_contigs]), + chunk_size=chunk_size, + disable_bar=disable_bar, + ) pangenome.status["genomesAnnotated"] = "Loaded" @@ -956,60 +1259,70 @@ def create_info_dict(info_group: tables.group.Group): """ attributes = info_group._v_attrs._f_list() - info_dict = {"Genes": int(info_group._v_attrs['numberOfGenes'])} + info_dict = {"Genes": int(info_group._v_attrs["numberOfGenes"])} if "numberOfGenomes" in attributes: - info_dict["Genomes"] = int(info_group._v_attrs['numberOfGenomes']) + info_dict["Genomes"] = int(info_group._v_attrs["numberOfGenomes"]) if "numberOfClusters" in attributes: - info_dict["Families"] = int(info_group._v_attrs['numberOfClusters']) + info_dict["Families"] = int(info_group._v_attrs["numberOfClusters"]) if "numberOfEdges" in attributes: - info_dict["Edges"] = int(info_group._v_attrs['numberOfEdges']) + info_dict["Edges"] = int(info_group._v_attrs["numberOfEdges"]) - if 'numberOfCloud' in attributes: # then all the others are there + if "numberOfCloud" in attributes: # then all the others are there - persistent_stat = {"Family_count": int(info_group._v_attrs['numberOfPersistent'])} - persistent_stat.update(info_group._v_attrs['persistentStats']) + persistent_stat = { + "Family_count": int(info_group._v_attrs["numberOfPersistent"]) + } + persistent_stat.update(info_group._v_attrs["persistentStats"]) info_dict["Persistent"] = persistent_stat - shell_stat = {"Family_count": int(info_group._v_attrs['numberOfShell'])} - shell_stat.update(info_group._v_attrs['shellStats']) + shell_stat = {"Family_count": int(info_group._v_attrs["numberOfShell"])} + shell_stat.update(info_group._v_attrs["shellStats"]) info_dict["Shell"] = shell_stat - cloud_stat = {"Family_count": int(info_group._v_attrs['numberOfCloud'])} - cloud_stat.update(info_group._v_attrs['cloudStats']) + cloud_stat = {"Family_count": int(info_group._v_attrs["numberOfCloud"])} + cloud_stat.update(info_group._v_attrs["cloudStats"]) info_dict["Cloud"] = cloud_stat - info_dict["Number_of_partitions"] = int(info_group._v_attrs['numberOfPartitions']) + info_dict["Number_of_partitions"] = int( + info_group._v_attrs["numberOfPartitions"] + ) - if info_group._v_attrs['numberOfPartitions'] != 3: - subpartition_stat = {f"Shell_{key}": int(val) for key, val in - info_group._v_attrs['numberOfSubpartitions'].items()} + if info_group._v_attrs["numberOfPartitions"] != 3: + subpartition_stat = { + f"Shell_{key}": int(val) + for key, val in info_group._v_attrs["numberOfSubpartitions"].items() + } info_dict.update(subpartition_stat) - if 'genomes_fluidity' in attributes: - info_dict["Genomes_fluidity"] = {key: round(val, 3) for key, val in - info_group._v_attrs['genomes_fluidity'].items()} + if "genomes_fluidity" in attributes: + info_dict["Genomes_fluidity"] = { + key: round(val, 3) + for key, val in info_group._v_attrs["genomes_fluidity"].items() + } - if 'family_fluidity' in attributes: - info_dict["Family_fluidity"] = info_group._v_attrs['family_fluidity'] + if "family_fluidity" in attributes: + info_dict["Family_fluidity"] = info_group._v_attrs["family_fluidity"] - if 'numberOfRGP' in attributes: - info_dict["RGP"] = int(info_group._v_attrs['numberOfRGP']) + if "numberOfRGP" in attributes: + info_dict["RGP"] = int(info_group._v_attrs["numberOfRGP"]) - if 'numberOfSpots' in attributes: - info_dict["Spots"] = int(info_group._v_attrs['numberOfSpots']) + if "numberOfSpots" in attributes: + info_dict["Spots"] = int(info_group._v_attrs["numberOfSpots"]) - if 'numberOfModules' in attributes: + if "numberOfModules" in attributes: info_dict["Modules"] = { - 'Number_of_modules': int(info_group._v_attrs['numberOfModules']), - 'Families_in_Modules': int(info_group._v_attrs['numberOfFamiliesInModules']), - 'Partition_composition': { - "Persistent": info_group._v_attrs['PersistentSpecInModules']['percent'], - "Shell": info_group._v_attrs['ShellSpecInModules']['percent'], - "Cloud": info_group._v_attrs['CloudSpecInModules']['percent'] - } + "Number_of_modules": int(info_group._v_attrs["numberOfModules"]), + "Families_in_Modules": int( + info_group._v_attrs["numberOfFamiliesInModules"] + ), + "Partition_composition": { + "Persistent": info_group._v_attrs["PersistentSpecInModules"]["percent"], + "Shell": info_group._v_attrs["ShellSpecInModules"]["percent"], + "Cloud": info_group._v_attrs["CloudSpecInModules"]["percent"], + }, } return info_dict @@ -1026,8 +1339,13 @@ def read_info(h5f): return {"Content": content} -def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, - sources: Set[str] = None, disable_bar: bool = False): +def read_metadata( + pangenome: Pangenome, + h5f: tables.File, + metatype: str, + sources: Set[str] = None, + disable_bar: bool = False, +): """Read metadata to add them to the pangenome object :param pangenome: Pangenome object @@ -1039,13 +1357,22 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, metadata_group = h5f.root.metadata._f_get_child(metatype) for source in sources: source_table = metadata_group._f_get_child(source) - for row in tqdm(read_chunks(source_table), total=source_table.nrows, unit='metadata', disable=disable_bar): - meta_dict = {'source': source} + for row in tqdm( + read_chunks(source_table), + total=source_table.nrows, + unit="metadata", + disable=disable_bar, + ): + meta_dict = {"source": source} try: meta_id = int(row["metadata_id"]) except KeyError: meta_id = None - identifier = row["ID"].decode("utf-8") if isinstance(row["ID"], bytes) else int(row["ID"]) + identifier = ( + row["ID"].decode("utf-8") + if isinstance(row["ID"], bytes) + else int(row["ID"]) + ) # else: # identifier = row["name"].decode() if metatype == "families": @@ -1063,12 +1390,25 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, elif metatype == "contigs": element = pangenome.get_contig(identifier) else: - expected_types = ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] + expected_types = [ + "families", + "genomes", + "contigs", + "genes", + "RGPs", + "spots", + "modules", + ] raise KeyError( - f'The metatype {metatype} is unexpected. Object associated with metadata are {expected_types}') + f"The metatype {metatype} is unexpected. Object associated with metadata are {expected_types}" + ) for field in row.dtype.names: if field not in ["ID", "name"]: - meta_dict[field] = row[field].decode() if isinstance(row[field], bytes) else row[field] + meta_dict[field] = ( + row[field].decode() + if isinstance(row[field], bytes) + else row[field] + ) element.add_metadata(metadata=Metadata(**meta_dict), metadata_id=meta_id) pangenome.status["metadata"][metatype] = "Loaded" @@ -1106,10 +1446,20 @@ def get_pangenome_parameters(h5f: tables.File) -> Dict[str, Dict[str, Any]]: return info_group._v_attrs["parameters"] -def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = False, graph: bool = False, - rgp: bool = False, spots: bool = False, gene_sequences: bool = False, modules: bool = False, - metadata: bool = False, metatypes: Set[str] = None, sources: Set[str] = None, - disable_bar: bool = False): +def read_pangenome( + pangenome, + annotation: bool = False, + gene_families: bool = False, + graph: bool = False, + rgp: bool = False, + spots: bool = False, + gene_sequences: bool = False, + modules: bool = False, + metadata: bool = False, + metatypes: Set[str] = None, + sources: Set[str] = None, + disable_bar: bool = False, +): """ Reads a previously written pangenome, with all of its parts, depending on what is asked, with regard to what is filled in the 'status' field of the hdf5 file. @@ -1128,25 +1478,35 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa :param disable_bar: Allow to disable the progress bar """ if pangenome.file is None: - raise FileNotFoundError("Your pangenome object has not been associated to any file.") + raise FileNotFoundError( + "Your pangenome object has not been associated to any file." + ) filename = pangenome.file h5f = tables.open_file(filename, "r") - if annotation: # I place annotation here, to link gene to gene families if organism are not loaded + if ( + annotation + ): # I place annotation here, to link gene to gene families if organism are not loaded if h5f.root.status._v_attrs.genomesAnnotated: logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled" + ) if gene_sequences: if h5f.root.status._v_attrs.geneSequences: - logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...") + logging.getLogger("PPanGGOLiN").info( + "Reading pangenome gene dna sequences..." + ) read_gene_sequences(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' does not have gene sequences, " - f"or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' does not have gene sequences, " + f"or has been improperly filled" + ) if gene_families: if h5f.root.status._v_attrs.genesClustered: @@ -1155,38 +1515,47 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa read_gene_families_info(pangenome, h5f, disable_bar=disable_bar) else: raise Exception( - f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") + f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled" + ) if graph: if h5f.root.status._v_attrs.NeighborsGraph: logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...") read_graph(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' does not have graph information, " - f"or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' does not have graph information, " + f"or has been improperly filled" + ) if rgp: if h5f.root.status._v_attrs.predictedRGP: logging.getLogger("PPanGGOLiN").info("Reading the RGP...") read_rgp(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " - f"or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' does not have RGP information, " + f"or has been improperly filled" + ) if spots: if h5f.root.status._v_attrs.spots: logging.getLogger("PPanGGOLiN").info("Reading the spots...") read_spots(pangenome, h5f, disable_bar=disable_bar) else: - raise AttributeError(f"The pangenome in file '{pangenome.file}' does not have spots information, " - f"or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' does not have spots information, " + f"or has been improperly filled" + ) if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") read_modules(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' does not have modules information, " - f"or has been improperly filled") + raise Exception( + f"The pangenome in file '{filename}' does not have modules information, " + f"or has been improperly filled" + ) if metadata: for metatype in metatypes: @@ -1198,83 +1567,133 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa metatype_sources = set(metasources._v_attrs[metatype]) & sources if metastatus._v_attrs[metatype] and len(metatype_sources) > 0: logging.getLogger("PPanGGOLiN").info( - f"Reading the {metatype} metadata from sources {metatype_sources}...") - read_metadata(pangenome, h5f, metatype, metatype_sources, disable_bar=disable_bar) + f"Reading the {metatype} metadata from sources {metatype_sources}..." + ) + read_metadata( + pangenome, + h5f, + metatype, + metatype_sources, + disable_bar=disable_bar, + ) else: - raise KeyError(f"The pangenome in file '{filename}' does not have metadata associated to {metatype}, ") + raise KeyError( + f"The pangenome in file '{filename}' does not have metadata associated to {metatype}, " + ) h5f.close() -def get_need_info(pangenome, need_annotations: bool = False, need_families: bool = False, need_graph: bool = False, - need_partitions: bool = False, need_rgp: bool = False, need_spots: bool = False, - need_gene_sequences: bool = False, need_modules: bool = False, need_metadata: bool = False, - metatypes: Set[str] = None, sources: Set[str] = None): - need_info = {"annotation": False, - "gene_families": False, - "graph": False, - "rgp": False, - "spots": False, - "gene_sequences": False, - "modules": False, - "metadata": False, - "metatypes": metatypes, - "sources": sources} +def get_need_info( + pangenome, + need_annotations: bool = False, + need_families: bool = False, + need_graph: bool = False, + need_partitions: bool = False, + need_rgp: bool = False, + need_spots: bool = False, + need_gene_sequences: bool = False, + need_modules: bool = False, + need_metadata: bool = False, + metatypes: Set[str] = None, + sources: Set[str] = None, +): + need_info = { + "annotation": False, + "gene_families": False, + "graph": False, + "rgp": False, + "spots": False, + "gene_sequences": False, + "modules": False, + "metadata": False, + "metatypes": metatypes, + "sources": sources, + } # TODO Automate call if one need another if need_annotations: if pangenome.status["genomesAnnotated"] == "inFile": need_info["annotation"] = True elif pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome has no genes. See the 'annotate' subcommand.") + raise Exception( + "Your pangenome has no genes. See the 'annotate' subcommand." + ) if need_families: if pangenome.status["genesClustered"] == "inFile": need_info["gene_families"] = True elif pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome has no gene families. See the 'cluster' subcommand.") + raise Exception( + "Your pangenome has no gene families. See the 'cluster' subcommand." + ) if need_graph: if pangenome.status["neighborsGraph"] == "inFile": need_info["graph"] = True elif pangenome.status["neighborsGraph"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome does not have a graph (no edges). See the 'graph' subcommand.") - if need_partitions and pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: - raise Exception("Your pangenome has not been partitioned. See the 'partition' subcommand") + raise Exception( + "Your pangenome does not have a graph (no edges). See the 'graph' subcommand." + ) + if need_partitions and pangenome.status["partitioned"] not in [ + "Computed", + "Loaded", + "inFile", + ]: + raise Exception( + "Your pangenome has not been partitioned. See the 'partition' subcommand" + ) if need_rgp: if pangenome.status["predictedRGP"] == "inFile": need_info["rgp"] = True elif pangenome.status["predictedRGP"] not in ["Computed", "Loaded"]: raise Exception( - "Your pangenome regions of genomic plasticity have not been predicted. See the 'rgp' subcommand") + "Your pangenome regions of genomic plasticity have not been predicted. See the 'rgp' subcommand" + ) if need_spots: if pangenome.status["spots"] == "inFile": need_info["spots"] = True elif pangenome.status["spots"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome spots of insertion have not been predicted. See the 'spot' subcommand") + raise Exception( + "Your pangenome spots of insertion have not been predicted. See the 'spot' subcommand" + ) if need_gene_sequences: if pangenome.status["geneSequences"] == "inFile": need_info["gene_sequences"] = True elif pangenome.status["geneSequences"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome does not include gene sequences. " - "This is possible only if you provided your own cluster file with the 'cluster' subcommand") + raise Exception( + "Your pangenome does not include gene sequences. " + "This is possible only if you provided your own cluster file with the 'cluster' subcommand" + ) if need_modules: if pangenome.status["modules"] == "inFile": need_info["modules"] = True elif pangenome.status["modules"] not in ["Computed", "Loaded"]: - raise Exception("Your pangenome modules have not been predicted. See the 'module' subcommand") + raise Exception( + "Your pangenome modules have not been predicted. See the 'module' subcommand" + ) metatypes_to_load = set() sources_to_load = set() if need_metadata: if metatypes is None: # load all metadata contained in the pangenome - metatypes = [metatype for metatype, status in pangenome.status["metadata"].items() if status == 'inFile'] + metatypes = [ + metatype + for metatype, status in pangenome.status["metadata"].items() + if status == "inFile" + ] else: # check that specified types have metadata associated for metatype in metatypes: - if pangenome.status["metadata"][metatype] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger("PPanGGOLiN").warning("The pangenome does not have any metadata associated " - f"with {metatype}. See the 'metadata' subcommand") + if pangenome.status["metadata"][metatype] not in [ + "Computed", + "Loaded", + "inFile", + ]: + logging.getLogger("PPanGGOLiN").warning( + "The pangenome does not have any metadata associated " + f"with {metatype}. See the 'metadata' subcommand" + ) if sources is None: # load all metadata sources for each metatype @@ -1283,12 +1702,16 @@ def get_need_info(pangenome, need_annotations: bool = False, need_families: bool else: # check that specified source exist for at least one metatype for source in set(sources): - if any(source in pangenome.status["metasources"][metatype] for metatype in metatypes): + if any( + source in pangenome.status["metasources"][metatype] + for metatype in metatypes + ): sources_to_load.add(source) else: logging.getLogger("PPanGGOLiN").warning( f"There is no metadata assigned to any element of the pangenome with " - f"source={source}. This source is ignored") + f"source={source}. This source is ignored" + ) # select only metatypes that have a requested source . for metatype in metatypes: @@ -1297,10 +1720,15 @@ def get_need_info(pangenome, need_annotations: bool = False, need_families: bool else: logging.getLogger("PPanGGOLiN").debug( f"There is no metadata assigned to {metatype} with specified sources:" - f" {', '.join(sources_to_load)} in the pangenome. This metatype is ignored.") + f" {', '.join(sources_to_load)} in the pangenome. This metatype is ignored." + ) if metatypes_to_load and sources_to_load: - logging.getLogger("PPanGGOLiN").debug(f"metadata types to load: {', '.join(metatypes_to_load)}") - logging.getLogger("PPanGGOLiN").debug(f"metadata sources to load: {', '.join(sources_to_load)}") + logging.getLogger("PPanGGOLiN").debug( + f"metadata types to load: {', '.join(metatypes_to_load)}" + ) + logging.getLogger("PPanGGOLiN").debug( + f"metadata sources to load: {', '.join(sources_to_load)}" + ) need_info["metadata"] = True need_info["metatypes"] = metatypes_to_load need_info["sources"] = sources_to_load @@ -1308,11 +1736,21 @@ def get_need_info(pangenome, need_annotations: bool = False, need_families: bool return need_info -def check_pangenome_info(pangenome, need_annotations: bool = False, need_families: bool = False, - need_graph: bool = False, need_partitions: bool = False, need_rgp: bool = False, - need_spots: bool = False, need_gene_sequences: bool = False, need_modules: bool = False, - need_metadata: bool = False, metatypes: Optional[Set[str]] = None, sources: Optional[Set[str]] = None, - disable_bar: bool = False): +def check_pangenome_info( + pangenome, + need_annotations: bool = False, + need_families: bool = False, + need_graph: bool = False, + need_partitions: bool = False, + need_rgp: bool = False, + need_spots: bool = False, + need_gene_sequences: bool = False, + need_modules: bool = False, + need_metadata: bool = False, + metatypes: Optional[Set[str]] = None, + sources: Optional[Set[str]] = None, + disable_bar: bool = False, +): """ Defines what needs to be read depending on what is needed, and automatically checks if the required elements have been computed with regard to the `pangenome.status` @@ -1331,9 +1769,20 @@ def check_pangenome_info(pangenome, need_annotations: bool = False, need_familie :param sources: sources of the metadata to get (None means all possible sources) :param disable_bar: Allow to disable the progress bar """ - need_info = get_need_info(pangenome, need_annotations, need_families, need_graph, need_partitions, - need_rgp, need_spots, need_gene_sequences, need_modules, need_metadata, - metatypes, sources) + need_info = get_need_info( + pangenome, + need_annotations, + need_families, + need_graph, + need_partitions, + need_rgp, + need_spots, + need_gene_sequences, + need_modules, + need_metadata, + metatypes, + sources, + ) if any([v for k, v in need_info.items() if k not in ["metatypes", "sources"]]): # if no flag is true, then nothing is needed. read_pangenome(pangenome, disable_bar=disable_bar, **need_info) diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index efdaa204..c98d805f 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -25,7 +25,13 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i :return: Maximum size of each annotation """ - max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = 1, 1, 1, 1, 1 + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = ( + 1, + 1, + 1, + 1, + 1, + ) for org in pangenome.organisms: if len(org.name) > max_org_len: max_org_len = len(org.name) @@ -41,7 +47,13 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i if len(rna.ID) > max_rna_id_len: max_rna_id_len = len(rna.ID) - return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id + return ( + max_org_len, + max_contig_len, + max_gene_id_len, + max_rna_id_len, + max_gene_local_id, + ) def organism_desc(org_len: int) -> Dict[str, tables.StringCol]: @@ -52,11 +64,16 @@ def organism_desc(org_len: int) -> Dict[str, tables.StringCol]: :return: Formatted table """ - return {'name': tables.StringCol(itemsize=org_len)} + return {"name": tables.StringCol(itemsize=org_len)} -def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - organism_desc: Dict[str, tables.StringCol], disable_bar=False): +def write_organisms( + pangenome: Pangenome, + h5f: tables.File, + annotation: tables.Group, + organism_desc: Dict[str, tables.StringCol], + disable_bar=False, +): """Write organisms information in the pangenome file :param pangenome: Annotated pangenome object @@ -65,17 +82,27 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G :param organism_desc: Organisms table description. :param disable_bar: Allow disabling progress bar """ - organism_table = h5f.create_table(annotation, "genomes", organism_desc, - expectedrows=pangenome.number_of_organisms) - logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") + organism_table = h5f.create_table( + annotation, "genomes", organism_desc, expectedrows=pangenome.number_of_organisms + ) + logging.getLogger("PPanGGOLiN").debug( + f"Writing {pangenome.number_of_organisms} genomes" + ) organism_row = organism_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for org in tqdm( + pangenome.organisms, + total=pangenome.number_of_organisms, + unit="genome", + disable=disable_bar, + ): organism_row["name"] = org.name organism_row.append() organism_table.flush() -def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: +def contig_desc( + contig_len: int, org_len: int +) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: """Table description to save contig-related information :param contig_len: Maximum size of contig name @@ -83,16 +110,22 @@ def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringC :return: Formatted table """ - return {'ID': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=contig_len), - "is_circular": tables.BoolCol(dflt=False), - 'length': tables.UInt32Col(), - "genome": tables.StringCol(itemsize=org_len)} + return { + "ID": tables.UInt32Col(), + "name": tables.StringCol(itemsize=contig_len), + "is_circular": tables.BoolCol(dflt=False), + "length": tables.UInt32Col(), + "genome": tables.StringCol(itemsize=org_len), + } -def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]], - disable_bar=False): +def write_contigs( + pangenome: Pangenome, + h5f: tables.File, + annotation: tables.Group, + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]], + disable_bar=False, +): """Write contigs information in the pangenome file :param pangenome: Annotated pangenome object :param h5f: Pangenome file @@ -100,10 +133,19 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro :param contig_desc: Contigs table description :param disable_bar: Allow disabling progress bar """ - contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) - logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") + contig_table = h5f.create_table( + annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs + ) + logging.getLogger("PPanGGOLiN").debug( + f"Writing {pangenome.number_of_contigs} contigs" + ) contig_row = contig_table.row - for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): + for contig in tqdm( + pangenome.contigs, + total=pangenome.number_of_contigs, + unit="contigs", + disable=disable_bar, + ): contig_row["ID"] = contig.ID contig_row["name"] = contig.name contig_row["is_circular"] = contig.is_circular @@ -113,7 +155,9 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_table.flush() -def gene_desc(id_len: int, max_local_id: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: +def gene_desc( + id_len: int, max_local_id: int +) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: """Table description to save gene-related information :param id_len: Maximum size of gene name @@ -121,16 +165,22 @@ def gene_desc(id_len: int, max_local_id: int) -> Dict[str, Union[tables.StringCo :return: Formatted table """ - return {'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col(), - 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False), - 'contig': tables.UInt32Col()} + return { + "ID": tables.StringCol(itemsize=id_len), + "genedata_id": tables.UInt32Col(), + "local": tables.StringCol(itemsize=max_local_id), + "is_fragment": tables.BoolCol(dflt=False), + "contig": tables.UInt32Col(), + } -def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], - disable_bar=False) -> Dict[Genedata, int]: +def write_genes( + pangenome: Pangenome, + h5f: tables.File, + annotation: tables.Group, + gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], + disable_bar=False, +) -> Dict[Genedata, int]: """Write genes information in the pangenome file :param pangenome: Annotated pangenome object @@ -143,10 +193,17 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou """ global genedata_counter genedata2gene = {} - gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) + gene_table = h5f.create_table( + annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes + ) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") gene_row = gene_table.row - for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes, unit="gene", disable=disable_bar): + for gene in tqdm( + pangenome.genes, + total=pangenome.number_of_genes, + unit="gene", + disable=disable_bar, + ): gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment gene_row["local"] = gene.local_identifier @@ -171,14 +228,20 @@ def rna_desc(id_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col] :return: Formatted table """ - return {'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col(), - 'contig': tables.UInt32Col()} + return { + "ID": tables.StringCol(itemsize=id_len), + "genedata_id": tables.UInt32Col(), + "contig": tables.UInt32Col(), + } -def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]], - disable_bar=False) -> Dict[Genedata, int]: +def write_rnas( + pangenome: Pangenome, + h5f: tables.File, + annotation: tables.Group, + rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]], + disable_bar=False, +) -> Dict[Genedata, int]: """Write RNAs information in the pangenome file :param pangenome: Annotated pangenome object @@ -191,10 +254,14 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group """ global genedata_counter genedata2rna = {} - rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) + rna_table = h5f.create_table( + annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes + ) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") rna_row = rna_table.row - for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): + for rna in tqdm( + pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar + ): rna_row["ID"] = rna.ID rna_row["contig"] = rna.contig.ID genedata = get_genedata(rna) @@ -209,7 +276,9 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group return genedata2rna -def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: +def genedata_desc( + type_len: int, name_len: int, product_len: int +) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Creates a table for gene-related data @@ -219,19 +288,22 @@ def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, U :return: Formatted table for gene metadata """ return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'strand': tables.StringCol(itemsize=1), - 'gene_type': tables.StringCol(itemsize=type_len), - 'position': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=name_len), - 'product': tables.StringCol(itemsize=product_len), - 'genetic_code': tables.UInt32Col(dflt=11), - 'has_joined_coordinates':tables.BoolCol(dflt=False), + "genedata_id": tables.UInt32Col(), + "start": tables.UInt32Col(), + "stop": tables.UInt32Col(), + "strand": tables.StringCol(itemsize=1), + "gene_type": tables.StringCol(itemsize=type_len), + "position": tables.UInt32Col(), + "name": tables.StringCol(itemsize=name_len), + "product": tables.StringCol(itemsize=product_len), + "genetic_code": tables.UInt32Col(dflt=11), + "has_joined_coordinates": tables.BoolCol(dflt=False), } -def gene_joined_coordinates_desc() -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: + +def gene_joined_coordinates_desc() -> ( + Dict[str, Union[tables.UIntCol, tables.StringCol]] +): """ Creates a table for gene-related data @@ -241,12 +313,13 @@ def gene_joined_coordinates_desc() -> Dict[str, Union[tables.UIntCol, tables.Str :return: Formatted table for gene metadata """ return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'coordinate_rank': tables.UInt32Col(), + "genedata_id": tables.UInt32Col(), + "start": tables.UInt32Col(), + "stop": tables.UInt32Col(), + "coordinate_rank": tables.UInt32Col(), } + def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: """ Get the maximum size of each gene data information to optimize disk space @@ -290,10 +363,22 @@ def get_genedata(feature: Union[Gene, RNA]) -> Genedata: if isinstance(feature, Gene): position = feature.position genetic_code = feature.genetic_code - return Genedata(feature.start, feature.stop, feature.strand, feature.type, position, feature.name, - feature.product, genetic_code, coordinates = feature.coordinates) - -def write_gene_joined_coordinates(h5f, annotation, genes_with_joined_coordinates_2_id, disable_bar): + return Genedata( + feature.start, + feature.stop, + feature.strand, + feature.type, + position, + feature.name, + feature.product, + genetic_code, + coordinates=feature.coordinates, + ) + + +def write_gene_joined_coordinates( + h5f, annotation, genes_with_joined_coordinates_2_id, disable_bar +): """Writing genedata information in pangenome file :param h5f: Pangenome file @@ -301,34 +386,48 @@ def write_gene_joined_coordinates(h5f, annotation, genes_with_joined_coordinates :param genedata2gene: Dictionary linking genedata to gene identifier. :param disable_bar: Allow disabling progress bar """ - number_of_gene_pieces = sum([len(gene.coordinates) for gene in genes_with_joined_coordinates_2_id]) + number_of_gene_pieces = sum( + [len(gene.coordinates) for gene in genes_with_joined_coordinates_2_id] + ) try: joined_coordinates_tables = annotation.joinedCoordinates except tables.exceptions.NoSuchNodeError: - joined_coordinates_tables = h5f.create_table(annotation, "joinedCoordinates", gene_joined_coordinates_desc(), - expectedrows=number_of_gene_pieces) - - - logging.getLogger("PPanGGOLiN").debug(f"Writing {number_of_gene_pieces} piece of genes from " - f"{len(genes_with_joined_coordinates_2_id)} genes that have joined coordinates ") + joined_coordinates_tables = h5f.create_table( + annotation, + "joinedCoordinates", + gene_joined_coordinates_desc(), + expectedrows=number_of_gene_pieces, + ) + + logging.getLogger("PPanGGOLiN").debug( + f"Writing {number_of_gene_pieces} piece of genes from " + f"{len(genes_with_joined_coordinates_2_id)} genes that have joined coordinates " + ) genedata_row = joined_coordinates_tables.row - for genedata, genedata_id in tqdm(genes_with_joined_coordinates_2_id.items(), unit="genedata", disable=disable_bar): + for genedata, genedata_id in tqdm( + genes_with_joined_coordinates_2_id.items(), unit="genedata", disable=disable_bar + ): for index, (start, stop) in enumerate(genedata.coordinates): genedata_row["genedata_id"] = genedata_id genedata_row["start"] = start genedata_row["stop"] = stop - genedata_row['coordinate_rank'] = index + genedata_row["coordinate_rank"] = index genedata_row.append() joined_coordinates_tables.flush() -def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - genedata2gene: Dict[Genedata, int], disable_bar=False): +def write_genedata( + pangenome: Pangenome, + h5f: tables.File, + annotation: tables.Group, + genedata2gene: Dict[Genedata, int], + disable_bar=False, +): """Writing genedata information in pangenome file :param pangenome: Pangenome object filled with annotation. @@ -340,13 +439,21 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G try: genedata_table = annotation.genedata except tables.exceptions.NoSuchNodeError: - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) - - logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " - "(can be lower than the number of genes)") + genedata_table = h5f.create_table( + annotation, + "genedata", + genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene), + ) + + logging.getLogger("PPanGGOLiN").debug( + f"Writing {len(genedata2gene)} gene-related data " + "(can be lower than the number of genes)" + ) genedata_row = genedata_table.row - for genedata, genedata_id in tqdm(genedata2gene.items(), unit="genedata", disable=disable_bar): + for genedata, genedata_id in tqdm( + genedata2gene.items(), unit="genedata", disable=disable_bar + ): genedata_row["genedata_id"] = genedata_id genedata_row["start"] = genedata.start genedata_row["stop"] = genedata.stop @@ -366,8 +473,15 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G genedata_table.flush() -def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, rec_contigs: bool = True, - rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): +def write_annotations( + pangenome: Pangenome, + h5f: tables.File, + rec_organisms: bool = True, + rec_contigs: bool = True, + rec_genes: bool = True, + rec_rnas: bool = True, + disable_bar: bool = False, +): """Function writing all the pangenome annotations :param pangenome: Annotated pangenome @@ -378,9 +492,13 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo :param rec_rnas: Allow writing RNAs in pangenomes :param disable_bar: Allow to disable progress bar """ - annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") + annotation = h5f.create_group( + "/", "annotations", "Annotations of the pangenome organisms" + ) - org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = get_max_len_annotations(pangenome) + org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = ( + get_max_len_annotations(pangenome) + ) # I add these boolean in case we would one day only load organism, contig or genes, without the other. @@ -400,10 +518,23 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) - genes_with_joined_coordinates_2_id = {gene : gene_id for gene, gene_id in genedata2gene.items() if gene.has_joined_coordinates} - genes_with_joined_coordinates_2_id.update({gene : gene_id for gene, gene_id in genedata2rna.items() if gene.has_joined_coordinates}) + genes_with_joined_coordinates_2_id = { + gene: gene_id + for gene, gene_id in genedata2gene.items() + if gene.has_joined_coordinates + } + genes_with_joined_coordinates_2_id.update( + { + gene: gene_id + for gene, gene_id in genedata2rna.items() + if gene.has_joined_coordinates + } + ) + + write_gene_joined_coordinates( + h5f, annotation, genes_with_joined_coordinates_2_id, disable_bar + ) - write_gene_joined_coordinates(h5f, annotation, genes_with_joined_coordinates_2_id, disable_bar) def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: """ @@ -421,7 +552,9 @@ def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: return max_gene_id_len, max_gene_type -def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: +def gene_sequences_desc( + gene_id_len: int, gene_type_len: int +) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Create table to save gene sequences @@ -433,7 +566,7 @@ def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union return { "gene": tables.StringCol(itemsize=gene_id_len), "seqid": tables.UInt32Col(), - "type": tables.StringCol(itemsize=gene_type_len) + "type": tables.StringCol(itemsize=gene_type_len), } @@ -450,33 +583,42 @@ def get_sequence_len(pangenome: Pangenome) -> int: return max_seq_len -def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: +def sequence_desc( + max_seq_len: int, +) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Table description to save sequences :param max_seq_len: Maximum size of gene type :return: Formatted table """ - return { - "seqid": tables.UInt32Col(), - "dna": tables.StringCol(itemsize=max_seq_len) - } + return {"seqid": tables.UInt32Col(), "dna": tables.StringCol(itemsize=max_seq_len)} -def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def write_gene_sequences( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Function writing all the pangenome gene sequences :param pangenome: Pangenome with gene sequences :param h5f: Pangenome HDF5 file without sequences :param disable_bar: Disable progress bar """ - gene_seq = h5f.create_table("/annotations", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=pangenome.number_of_genes) + gene_seq = h5f.create_table( + "/annotations", + "geneSequences", + gene_sequences_desc(*get_gene_sequences_len(pangenome)), + expectedrows=pangenome.number_of_genes, + ) # process sequences to save them only once seq2seqid = {} id_counter = 0 gene_row = gene_seq.row - for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", - disable=disable_bar): + for gene in tqdm( + sorted(pangenome.genes, key=lambda x: x.ID), + total=pangenome.number_of_genes, + unit="gene", + disable=disable_bar, + ): curr_seq_id = seq2seqid.get(gene.dna) if curr_seq_id is None: curr_seq_id = id_counter @@ -488,8 +630,12 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo gene_row.append() gene_seq.flush() - seq_table = h5f.create_table("/annotations", "sequences", sequence_desc(get_sequence_len(pangenome)), - expectedrows=len(seq2seqid)) + seq_table = h5f.create_table( + "/annotations", + "sequences", + sequence_desc(get_sequence_len(pangenome)), + expectedrows=len(seq2seqid), + ) seq_row = seq_table.row for seq, seqid in seq2seqid.items(): diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index d682a201..17ba76ab 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -15,14 +15,17 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.formats.writeAnnotations import write_annotations, write_gene_sequences -from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status +from ppanggolin.formats.writeMetadata import ( + write_metadata, + erase_metadata, + write_metadata_status, +) from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata - def getmean(arg: iter) -> float: - """ Compute the mean of arguments if exist 0 else + """Compute the mean of arguments if exist 0 else :param arg: list of values @@ -30,8 +33,9 @@ def getmean(arg: iter) -> float: """ return 0 if len(arg) == 0 else round(statistics.mean(arg), 2) + def getstdev(arg: iter) -> float: - """ Compute the standard deviation of arguments if exist 0 else + """Compute the standard deviation of arguments if exist 0 else :param arg: list of values @@ -39,8 +43,9 @@ def getstdev(arg: iter) -> float: """ return 0 if len(arg) <= 1 else round(statistics.stdev(arg), 2) + def getmax(arg: iter) -> float: - """ Get the maximum of arguments if exist 0 else + """Get the maximum of arguments if exist 0 else :param arg: list of values @@ -48,8 +53,9 @@ def getmax(arg: iter) -> float: """ return 0 if len(arg) == 0 else round(max(arg), 2) + def getmin(arg: iter) -> float: - """ Get the minimum of arguments if exist 0 else + """Get the minimum of arguments if exist 0 else :param arg: list of values @@ -57,7 +63,10 @@ def getmin(arg: iter) -> float: """ return 0 if len(arg) == 0 else round(min(arg), 2) -def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: + +def gene_fam_desc( + max_name_len: int, max_sequence_length: int, max_part_len: int +) -> dict: """ Create a formatted table for gene families description @@ -70,7 +79,7 @@ def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int return { "name": tables.StringCol(itemsize=max_name_len), "protein": tables.StringCol(itemsize=max_sequence_length), - "partition": tables.StringCol(itemsize=max_part_len) + "partition": tables.StringCol(itemsize=max_part_len), } @@ -95,7 +104,12 @@ def get_gene_fam_len(pangenome: Pangenome) -> Tuple[int, int, int]: return max_gene_fam_name_len, max_gene_fam_seq_len, max_part_len -def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_gene_fam_info( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Writing a table containing the protein sequences of each family @@ -104,15 +118,27 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param force: force to write information if precedent information exist :param disable_bar: Disable progress bar """ - if '/geneFamiliesInfo' in h5f and force is True: - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") - h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. - gene_fam_seq = h5f.create_table("/", "geneFamiliesInfo", gene_fam_desc(*get_gene_fam_len(pangenome)), - expectedrows=pangenome.number_of_gene_families) + if "/geneFamiliesInfo" in h5f and force is True: + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed gene family representative sequences..." + ) + h5f.remove_node( + "/", "geneFamiliesInfo" + ) # erasing the table, and rewriting a new one. + gene_fam_seq = h5f.create_table( + "/", + "geneFamiliesInfo", + gene_fam_desc(*get_gene_fam_len(pangenome)), + expectedrows=pangenome.number_of_gene_families, + ) row = gene_fam_seq.row - for fam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families, - unit="gene family", disable=disable_bar): + for fam in tqdm( + pangenome.gene_families, + total=pangenome.number_of_gene_families, + unit="gene family", + disable=disable_bar, + ): row["name"] = fam.name row["protein"] = fam.sequence row["partition"] = fam.partition @@ -131,7 +157,7 @@ def gene_to_fam_desc(gene_fam_name_len: int, gene_id_len: int) -> dict: """ return { "geneFam": tables.StringCol(itemsize=gene_fam_name_len), - "gene": tables.StringCol(itemsize=gene_id_len) + "gene": tables.StringCol(itemsize=gene_id_len), } @@ -154,7 +180,12 @@ def get_gene_to_fam_len(pangenome: Pangenome): return max_gene_fam_name, max_gene_id -def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_gene_families( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Function writing all the pangenome gene families @@ -163,13 +194,23 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param force: Force to write gene families in hdf5 file if there is already gene families :param disable_bar: Disable progress bar """ - if '/geneFamilies' in h5f and force is True: - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family to gene associations...") - h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. - gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) + if "/geneFamilies" in h5f and force is True: + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed gene family to gene associations..." + ) + h5f.remove_node( + "/", "geneFamilies" + ) # erasing the table, and rewriting a new one. + gene_families = h5f.create_table( + "/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome)) + ) gene_row = gene_families.row - for family in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families, unit="gene family", - disable=disable_bar): + for family in tqdm( + pangenome.gene_families, + total=pangenome.number_of_gene_families, + unit="gene family", + disable=disable_bar, + ): for gene in family.genes: gene_row["gene"] = gene.ID gene_row["geneFam"] = family.name @@ -186,8 +227,8 @@ def graph_desc(max_gene_id_len): :return: formatted table """ return { - 'geneTarget': tables.StringCol(itemsize=max_gene_id_len), - 'geneSource': tables.StringCol(itemsize=max_gene_id_len) + "geneTarget": tables.StringCol(itemsize=max_gene_id_len), + "geneSource": tables.StringCol(itemsize=max_gene_id_len), } @@ -206,7 +247,12 @@ def get_gene_id_len(pangenome: Pangenome) -> int: return max_gene_len -def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_graph( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Function writing the pangenome graph @@ -218,13 +264,22 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis # TODO if we want to be able to read the graph without reading the annotations (because it's one of the most time # consumming parts to read), it might be good to add the organism name in the table here. # for now, forcing the read of annotations. - if '/edges' in h5f and force is True: + if "/edges" in h5f and force is True: logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") - edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), - expectedrows=pangenome.number_of_edges) + edge_table = h5f.create_table( + "/", + "edges", + graph_desc(get_gene_id_len(pangenome)), + expectedrows=pangenome.number_of_edges, + ) edge_row = edge_table.row - for edge in tqdm(pangenome.edges, total=pangenome.number_of_edges, unit="edge", disable=disable_bar): + for edge in tqdm( + pangenome.edges, + total=pangenome.number_of_edges, + unit="edge", + disable=disable_bar, + ): for gene1, gene2 in edge.gene_pairs: edge_row["geneTarget"] = gene1.ID edge_row["geneSource"] = gene2.ID @@ -242,8 +297,8 @@ def rgp_desc(max_rgp_len, max_gene_len): :return: formatted table """ return { - 'RGP': tables.StringCol(itemsize=max_rgp_len), - 'gene': tables.StringCol(itemsize=max_gene_len) + "RGP": tables.StringCol(itemsize=max_rgp_len), + "gene": tables.StringCol(itemsize=max_gene_len), } @@ -266,7 +321,12 @@ def get_rgp_len(pangenome: Pangenome) -> Tuple[int, int]: return max_rgp_len, max_gene_len -def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_rgp( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Function writing all the region of genomic plasticity in pangenome @@ -275,14 +335,23 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab :param force: Force to write gene families in hdf5 file if there is already RGP :param disable_bar: Disable progress bar """ - if '/RGP' in h5f and force is True: + if "/RGP" in h5f and force is True: logging.getLogger("PPanGGOLiN").info("Erasing the formerly computer RGP") - h5f.remove_node('/', 'RGP') - - rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), - expectedrows=sum([len(region) for region in pangenome.regions])) + h5f.remove_node("/", "RGP") + + rgp_table = h5f.create_table( + "/", + "RGP", + rgp_desc(*get_rgp_len(pangenome)), + expectedrows=sum([len(region) for region in pangenome.regions]), + ) rgp_row = rgp_table.row - for region in tqdm(pangenome.regions, total=pangenome.number_of_rgp, unit="region", disable=disable_bar): + for region in tqdm( + pangenome.regions, + total=pangenome.number_of_rgp, + unit="region", + disable=disable_bar, + ): for gene in region.genes: rgp_row["RGP"] = region.name rgp_row["gene"] = gene.ID @@ -298,10 +367,7 @@ def spot_desc(max_rgp_len): :return: formatted table """ - return { - 'spot': tables.UInt32Col(), - 'RGP': tables.StringCol(itemsize=max_rgp_len) - } + return {"spot": tables.UInt32Col(), "RGP": tables.StringCol(itemsize=max_rgp_len)} def get_spot_desc(pangenome: Pangenome) -> int: @@ -320,7 +386,12 @@ def get_spot_desc(pangenome: Pangenome) -> int: return max_rgp_len -def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_spots( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Function writing all the pangenome hotspot @@ -329,14 +400,23 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis :param force: Force to write gene families in hdf5 file if there is already spot :param disable_bar: Disable progress bar """ - if '/spots' in h5f and force is True: + if "/spots" in h5f and force is True: logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed spots") h5f.remove_node("/", "spots") - spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), - expectedrows=sum([len(spot) for spot in pangenome.spots])) + spot_table = h5f.create_table( + "/", + "spots", + spot_desc(get_spot_desc(pangenome)), + expectedrows=sum([len(spot) for spot in pangenome.spots]), + ) spot_row = spot_table.row - for spot in tqdm(pangenome.spots, total=pangenome.number_of_spots, unit="spot", disable=disable_bar): + for spot in tqdm( + pangenome.spots, + total=pangenome.number_of_spots, + unit="spot", + disable=disable_bar, + ): for region in spot.regions: spot_row["spot"] = spot.ID spot_row["RGP"] = region.name @@ -374,7 +454,12 @@ def get_mod_desc(pangenome: Pangenome) -> int: return max_fam_len -def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): +def write_modules( + pangenome: Pangenome, + h5f: tables.File, + force: bool = False, + disable_bar: bool = False, +): """ Function writing all the pangenome modules @@ -383,15 +468,24 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d :param force: Force to write gene families in hdf5 file if there is already spot :param disable_bar: Disable progress bar """ - if '/modules' in h5f and force is True: + if "/modules" in h5f and force is True: logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed modules") h5f.remove_node("/", "modules") - mod_table = h5f.create_table('/', 'modules', mod_desc(get_mod_desc(pangenome)), - expectedrows=sum([len(mod) for mod in pangenome.modules])) + mod_table = h5f.create_table( + "/", + "modules", + mod_desc(get_mod_desc(pangenome)), + expectedrows=sum([len(mod) for mod in pangenome.modules]), + ) mod_row = mod_table.row - for mod in tqdm(pangenome.modules, total=pangenome.number_of_modules, unit="modules", disable=disable_bar): + for mod in tqdm( + pangenome.modules, + total=pangenome.number_of_modules, + unit="modules", + disable=disable_bar, + ): for fam in mod.families: mod_row["geneFam"] = fam.name mod_row["module"] = mod.ID @@ -400,6 +494,7 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d write_info_modules(pangenome, h5f) + def write_status(pangenome: Pangenome, h5f: tables.File): """ Write pangenome status in HDF5 file @@ -410,26 +505,57 @@ def write_status(pangenome: Pangenome, h5f: tables.File): if "/status" in h5f: # if statuses are already written status_group = h5f.root.status else: # else create the status group. - status_group = h5f.create_group("/", "status", "Statuses of the pangenome content") - status_group._v_attrs.genomesAnnotated = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.geneSequences = True if pangenome.status["geneSequences"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.genesClustered = True if pangenome.status["genesClustered"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.geneFamilySequences = True if pangenome.status["geneFamilySequences"] in ["Computed", - "Loaded", - "inFile"] else False - status_group._v_attrs.NeighborsGraph = True if pangenome.status["neighborsGraph"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.Partitioned = True if pangenome.status["partitioned"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.defragmented = True if pangenome.status["defragmented"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.predictedRGP = True if pangenome.status["predictedRGP"] in ["Computed", "Loaded", - "inFile"] else False - status_group._v_attrs.spots = True if pangenome.status["spots"] in ["Computed", "Loaded", "inFile"] else False - status_group._v_attrs.modules = True if pangenome.status["modules"] in ["Computed", "Loaded", "inFile"] else False + status_group = h5f.create_group( + "/", "status", "Statuses of the pangenome content" + ) + status_group._v_attrs.genomesAnnotated = ( + True + if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.geneSequences = ( + True + if pangenome.status["geneSequences"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.genesClustered = ( + True + if pangenome.status["genesClustered"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.geneFamilySequences = ( + True + if pangenome.status["geneFamilySequences"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.NeighborsGraph = ( + True + if pangenome.status["neighborsGraph"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.Partitioned = ( + True + if pangenome.status["partitioned"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.defragmented = ( + True + if pangenome.status["defragmented"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.predictedRGP = ( + True + if pangenome.status["predictedRGP"] in ["Computed", "Loaded", "inFile"] + else False + ) + status_group._v_attrs.spots = ( + True if pangenome.status["spots"] in ["Computed", "Loaded", "inFile"] else False + ) + status_group._v_attrs.modules = ( + True + if pangenome.status["modules"] in ["Computed", "Loaded", "inFile"] + else False + ) status_group._v_attrs.metadata = write_metadata_status(pangenome, h5f, status_group) status_group._v_attrs.version = distribution("ppanggolin").version @@ -445,7 +571,9 @@ def write_info(pangenome: Pangenome, h5f: tables.File): if "/info" in h5f: info_group = h5f.root.info else: - info_group = h5f.create_group("/", "info", "Information about the pangenome content") + info_group = h5f.create_group( + "/", "info", "Information about the pangenome content" + ) if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfGenes = pangenome.number_of_genes info_group._v_attrs.numberOfGenomes = pangenome.number_of_organisms @@ -460,29 +588,37 @@ def write_info(pangenome: Pangenome, h5f: tables.File): part_set = set() for fam in pangenome.gene_families: named_part_counter[fam.named_partition] += 1 - part_distribs[fam.named_partition].append(fam.number_of_organisms / pangenome.number_of_organisms) + part_distribs[fam.named_partition].append( + fam.number_of_organisms / pangenome.number_of_organisms + ) if fam.named_partition == "shell": subpart_counter[fam.partition] += 1 if fam.partition != "S_": part_set.add(fam.partition) info_group._v_attrs.numberOfPersistent = named_part_counter["persistent"] - info_group._v_attrs.persistentStats = {"min_genomes_frequency": getmin(part_distribs["persistent"]), - "max_genomes_frequency": getmax(part_distribs["persistent"]), - "sd_genomes_frequency": getstdev(part_distribs["persistent"]), - "mean_genomes_frequency": getmean(part_distribs["persistent"])} + info_group._v_attrs.persistentStats = { + "min_genomes_frequency": getmin(part_distribs["persistent"]), + "max_genomes_frequency": getmax(part_distribs["persistent"]), + "sd_genomes_frequency": getstdev(part_distribs["persistent"]), + "mean_genomes_frequency": getmean(part_distribs["persistent"]), + } info_group._v_attrs.numberOfShell = named_part_counter["shell"] - info_group._v_attrs.shellStats = {"min_genomes_frequency": getmin(part_distribs["shell"]), - "max_genomes_frequency": getmax(part_distribs["shell"]), - "sd_genomes_frequency": getstdev(part_distribs["shell"]), - "mean_genomes_frequency": getmean(part_distribs["shell"])} + info_group._v_attrs.shellStats = { + "min_genomes_frequency": getmin(part_distribs["shell"]), + "max_genomes_frequency": getmax(part_distribs["shell"]), + "sd_genomes_frequency": getstdev(part_distribs["shell"]), + "mean_genomes_frequency": getmean(part_distribs["shell"]), + } info_group._v_attrs.numberOfCloud = named_part_counter["cloud"] - info_group._v_attrs.cloudStats = {"min_genomes_frequency": getmin(part_distribs["cloud"]), - "max_genomes_frequency": getmax(part_distribs["cloud"]), - "sd_genomes_frequency": getstdev(part_distribs["cloud"]), - "mean_genomes_frequency": getmean(part_distribs["cloud"])} + info_group._v_attrs.cloudStats = { + "min_genomes_frequency": getmin(part_distribs["cloud"]), + "max_genomes_frequency": getmax(part_distribs["cloud"]), + "sd_genomes_frequency": getstdev(part_distribs["cloud"]), + "mean_genomes_frequency": getmean(part_distribs["cloud"]), + } info_group._v_attrs.numberOfPartitions = len(part_set) info_group._v_attrs.numberOfSubpartitions = subpart_counter @@ -495,9 +631,14 @@ def write_info(pangenome: Pangenome, h5f: tables.File): if pangenome.status["modules"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfModules = pangenome.number_of_modules - info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod) for mod in pangenome.modules]) + info_group._v_attrs.numberOfFamiliesInModules = sum( + [len(mod) for mod in pangenome.modules] + ) + + info_group._v_attrs.parameters = ( + pangenome.parameters + ) # saving the pangenome parameters - info_group._v_attrs.parameters = pangenome.parameters # saving the pangenome parameters def write_info_modules(pangenome: Pangenome, h5f: tables.File): """ @@ -518,43 +659,58 @@ def part_spec(part: str) -> list: pangenome.compute_mod_bitarrays(part) return [popcount(module.bitarray) for module in pangenome.modules] - if "/info" not in h5f: write_info(pangenome, h5f) info_group = h5f.root.info - mod_fam = [len(module) for module in pangenome.modules] sum_mod_fam = sum(mod_fam) - info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam), - "max": getmax(mod_fam), - "sd": getstdev(mod_fam), - "mean": getmean(mod_fam)} + info_group._v_attrs.StatOfFamiliesInModules = { + "min": getmin(mod_fam), + "max": getmax(mod_fam), + "sd": getstdev(mod_fam), + "mean": getmean(mod_fam), + } - spec_pers = part_spec(part='persistent') - spec_shell = part_spec(part='shell') - spec_cloud = part_spec(part='cloud') + spec_pers = part_spec(part="persistent") + spec_shell = part_spec(part="shell") + spec_cloud = part_spec(part="cloud") + + info_group._v_attrs.PersistentSpecInModules = { + "percent": ( + round((sum(spec_pers) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0 + ), + "min": getmin(spec_pers), + "max": getmax(spec_pers), + "sd": getstdev(spec_pers), + "mean": getmean(spec_pers), + } - info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, - "min": getmin(spec_pers), - "max": getmax(spec_pers), - "sd": getstdev(spec_pers), - "mean": getmean(spec_pers)} + info_group._v_attrs.ShellSpecInModules = { + "percent": ( + round((sum(spec_shell) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0 + ), + "min": getmin(spec_shell), + "max": getmax(spec_shell), + "sd": getstdev(spec_shell), + "mean": getmean(spec_shell), + } - info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, - "min": getmin(spec_shell), - "max": getmax(spec_shell), - "sd": getstdev(spec_shell), - "mean": getmean(spec_shell)} + info_group._v_attrs.CloudSpecInModules = { + "percent": ( + round((sum(spec_cloud) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0 + ), + "min": getmin(spec_cloud), + "max": getmax(spec_cloud), + "sd": getstdev(spec_cloud), + "mean": getmean(spec_cloud), + } - info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, - "min": getmin(spec_cloud), - "max": getmax(spec_cloud), - "sd": getstdev(spec_cloud), - "mean": getmean(spec_cloud)} -def update_gene_fam_partition(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def update_gene_fam_partition( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Update the gene families table with partition information @@ -562,14 +718,18 @@ def update_gene_fam_partition(pangenome: Pangenome, h5f: tables.File, disable_ba :param h5f: HDF5 file with gene families :param disable_bar: Allow to disable progress bar """ - logging.getLogger("PPanGGOLiN").info("Updating gene families with partition information") + logging.getLogger("PPanGGOLiN").info( + "Updating gene families with partition information" + ) table = h5f.root.geneFamiliesInfo for row in tqdm(table, total=table.nrows, unit="gene family", disable=disable_bar): row["partition"] = pangenome.get_gene_family(row["name"].decode()).partition row.update() -def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def update_gene_fragments( + pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False +): """ Updates the annotation table with the fragmentation information from the defrag pipeline @@ -577,21 +737,32 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b :param h5f: HDF5 pangenome file :param disable_bar: Allow to disable progress bar """ - logging.getLogger("PPanGGOLiN").info("Updating annotations with fragment information") + logging.getLogger("PPanGGOLiN").info( + "Updating annotations with fragment information" + ) genedataid2genedata = read_genedata(h5f) table = h5f.root.annotations.genes for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar): - genedata_id = row['genedata_id'] - if genedataid2genedata[genedata_id].gene_type == 'CDS': - row['is_fragment'] = pangenome.get_gene(row['ID'].decode()).is_fragment + genedata_id = row["genedata_id"] + if genedataid2genedata[genedata_id].gene_type == "CDS": + row["is_fragment"] = pangenome.get_gene(row["ID"].decode()).is_fragment row.update() table.flush() -def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bool = False, partition: bool = False, - rgp: bool = False, spots: bool = False, modules: bool = False, - metadata: bool = False, metatype: str = None, source: str = None): +def erase_pangenome( + pangenome: Pangenome, + graph: bool = False, + gene_families: bool = False, + partition: bool = False, + rgp: bool = False, + spots: bool = False, + modules: bool = False, + metadata: bool = False, + metatype: str = None, + source: str = None, +): """ Erases tables from a pangenome .h5 file @@ -614,15 +785,19 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo status_group = h5f.root.status info_group = h5f.root.info - if '/edges' in h5f and (graph or gene_families): + if "/edges" in h5f and (graph or gene_families): logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") status_group._v_attrs.NeighborsGraph = False pangenome.status["neighborsGraph"] = "No" h5f.del_node_attr(info_group, "numberOfEdges") - if '/geneFamilies' in h5f and gene_families: - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family to gene associations...") - h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. + if "/geneFamilies" in h5f and gene_families: + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed gene family to gene associations..." + ) + h5f.remove_node( + "/", "geneFamilies" + ) # erasing the table, and rewriting a new one. pangenome.status["defragmented"] = "No" pangenome.status["genesClustered"] = "No" status_group._v_attrs.defragmented = False @@ -630,16 +805,20 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfClusters") - if '/geneFamiliesInfo' in h5f and gene_families: - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") - h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. + if "/geneFamiliesInfo" in h5f and gene_families: + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed gene family representative sequences..." + ) + h5f.remove_node( + "/", "geneFamiliesInfo" + ) # erasing the table, and rewriting a new one. pangenome.status["geneFamilySequences"] = "No" status_group._v_attrs.geneFamilySequences = False if partition: logging.getLogger("PPanGGOLiN").info("Erasing former partitions...") pangenome.status["partitioned"] = "No" status_group._v_attrs.Partitioned = False - if 'Partitioned' in status_group._v_attrs._f_list(): + if "Partitioned" in status_group._v_attrs._f_list(): status_group._v_attrs.Partitioned = False h5f.del_node_attr(info_group, "numberOfPersistent") @@ -651,7 +830,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfPartitions") h5f.del_node_attr(info_group, "numberOfSubpartitions") - if '/RGP' in h5f and (gene_families or partition or rgp): + if "/RGP" in h5f and (gene_families or partition or rgp): logging.getLogger("PPanGGOLiN").info("Erasing the formerly computer RGP...") pangenome.status["predictedRGP"] = "No" status_group._v_attrs.predictedRGP = False @@ -659,32 +838,43 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfRGP") - if '/spots' in h5f and (gene_families or partition or rgp or spots): - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed spots...") + if "/spots" in h5f and (gene_families or partition or rgp or spots): + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed spots..." + ) pangenome.status["spots"] = "No" status_group._v_attrs.spots = False h5f.remove_node("/", "spots") h5f.del_node_attr(info_group, "numberOfSpots") - if '/modules' in h5f and (gene_families or partition or modules): - logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed modules...") + if "/modules" in h5f and (gene_families or partition or modules): + logging.getLogger("PPanGGOLiN").info( + "Erasing the formerly computed modules..." + ) pangenome.status["modules"] = "No" status_group._v_attrs.modules = False h5f.remove_node("/", "modules") h5f.del_node_attr(info_group, "numberOfModules") h5f.del_node_attr(info_group, "numberOfFamiliesInModules") - for info in ['CloudSpecInModules', 'PersistentSpecInModules', 'ShellSpecInModules', 'numberOfFamiliesInModules', - 'StatOfFamiliesInModules']: + for info in [ + "CloudSpecInModules", + "PersistentSpecInModules", + "ShellSpecInModules", + "numberOfFamiliesInModules", + "StatOfFamiliesInModules", + ]: if info in info_group._v_attrs._f_list(): h5f.del_node_attr(info_group, info) - if '/metadata/' in h5f and metadata: + if "/metadata/" in h5f and metadata: erase_metadata(pangenome, h5f, status_group, metatype, source) -def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable_bar: bool = False): +def write_pangenome( + pangenome: Pangenome, filename, force: bool = False, disable_bar: bool = False +): """ Writes or updates a pangenome file @@ -696,12 +886,16 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable try: assert pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", "inFile"] except AssertionError: - raise AssertionError("Something REALLY unexpected and unplanned for happened here. " - "Please post an issue on github with what you did to reach this error.") + raise AssertionError( + "Something REALLY unexpected and unplanned for happened here. " + "Please post an issue on github with what you did to reach this error." + ) if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", "inFile"]: if pangenome.status["genomesAnnotated"] == "Computed": - compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') + compression_filter = tables.Filters( + complevel=1, shuffle=True, bitshuffle=True, complib="blosc:zstd" + ) h5f = tables.open_file(filename, "w", filters=compression_filter) logging.getLogger("PPanGGOLiN").info("Writing genome annotations...") @@ -714,39 +908,57 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f = tables.open_file(filename, "a") if pangenome.status["geneSequences"] == "Computed": - logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences in pangenome...") + logging.getLogger("PPanGGOLiN").info( + "writing the protein coding gene dna sequences in pangenome..." + ) write_gene_sequences(pangenome, h5f, disable_bar=disable_bar) pangenome.status["geneSequences"] = "Loaded" if pangenome.status["genesClustered"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations in pangenome...") + logging.getLogger("PPanGGOLiN").info( + "Writing gene families and gene associations in pangenome..." + ) write_gene_families(pangenome, h5f, force, disable_bar=disable_bar) - logging.getLogger("PPanGGOLiN").info("Writing gene families information in pangenome...") + logging.getLogger("PPanGGOLiN").info( + "Writing gene families information in pangenome..." + ) write_gene_fam_info(pangenome, h5f, force, disable_bar=disable_bar) - if pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] and \ - pangenome.status["defragmented"] == "Computed": + if ( + pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] + and pangenome.status["defragmented"] == "Computed" + ): # if the annotations have not been computed in this run, # and there has been a clustering with defragmentation, then the annotations can be updated update_gene_fragments(pangenome, h5f, disable_bar=disable_bar) pangenome.status["genesClustered"] = "Loaded" if pangenome.status["neighborsGraph"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing the edges of neighbors graph in pangenome...") + logging.getLogger("PPanGGOLiN").info( + "Writing the edges of neighbors graph in pangenome..." + ) write_graph(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["neighborsGraph"] = "Loaded" - if pangenome.status["partitioned"] == "Computed" and \ - pangenome.status["genesClustered"] in ["Loaded", "inFile"]: # otherwise, it's been written already. + if pangenome.status["partitioned"] == "Computed" and pangenome.status[ + "genesClustered" + ] in [ + "Loaded", + "inFile", + ]: # otherwise, it's been written already. update_gene_fam_partition(pangenome, h5f, disable_bar=disable_bar) pangenome.status["partitioned"] = "Loaded" - if pangenome.status['predictedRGP'] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity in pangenome...") + if pangenome.status["predictedRGP"] == "Computed": + logging.getLogger("PPanGGOLiN").info( + "Writing Regions of Genomic Plasticity in pangenome..." + ) write_rgp(pangenome, h5f, force, disable_bar=disable_bar) - pangenome.status['predictedRGP'] = "Loaded" + pangenome.status["predictedRGP"] = "Loaded" if pangenome.status["spots"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion in pangenome...") + logging.getLogger("PPanGGOLiN").info( + "Writing Spots of Insertion in pangenome..." + ) write_spots(pangenome, h5f, force, disable_bar=disable_bar) - pangenome.status['spots'] = "Loaded" + pangenome.status["spots"] = "Loaded" if pangenome.status["modules"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing Modules in pangenome...") @@ -759,6 +971,6 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable write_info(pangenome, h5f) h5f.close() - logging.getLogger("PPanGGOLiN").info(f"Done writing the pangenome. It is in file : {filename}") - - + logging.getLogger("PPanGGOLiN").info( + f"Done writing the pangenome. It is in file : {filename}" + ) diff --git a/ppanggolin/formats/writeFlatGenomes.py b/ppanggolin/formats/writeFlatGenomes.py index cd9082f1..8f8a78f8 100644 --- a/ppanggolin/formats/writeFlatGenomes.py +++ b/ppanggolin/formats/writeFlatGenomes.py @@ -22,7 +22,12 @@ from ppanggolin.genome import Organism, Gene, RNA from ppanggolin.region import Region, Module from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, extract_contig_window, parse_input_paths_file +from ppanggolin.utils import ( + write_compressed_or_not, + mk_outdir, + extract_contig_window, + parse_input_paths_file, +) from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.formats.write_proksee import write_proksee_organism from ppanggolin.formats.writeSequences import read_genome_file, write_spaced_fasta @@ -50,8 +55,15 @@ def count_neighbors_partitions(gene_family: GeneFamily): return nb_pers, nb_shell, nb_cloud -def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = False, metadata_sep:str = "|", - need_regions: bool = False, need_spots: bool = False, need_modules: bool = False): +def write_tsv_genome_file( + organism: Organism, + output: Path, + compress: bool = False, + metadata_sep: str = "|", + need_regions: bool = False, + need_spots: bool = False, + need_modules: bool = False, +): """ Write the table of genes with pangenome annotation for one organism in tsv @@ -70,13 +82,17 @@ def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = Fal gene_info = {} - gene_info["gene"] = gene.ID if gene.local_identifier == "" else gene.local_identifier + gene_info["gene"] = ( + gene.ID if gene.local_identifier == "" else gene.local_identifier + ) gene_info["contig"] = gene.contig.name gene_info["start"] = gene.start gene_info["stop"] = gene.stop gene_info["strand"] = gene.strand gene_info["family"] = gene.family.name - gene_info["nb_copy_in_genome"] = len(list(gene.family.get_genes_per_org(organism))) + gene_info["nb_copy_in_genome"] = len( + list(gene.family.get_genes_per_org(organism)) + ) gene_info["partition"] = gene.family.named_partition gene_info["persistent_neighbors"] = nb_pers gene_info["shell_neighbors"] = nb_shell @@ -85,34 +101,53 @@ def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = Fal if need_regions: gene_info["RGP"] = str(gene.RGP) if gene.RGP is not None else None if need_spots: - gene_info['Spot'] = str(gene.spot) if gene.spot is not None else None + gene_info["Spot"] = str(gene.spot) if gene.spot is not None else None if need_modules: - gene_info['Module'] = str(gene.family.module) if gene.family.has_module else None + gene_info["Module"] = ( + str(gene.family.module) if gene.family.has_module else None + ) # Add metadata - gene_metadata = {f"gene_{key}":value for key, value in gene.formatted_metadata_dict(metadata_sep).items()} + gene_metadata = { + f"gene_{key}": value + for key, value in gene.formatted_metadata_dict(metadata_sep).items() + } gene_info.update(gene_metadata) - family_metadata = {f"family_{key}":value for key, value in gene.family.formatted_metadata_dict(metadata_sep).items()} + family_metadata = { + f"family_{key}": value + for key, value in gene.family.formatted_metadata_dict(metadata_sep).items() + } gene_info.update(family_metadata) if need_regions and gene.RGP: - rgp_metadata = {f"rgp_{key}":value for key, value in gene.RGP.formatted_metadata_dict(metadata_sep).items()} + rgp_metadata = { + f"rgp_{key}": value + for key, value in gene.RGP.formatted_metadata_dict(metadata_sep).items() + } gene_info.update(rgp_metadata) rows.append(gene_info) - pd.DataFrame(rows).to_csv(output / f"{organism.name}.tsv{'.gz' if compress else ''}", sep="\t", index=False) + pd.DataFrame(rows).to_csv( + output / f"{organism.name}.tsv{'.gz' if compress else ''}", + sep="\t", + index=False, + ) - logging.getLogger("PPangGGOLiN").debug(f"Done writing the table with pangenome annotation for {organism.name}") + logging.getLogger("PPangGGOLiN").debug( + f"Done writing the table with pangenome annotation for {organism.name}" + ) -def manage_module_colors(modules: Set[Module], window_size: int = 100) -> Dict[Module, str]: +def manage_module_colors( + modules: Set[Module], window_size: int = 100 +) -> Dict[Module, str]: """ Manages colors for a list of modules based on gene positions and a specified window size. :param modules: A list of module objects for which you want to determine colors. - :param window_size: Minimum number of genes between two modules to color them with the same color. + :param window_size: Minimum number of genes between two modules to color them with the same color. A higher value results in more module colors. :return: A dictionary that maps each module to its assigned color. """ @@ -132,23 +167,29 @@ def manage_module_colors(modules: Set[Module], window_size: int = 100) -> Dict[M for contig, mod_genes in contig_to_mod_genes.items(): gene_positions = (gene.position for gene in mod_genes) - contig_windows = extract_contig_window(contig.number_of_genes, - gene_positions, - window_size=window_size, - is_circular=contig.is_circular) + contig_windows = extract_contig_window( + contig.number_of_genes, + gene_positions, + window_size=window_size, + is_circular=contig.is_circular, + ) contig_windows = list(contig_windows) - for (start, end) in contig_windows: - module_in_window = {gene_to_module[gene] for gene in mod_genes if start <= gene.position <= end} + for start, end in contig_windows: + module_in_window = { + gene_to_module[gene] + for gene in mod_genes + if start <= gene.position <= end + } # Add edges between closely located modules - module_edges = [(mod_a, mod_b) for mod_a, mod_b in combinations(module_in_window, 2)] + module_edges = [ + (mod_a, mod_b) for mod_a, mod_b in combinations(module_in_window, 2) + ] color_mod_graph.add_edges_from(module_edges) - module_to_group = nx.coloring.greedy_color(color_mod_graph) - # Attempt to have always the same color associated with the same module... module_to_color_int = {} group_with_color = [] @@ -158,13 +199,14 @@ def manage_module_colors(modules: Set[Module], window_size: int = 100) -> Dict[M group_with_color.append(group) module_to_color_int[module] = group_with_color.index(group) - # If you want to export the graph to see the coloring: # nx.set_node_attributes(color_mod_graph, module_to_color_int, name="color") # nx.readwrite.graphml.write_graphml(color_mod_graph, f"module_graph_window_size{window_size}.graphml") nb_colors = len(set(module_to_color_int.values())) - logging.getLogger().debug(f"We have found that {nb_colors} colors were necessary to color Modules.") + logging.getLogger().debug( + f"We have found that {nb_colors} colors were necessary to color Modules." + ) colors = palette(nb_colors) module_to_color = {mod: colors[col_i] for mod, col_i in module_to_color_int.items()} @@ -185,8 +227,10 @@ def palette(nb_colors: int) -> List[str]: if len(colors) < nb_colors: # Generate random colors if not enough predefined colors are available - random_colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in - range(nb_colors - len(colors))] + random_colors = [ + "#" + "".join([random.choice("0123456789ABCDEF") for _ in range(6)]) + for _ in range(nb_colors - len(colors)) + ] colors += random_colors else: colors = colors[:nb_colors] @@ -197,20 +241,20 @@ def palette(nb_colors: int) -> List[str]: def encode_attribute_val(product: str) -> str: """ Encode special characters forbidden in column 9 of the GFF3 format. - + :param product: The input string to encode. :return: The encoded string with special characters replaced. - + Reference: - GFF3 format requirement: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md - Code source taken from Bakta: https://github.com/oschwengers/bakta """ product = str(product) - product = product.replace('%', '%25') - product = product.replace(';', '%3B') - product = product.replace('=', '%3D') - product = product.replace('&', '%26') - product = product.replace(',', '%2C') + product = product.replace("%", "%25") + product = product.replace(";", "%3B") + product = product.replace("=", "%3D") + product = product.replace("&", "%26") + product = product.replace(",", "%2C") return product @@ -221,12 +265,23 @@ def encode_attributes(attributes: List[Tuple]) -> str: :param attributes: A list of attribute key-value pairs represented as tuples. :return: The encoded attributes as a semicolon-separated string. """ - return ';'.join( - [f"{encode_attribute_val(k)}={encode_attribute_val(v)}" for k, v in attributes if str(v) != "" and v is not None]) - - -def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[str, str], - genome_sequences: Dict[str, str], metadata_sep: str = "|", compress: bool = False): + return ";".join( + [ + f"{encode_attribute_val(k)}={encode_attribute_val(v)}" + for k, v in attributes + if str(v) != "" and v is not None + ] + ) + + +def write_gff_file( + organism: Organism, + outdir: Path, + annotation_sources: Dict[str, str], + genome_sequences: Dict[str, str], + metadata_sep: str = "|", + compress: bool = False, +): """ Write the GFF file of the provided organism. @@ -241,39 +296,53 @@ def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[st # sort contig by their name sorted_contigs = sorted(organism.contigs, key=lambda x: x.name) - organism_metadata = [(f"genome_{key}", value) for key, value in organism.formatted_metadata_dict(metadata_sep).items()] + organism_metadata = [ + (f"genome_{key}", value) + for key, value in organism.formatted_metadata_dict(metadata_sep).items() + ] - with write_compressed_or_not(outdir / F"{organism.name}.gff", compress) as outfile: + with write_compressed_or_not(outdir / f"{organism.name}.gff", compress) as outfile: # write gff header - outfile.write('##gff-version 3\n') + outfile.write("##gff-version 3\n") for contig in sorted_contigs: if contig.length is None: - raise AttributeError(f'Contig {contig.name} has no length defined.') + raise AttributeError(f"Contig {contig.name} has no length defined.") - outfile.write(f'##sequence-region {contig.name} 1 {contig.length}\n') + outfile.write(f"##sequence-region {contig.name} 1 {contig.length}\n") for contig in sorted_contigs: - contig_metadata = [(f"contig_{key}", value) for key, value in - contig.formatted_metadata_dict(metadata_sep).items()] - attributes = [("ID", contig.name), - ("Is_circular", - "true" if contig.is_circular else "false")] + organism_metadata + contig_metadata + contig_metadata = [ + (f"contig_{key}", value) + for key, value in contig.formatted_metadata_dict(metadata_sep).items() + ] + attributes = ( + [ + ("ID", contig.name), + ("Is_circular", "true" if contig.is_circular else "false"), + ] + + organism_metadata + + contig_metadata + ) attributes_str = encode_attributes(attributes) - contig_line = [contig.name, - ".", - "region", - "1", - contig.length, - ".", - "+", - ".", - attributes_str] - contig_line_str = '\t'.join(map(str, contig_line)) + contig_line = [ + contig.name, + ".", + "region", + "1", + contig.length, + ".", + "+", + ".", + attributes_str, + ] + contig_line_str = "\t".join(map(str, contig_line)) outfile.write(contig_line_str + "\n") - contig_elements = sorted(list(contig.regions) + list(contig.genes) + list(contig.RNAs), - key=lambda x: x.start) + contig_elements = sorted( + list(contig.regions) + list(contig.genes) + list(contig.RNAs), + key=lambda x: x.start, + ) for feature in contig_elements: phase = "." @@ -288,13 +357,14 @@ def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[st # before the CDS or RNA line a gene line is created. with the following id parent_gene_id = f"gene-{feature.ID}" - attributes = [("ID", feature.ID), - ("Name", feature.name), - ('Parent', parent_gene_id), - ("product", feature.product), - ] + attributes = [ + ("ID", feature.ID), + ("Name", feature.name), + ("Parent", parent_gene_id), + ("product", feature.product), + ] - score = '.' + score = "." if isinstance(feature, Gene): rgp = feature.RGP.name if feature.RGP else "" @@ -303,15 +373,26 @@ def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[st attributes += [ ("family", feature.family.name), ("partition", feature.family.named_partition), - ('rgp', rgp), - ('module', feature.family.module) # family.module can be None... + ("rgp", rgp), + ( + "module", + feature.family.module, + ), # family.module can be None... ] # adding attributes - gene_metadata = [(f"gene_{key}", value) for key, value in - feature.formatted_metadata_dict(metadata_sep).items()] - family_metadata = [(f"family_{key}", value) for key, value in - feature.family.formatted_metadata_dict(metadata_sep).items()] + gene_metadata = [ + (f"gene_{key}", value) + for key, value in feature.formatted_metadata_dict( + metadata_sep + ).items() + ] + family_metadata = [ + (f"family_{key}", value) + for key, value in feature.family.formatted_metadata_dict( + metadata_sep + ).items() + ] attributes += gene_metadata attributes += family_metadata @@ -321,17 +402,18 @@ def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[st if feature.overlaps_contig_edge: stop = contig.length + feature.stop - gene_line = [contig.name, - source, - 'gene', - feature.start, - stop, - '.', - strand, - ".", - f'ID={encode_attribute_val(parent_gene_id)}' - ] - line_str = '\t'.join(map(str, gene_line)) + gene_line = [ + contig.name, + source, + "gene", + feature.start, + stop, + ".", + strand, + ".", + f"ID={encode_attribute_val(parent_gene_id)}", + ] + line_str = "\t".join(map(str, gene_line)) outfile.write(line_str + "\n") elif isinstance(feature, Region): @@ -340,51 +422,68 @@ def write_gff_file(organism: Organism, outdir: Path, annotation_sources: Dict[st strand = "." score = "." - rgp_metadata = [(f"rgp_{key}", value) for key, value in - feature.formatted_metadata_dict(metadata_sep).items()] + rgp_metadata = [ + (f"rgp_{key}", value) + for key, value in feature.formatted_metadata_dict( + metadata_sep + ).items() + ] attributes = [ ("Name", feature.name), - ("spot", feature.spot.ID if feature.spot is not None else "No_spot"), - ("Note", "Region of Genomic Plasticity (RGP)") + ( + "spot", + feature.spot.ID if feature.spot is not None else "No_spot", + ), + ("Note", "Region of Genomic Plasticity (RGP)"), ] attributes += rgp_metadata else: raise TypeError( - f'The feature to write in gff file does not have an expected types. {type(feature)}') + f"The feature to write in gff file does not have an expected types. {type(feature)}" + ) attributes_str = encode_attributes(attributes) coordinates = feature.coordinates if feature.overlaps_contig_edge: - coordinates = convert_overlapping_coordinates_for_gff(feature.coordinates, len(contig)) + coordinates = convert_overlapping_coordinates_for_gff( + feature.coordinates, len(contig) + ) for start, stop in coordinates: - line = [contig.name, - source, # Source - feat_type, - start, - stop, - score, - strand, - phase, - attributes_str, - ] - - line_str = '\t'.join(map(str, line)) + line = [ + contig.name, + source, # Source + feat_type, + start, + stop, + score, + strand, + phase, + attributes_str, + ] + + line_str = "\t".join(map(str, line)) outfile.write(line_str + "\n") if genome_sequences: - logging.getLogger("PPanGGOLiN").debug("Writing fasta section of gff file...") + logging.getLogger("PPanGGOLiN").debug( + "Writing fasta section of gff file..." + ) outfile.write("##FASTA\n") for contig in sorted_contigs: outfile.write(f">{contig.name}\n") - outfile.write(write_spaced_fasta(genome_sequences[contig.name], space=60)) + outfile.write( + write_spaced_fasta(genome_sequences[contig.name], space=60) + ) -def convert_overlapping_coordinates_for_gff(coordinates: List[Tuple[int, int]], contig_length: int): +def convert_overlapping_coordinates_for_gff( + coordinates: List[Tuple[int, int]], contig_length: int +): """ Converts overlapping gene coordinates in GFF format for circular contigs. @@ -393,11 +492,11 @@ def convert_overlapping_coordinates_for_gff(coordinates: List[Tuple[int, int]], """ start, stop = coordinates[0] - new_coordinates = [(start, stop )] + new_coordinates = [(start, stop)] # convert all coordinates that are at the beginning # of the contig to the extent of the contig for start_n, stop_n in coordinates[1:]: - if start_n < start: # we are on the beginning of the contig + if start_n < start: # we are on the beginning of the contig new_start = contig_length + start_n new_stop = contig_length + stop_n new_coordinates.append((new_start, new_stop)) @@ -413,7 +512,7 @@ def convert_overlapping_coordinates_for_gff(coordinates: List[Tuple[int, int]], start, stop = new_coordinates[0] for start_n, stop_n in new_coordinates[1:]: - if stop +1 == start_n: + if stop + 1 == start_n: stop = stop_n else: merged_coordinates.append((start, stop)) @@ -434,17 +533,25 @@ def get_organism_list(organisms_filt: str, pangenome: Pangenome) -> Set[Organism :return: A set of selected Organism objects. """ if organisms_filt == "all": - logging.getLogger("PPanGGOLiN").info("Writing output for all genomes of the pangenome.") + logging.getLogger("PPanGGOLiN").info( + "Writing output for all genomes of the pangenome." + ) organisms_list = set(pangenome.organisms) else: if Path(organisms_filt).is_file(): - logging.getLogger("PPanGGOLiN").debug("Parsing the list of genomes from a file " - "to determine which genomes should be included in the output.") + logging.getLogger("PPanGGOLiN").debug( + "Parsing the list of genomes from a file " + "to determine which genomes should be included in the output." + ) with open(organisms_filt) as fl: - org_names = [line.strip() for line in fl if line and not line.startswith("#")] + org_names = [ + line.strip() for line in fl if line and not line.startswith("#") + ] else: - org_names = [name.strip() for name in organisms_filt.split(',') if name.strip()] + org_names = [ + name.strip() for name in organisms_filt.split(",") if name.strip() + ] organisms_list = set() org_not_in_pangenome = set() @@ -455,17 +562,27 @@ def get_organism_list(organisms_filt: str, pangenome: Pangenome) -> Set[Organism except KeyError: org_not_in_pangenome.add(org_name) if org_not_in_pangenome: - raise KeyError(f"{len(org_not_in_pangenome)} organism(s) specified with '--genomes' parameter were " - f"not found in the pangenome: {', '.join(org_not_in_pangenome)}") + raise KeyError( + f"{len(org_not_in_pangenome)} organism(s) specified with '--genomes' parameter were " + f"not found in the pangenome: {', '.join(org_not_in_pangenome)}" + ) logging.getLogger("PPanGGOLiN").info( - f"Writing output for {len(organisms_list)}/{pangenome.number_of_organisms} genomes of the pangenome.") + f"Writing output for {len(organisms_list)}/{pangenome.number_of_organisms} genomes of the pangenome." + ) return organisms_list -def mp_write_genomes_file(organism: Organism, output: Path, organisms_file: Path = None, - proksee: bool = False, gff: bool = False, table: bool = False, **kwargs) -> str: +def mp_write_genomes_file( + organism: Organism, + output: Path, + organisms_file: Path = None, + proksee: bool = False, + gff: bool = False, + table: bool = False, + **kwargs, +) -> str: """Wrapper for the write_genomes_file function that allows it to be used in multiprocessing. :param organism: Specify the organism to be written @@ -490,33 +607,72 @@ def mp_write_genomes_file(organism: Organism, output: Path, organisms_file: Path output_file = proksee_outdir / f"{organism.name}.json" # Write ProkSee data for the organism - write_proksee_organism(organism, output_file, features=['all'], genome_sequences=genome_sequences, - **{arg: kwargs[arg] for arg in kwargs.keys() & {'module_to_colors', 'compress', 'metadata_sep', 'multigenics'}}) + write_proksee_organism( + organism, + output_file, + features=["all"], + genome_sequences=genome_sequences, + **{ + arg: kwargs[arg] + for arg in kwargs.keys() + & {"module_to_colors", "compress", "metadata_sep", "multigenics"} + }, + ) if gff: gff_outdir = output / "gff" mk_outdir(gff_outdir, force=True, exist_ok=True) - write_gff_file(organism, outdir=gff_outdir, genome_sequences=genome_sequences, - **{arg: kwargs[arg] for arg in kwargs.keys() & {'compress', 'annotation_sources', - 'metadata_sep'}}) + write_gff_file( + organism, + outdir=gff_outdir, + genome_sequences=genome_sequences, + **{ + arg: kwargs[arg] + for arg in kwargs.keys() + & {"compress", "annotation_sources", "metadata_sep"} + }, + ) if table: table_outdir = output / "table" mk_outdir(table_outdir, force=True, exist_ok=True) - write_tsv_genome_file(organism=organism, output=table_outdir, **{arg: kwargs[arg] for arg in kwargs.keys() & - {'need_regions', 'need_modules', 'need_spots', - 'compress', 'metadata_sep'}}) + write_tsv_genome_file( + organism=organism, + output=table_outdir, + **{ + arg: kwargs[arg] + for arg in kwargs.keys() + & { + "need_regions", + "need_modules", + "need_spots", + "compress", + "metadata_sep", + } + }, + ) return organism.name -def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = False, gff: bool = False, - proksee: bool = False, compress: bool = False, fasta: Path = None, - anno: Path = None, organisms_filt: str = "all", - add_metadata: bool = False, metadata_sep: str = "|", metadata_sources: List[str] = None, - cpu: int = 1, disable_bar: bool = False): +def write_flat_genome_files( + pangenome: Pangenome, + output: Path, + table: bool = False, + gff: bool = False, + proksee: bool = False, + compress: bool = False, + fasta: Path = None, + anno: Path = None, + organisms_filt: str = "all", + add_metadata: bool = False, + metadata_sep: str = "|", + metadata_sources: List[str] = None, + cpu: int = 1, + disable_bar: bool = False, +): """ Main function to write flat files from pangenome @@ -537,19 +693,21 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa """ if not any(x for x in [table, gff, proksee]): - raise argparse.ArgumentError(argument=None, message="You did not indicate what file you wanted to write.") - - need_dict = {"need_annotations": True, - "need_families": True, - "need_partitions": True, - "need_rgp": True if pangenome.status["predictedRGP"] != "No" else False, - "need_spots": True if pangenome.status["spots"] != "No" else False, - "need_modules": True if pangenome.status["modules"] != "No" else False, - "need_graph": True if table else False, - "need_metadata": add_metadata, - "sources": metadata_sources - } - + raise argparse.ArgumentError( + argument=None, message="You did not indicate what file you wanted to write." + ) + + need_dict = { + "need_annotations": True, + "need_families": True, + "need_partitions": True, + "need_rgp": True if pangenome.status["predictedRGP"] != "No" else False, + "need_spots": True if pangenome.status["spots"] != "No" else False, + "need_modules": True if pangenome.status["modules"] != "No" else False, + "need_graph": True if table else False, + "need_metadata": add_metadata, + "sources": metadata_sources, + } # Place here to raise an error if file doesn't found before to read pangenome organisms_file = fasta if fasta is not None else anno @@ -558,33 +716,55 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa organisms_list = get_organism_list(organisms_filt, pangenome) if not organisms_list: - raise ValueError("No genomes are selected for output. Please check the '--genomes' parameter.") + raise ValueError( + "No genomes are selected for output. Please check the '--genomes' parameter." + ) multigenics = None if need_dict["need_rgp"]: - multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) + multigenics = pangenome.get_multigenics( + pangenome.parameters["rgp"]["dup_margin"] + ) - org_dict = parse_input_paths_file(organisms_file) if organisms_file and (gff or proksee) else None + org_dict = ( + parse_input_paths_file(organisms_file) + if organisms_file and (gff or proksee) + else None + ) if proksee: # Generate a color mapping for modules module_to_colors = manage_module_colors(set(pangenome.modules)) - organism2args = defaultdict(lambda: {"output": output, "table": table, "gff": gff, - "proksee": proksee, "compress": compress, "multigenics":multigenics}) + organism2args = defaultdict( + lambda: { + "output": output, + "table": table, + "gff": gff, + "proksee": proksee, + "compress": compress, + "multigenics": multigenics, + } + ) for organism in organisms_list: - organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None, - "metadata_sep": metadata_sep} + organism_args = { + "genome_file": org_dict[organism.name]["path"] if org_dict else None, + "metadata_sep": metadata_sep, + } if proksee: - organism_args["module_to_colors"] = {module: module_to_colors[module] for module in organism.modules} + organism_args["module_to_colors"] = { + module: module_to_colors[module] for module in organism.modules + } if gff: # prepare variable for gff output if pangenome.parameters["annotate"]["# read_annotations_from_file"]: - organism_args['annotation_sources'] = {"rRNA": "external", - "tRNA": "external", - "CDS": "external"} + organism_args["annotation_sources"] = { + "rRNA": "external", + "tRNA": "external", + "CDS": "external", + } else: organism_args["annotation_sources"] = {} @@ -594,28 +774,39 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa for family in pangenome.gene_families: family.get_org_dict() - organism_args.update({"need_regions": need_dict['need_rgp'], - "need_modules": need_dict['need_modules'], - "need_spots": need_dict['need_spots']}) + organism_args.update( + { + "need_regions": need_dict["need_rgp"], + "need_modules": need_dict["need_modules"], + "need_spots": need_dict["need_spots"], + } + ) organism2args[organism].update(organism_args) start_writing = time.time() with ThreadPoolExecutor(max_workers=cpu) as executor: - with tqdm(total=(len(organisms_list)), unit="genome", disable=disable_bar) as progress: + with tqdm( + total=(len(organisms_list)), unit="genome", disable=disable_bar + ) as progress: futures = [] for organism, kwargs in organism2args.items(): - logging.getLogger("PPanGGOLiN").debug(f"Writing genome annotations for {organism.name}") + logging.getLogger("PPanGGOLiN").debug( + f"Writing genome annotations for {organism.name}" + ) future = executor.submit(mp_write_genomes_file, organism, **kwargs) future.add_done_callback(lambda p: progress.update()) futures.append(future) for future in futures: org_name = future.result() - logging.getLogger("PPanGGOLiN").debug(f"Done writing the GFF file with pangenome annotation for {org_name}.") + logging.getLogger("PPanGGOLiN").debug( + f"Done writing the GFF file with pangenome annotation for {org_name}." + ) writing_time = time.time() - start_writing logging.getLogger("PPanGGOLiN").debug( - f"writing_time for {pangenome.number_of_organisms} genomes: {writing_time} seconds") + f"writing_time for {pangenome.number_of_organisms} genomes: {writing_time} seconds" + ) def launch(args: argparse.Namespace): @@ -629,10 +820,22 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - write_flat_genome_files(pangenome, args.output, table=args.table, gff=args.gff, proksee=args.proksee, - compress=args.compress, fasta=args.fasta, anno=args.anno, organisms_filt=args.genomes, - add_metadata=args.add_metadata, metadata_sep=args.metadata_sep, - metadata_sources=args.metadata_sources, cpu=args.cpu, disable_bar=args.disable_prog_bar) + write_flat_genome_files( + pangenome, + args.output, + table=args.table, + gff=args.gff, + proksee=args.proksee, + compress=args.compress, + fasta=args.fasta, + anno=args.anno, + organisms_filt=args.genomes, + add_metadata=args.add_metadata, + metadata_sep=args.metadata_sep, + metadata_sources=args.metadata_sources, + cpu=args.cpu, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -643,7 +846,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("write_genomes", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "write_genomes", formatter_class=argparse.RawTextHelpFormatter + ) parser_flat(parser) return parser @@ -654,76 +859,126 @@ def parser_flat(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--table", required=False, action="store_true", - help="Generate a tsv file for each genome with pangenome annotations.") - - optional.add_argument("--gff", required=False, action="store_true", - help="Generate a gff file for each genome containing pangenome annotations.") - - optional.add_argument("--proksee", required=False, action="store_true", - help="Generate JSON map files for PROKSEE for each genome containing pangenome annotations " - "to be used in proksee.") - - optional.add_argument("--compress", required=False, action="store_true", - help="Compress the files in .gz") - - optional.add_argument("--genomes", - required=False, - default="all", - help="Specify the genomes for which to generate output. " - "You can provide a list of genome names either directly in the command line separated " - "by commas, or by referencing a file containing the list of genome names, " - "with one name per line.") - - optional.add_argument("--add_metadata", - required=False, - action="store_true", - help="Include metadata information in the output files " - "if any have been added to pangenome elements (see ppanggolin metadata command).") - - optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") - - optional.add_argument("--metadata_sep", - required=False, - default='|', - help="The separator used to join multiple metadata values for elements with multiple metadata" - " values from the same source. This character should not appear in metadata values.") - - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, - help="Number of available cpus") - - context = parser.add_argument_group(title="Contextually required arguments", - description="With --proksee and --gff, the following arguments can be " - "used to add sequence information to the output file:") - - context.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per genome.") - - context.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per genome. " - "If this is provided, those annotations will be used.") - - -if __name__ == '__main__': + optional.add_argument( + "--table", + required=False, + action="store_true", + help="Generate a tsv file for each genome with pangenome annotations.", + ) + + optional.add_argument( + "--gff", + required=False, + action="store_true", + help="Generate a gff file for each genome containing pangenome annotations.", + ) + + optional.add_argument( + "--proksee", + required=False, + action="store_true", + help="Generate JSON map files for PROKSEE for each genome containing pangenome annotations " + "to be used in proksee.", + ) + + optional.add_argument( + "--compress", + required=False, + action="store_true", + help="Compress the files in .gz", + ) + + optional.add_argument( + "--genomes", + required=False, + default="all", + help="Specify the genomes for which to generate output. " + "You can provide a list of genome names either directly in the command line separated " + "by commas, or by referencing a file containing the list of genome names, " + "with one name per line.", + ) + + optional.add_argument( + "--add_metadata", + required=False, + action="store_true", + help="Include metadata information in the output files " + "if any have been added to pangenome elements (see ppanggolin metadata command).", + ) + + optional.add_argument( + "--metadata_sources", + default=None, + nargs="+", + help="Which source of metadata should be written. " + "By default all metadata sources are included.", + ) + + optional.add_argument( + "--metadata_sep", + required=False, + default="|", + help="The separator used to join multiple metadata values for elements with multiple metadata" + " values from the same source. This character should not appear in metadata values.", + ) + + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + + context = parser.add_argument_group( + title="Contextually required arguments", + description="With --proksee and --gff, the following arguments can be " + "used to add sequence information to the output file:", + ) + + context.add_argument( + "--fasta", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per genome.", + ) + + context.add_argument( + "--anno", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " + "If this is provided, those annotations will be used.", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_flat(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/formats/writeFlatMetadata.py b/ppanggolin/formats/writeFlatMetadata.py index 5e8066cc..8c4fae46 100644 --- a/ppanggolin/formats/writeFlatMetadata.py +++ b/ppanggolin/formats/writeFlatMetadata.py @@ -16,13 +16,18 @@ from ppanggolin.formats.readBinaries import check_pangenome_info -def write_flat_metadata_files(pangenome: Pangenome, output: Path, - pangenome_elements: List[str] = None, metadata_sources: List[str] = None, - compress: bool = False, disable_bar: bool = False) -> None: +def write_flat_metadata_files( + pangenome: Pangenome, + output: Path, + pangenome_elements: List[str] = None, + metadata_sources: List[str] = None, + compress: bool = False, + disable_bar: bool = False, +) -> None: """ Main function to write flat metadata files from a pangenome. :todo: Split the function in subfunction - + :param pangenome: Pangenome object :param output: Path to output directory :param pangenome_elements: List of pangenome elements to include metadata for @@ -32,33 +37,47 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, """ if not pangenome.has_metadata(): - logging.getLogger("PPanGGOLiN").warning("No metadata is assigned to any pangenome element. Writing metadata is not possible.") + logging.getLogger("PPanGGOLiN").warning( + "No metadata is assigned to any pangenome element. Writing metadata is not possible." + ) return if pangenome_elements is None: pangenome_elements = [] - if all(pangenome.status['metadata'][element] == "No" for element in pangenome_elements): - logging.getLogger("PPanGGOLiN").warning(f"No metadata is assigned to any of the requested pangenome elements: {', '.join(pangenome_elements)}.") + if all( + pangenome.status["metadata"][element] == "No" for element in pangenome_elements + ): + logging.getLogger("PPanGGOLiN").warning( + f"No metadata is assigned to any of the requested pangenome elements: {', '.join(pangenome_elements)}." + ) return if metadata_sources is None: # Use all available sources if none are specified - element_to_sources = {element: sources for element, sources in pangenome.status['metasources'].items() if sources} + element_to_sources = { + element: sources + for element, sources in pangenome.status["metasources"].items() + if sources + } else: # Use only the specified sources, if they match available sources element_to_sources = {} for element in pangenome_elements: - existing_sources = pangenome.status['metasources'][element] + existing_sources = pangenome.status["metasources"][element] matching_sources = set(existing_sources) & set(metadata_sources) if matching_sources: element_to_sources[element] = matching_sources if not element_to_sources: - logging.getLogger("PPanGGOLiN").warning(f"None of the specified metadata sources ({metadata_sources}) match the requested pangenome elements: {pangenome_elements}.") + logging.getLogger("PPanGGOLiN").warning( + f"None of the specified metadata sources ({metadata_sources}) match the requested pangenome elements: {pangenome_elements}." + ) return - logging.getLogger("PPanGGOLiN").info(f"Writing metadata for {', '.join(element_to_sources.keys())} from {len([s for sources in element_to_sources.values() for s in sources])} sources.") + logging.getLogger("PPanGGOLiN").info( + f"Writing metadata for {', '.join(element_to_sources.keys())} from {len([s for sources in element_to_sources.values() for s in sources])} sources." + ) need_dict = { "need_annotations": True, @@ -67,7 +86,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, "need_spots": "spots" in element_to_sources, "need_modules": "modules" in element_to_sources, "need_metadata": True, - "sources": metadata_sources + "sources": metadata_sources, } check_pangenome_info(pangenome, disable_bar=disable_bar, **need_dict) @@ -79,7 +98,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, "genes": "genes", "modules": "modules", "spots": "spots", - "contigs": "contigs" + "contigs": "contigs", } for element_type, sources in element_to_sources.items(): @@ -96,26 +115,33 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, metadata_dict[element_type] = f"module_{element.ID}" elif element_type in ["genes"]: metadata_dict[element_type] = element.ID - elif element_type in ["families", 'genomes', "RGPs", "contigs"]: + elif element_type in ["families", "genomes", "RGPs", "contigs"]: metadata_dict[element_type] = element.name # Add genome name for genome-specific elements if genome is not already a metadata - if element_type in ["genes", "contigs", "RGPs"] and "genomes" not in metadata_dict: + if ( + element_type in ["genes", "contigs", "RGPs"] + and "genomes" not in metadata_dict + ): metadata_dict["genomes"] = element.organism.name - first_columns = ["genomes", element_type ] + first_columns = ["genomes", element_type] source_to_metadata[source].append(metadata_dict) for source, metadata_list in source_to_metadata.items(): df_source = pd.DataFrame(metadata_list) - columns_order = first_columns + [col for col in df_source.columns if col not in first_columns] + columns_order = first_columns + [ + col for col in df_source.columns if col not in first_columns + ] df_source = df_source.reindex(columns_order, axis=1) tsv_file_name = f"{element_type}_metadata_from_{source}.tsv" if compress: tsv_file_name += ".gz" tsv_path = output / tsv_file_name - logging.getLogger("PPanGGOLiN").info(f"Writing {element_type} metadata from source '{source}' to '{tsv_path}'") + logging.getLogger("PPanGGOLiN").info( + f"Writing {element_type} metadata from source '{source}' to '{tsv_path}'" + ) df_source.to_csv(tsv_path, sep="\t", index=False) logging.getLogger("PPanGGOLiN").info("Finished writing metadata in TSV format.") @@ -132,9 +158,14 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - write_flat_metadata_files(pangenome, metadata_sources=args.metadata_sources, - pangenome_elements=args.pangenome_elements, - output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) + write_flat_metadata_files( + pangenome, + metadata_sources=args.metadata_sources, + pangenome_elements=args.pangenome_elements, + output=args.output, + compress=args.compress, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -145,7 +176,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("write_metadata", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "write_metadata", formatter_class=argparse.RawTextHelpFormatter + ) parser_flat(parser) return parser @@ -156,39 +189,67 @@ def parser_flat(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - pangenome_elements = {"families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"} - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") + pangenome_elements = { + "families", + "genomes", + "contigs", + "genes", + "RGPs", + "spots", + "modules", + } + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--compress", required=False, action="store_true", - help="Compress the files in .gz") - - optional.add_argument("-e", "--pangenome_elements", - required=False, - nargs="+", - choices=pangenome_elements, - default=pangenome_elements, - help="Specify pangenome elements for which to write metadata. " - "default is all element with metadata. ") - - optional.add_argument("-s", "--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") - - -if __name__ == '__main__': + optional.add_argument( + "--compress", + required=False, + action="store_true", + help="Compress the files in .gz", + ) + + optional.add_argument( + "-e", + "--pangenome_elements", + required=False, + nargs="+", + choices=pangenome_elements, + default=pangenome_elements, + help="Specify pangenome elements for which to write metadata. " + "default is all element with metadata. ", + ) + + optional.add_argument( + "-s", + "--metadata_sources", + default=None, + nargs="+", + help="Which source of metadata should be written. " + "By default all metadata sources are included.", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_flat(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 563d0c2e..5ef84f4b 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -6,7 +6,7 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging -from typing import TextIO,List, Dict, Set, Any +from typing import TextIO, List, Dict, Set, Any from pathlib import Path from typing import TextIO from importlib.metadata import distribution @@ -24,7 +24,12 @@ from ppanggolin.genome import Organism from ppanggolin.region import Region from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, flatten_nested_dict +from ppanggolin.utils import ( + write_compressed_or_not, + mk_outdir, + restricted_float, + flatten_nested_dict, +) from ppanggolin.formats.readBinaries import check_pangenome_info # global variable to store the pangenome @@ -54,13 +59,17 @@ def write_json_header(json: TextIO): orgstr.append('"' + org.name + '": {') contigstr = [] for contig in org.contigs: - contigstr.append(f'"{contig.name}": ' + '{"is_circular": ' + - ('true' if contig.is_circular else 'false') + '}') - orgstr[-1] += ', '.join(contigstr) + "}" + contigstr.append( + f'"{contig.name}": ' + + '{"is_circular": ' + + ("true" if contig.is_circular else "false") + + "}" + ) + orgstr[-1] += ", ".join(contigstr) + "}" - json.write(', '.join(orgstr) + "}") + json.write(", ".join(orgstr) + "}") # if other things are to be written such as the parameters, write them here - json.write('},') + json.write("},") def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): @@ -69,8 +78,10 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): :param gene_fam: file-like object, compressed or not :param json: file-like object, compressed or not """ - json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam)}, ' - f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' ) + json.write( + "{" + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam)}, ' + f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + ) org_dict = {} name_counts = Counter() product_counts = Counter() @@ -87,8 +98,10 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): except KeyError: org_dict[gene.organism] = {gene.contig: [gene]} - json.write(f', "name": "{name_counts.most_common(1)[0][0]}", "product": "{product_counts.most_common(1)[0][0]}", ' - f'"length": {length_counts.most_common(1)[0][0]}') + json.write( + f', "name": "{name_counts.most_common(1)[0][0]}", "product": "{product_counts.most_common(1)[0][0]}", ' + f'"length": {length_counts.most_common(1)[0][0]}' + ) json.write(', "genomes": {') orgstr = [] @@ -99,11 +112,18 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): contigstr.append('"' + contig.name + '": {') genestr = [] for gene in org_dict[org][contig]: - identifier = gene.ID if gene.local_identifier == "" else gene.local_identifier - genestr.append('"' + identifier + '": {' + f'"name": "{gene.name}", "product": "{gene.product}", ' - f'"is_fragment": {"true" if gene.is_fragment else "false"},' - f' "position": {gene.position}, "strand": "{gene.strand}",' - f' "end": {gene.stop}, "start": {gene.start}' + '}') + identifier = ( + gene.ID if gene.local_identifier == "" else gene.local_identifier + ) + genestr.append( + '"' + + identifier + + '": {' + + f'"name": "{gene.name}", "product": "{gene.product}", ' + f'"is_fragment": {"true" if gene.is_fragment else "false"},' + f' "position": {gene.position}, "strand": "{gene.strand}",' + f' "end": {gene.stop}, "start": {gene.start}' + "}" + ) contigstr[-1] += ", ".join(genestr) + "}" orgstr[-1] += ", ".join(contigstr) + "}" json.write(", ".join(orgstr) + "}}") @@ -119,9 +139,9 @@ def write_json_nodes(json: TextIO): first_fam = fam_list[0] write_json_gene_fam(first_fam, json) for family in fam_list[1:]: - json.write(', ') + json.write(", ") write_json_gene_fam(family, json) - json.write(']') + json.write("]") def write_json_edge(edge: Edge, json: TextIO): @@ -131,17 +151,25 @@ def write_json_edge(edge: Edge, json: TextIO): :param json: file-like object, compressed or not """ json.write("{") - json.write(f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"') + json.write( + f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"' + ) json.write(', "genomes": {') orgstr = [] for org in edge.organisms: orgstr.append('"' + org.name + '": [') genepairstr = [] for gene_pair in edge.get_organism_genes_pairs(org): - genepairstr.append('{"source": "' + gene_pair[0].ID + '", "target": "' + gene_pair[ - 1].ID + f'", "length": {gene_pair[0].start - gene_pair[1].stop}' + '}') - orgstr[-1] += ', '.join(genepairstr) + ']' - json.write(', '.join(orgstr) + "}}") + genepairstr.append( + '{"source": "' + + gene_pair[0].ID + + '", "target": "' + + gene_pair[1].ID + + f'", "length": {gene_pair[0].start - gene_pair[1].stop}' + + "}" + ) + orgstr[-1] += ", ".join(genepairstr) + "]" + json.write(", ".join(orgstr) + "}}") def write_json_edges(json): @@ -155,7 +183,7 @@ def write_json_edges(json): for edge in edge_list[1:]: json.write(", ") write_json_edge(edge, json) - json.write(']') + json.write("]") def write_json(output: Path, compress: bool = False): @@ -164,14 +192,18 @@ def write_json(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger("PPanGGOLiN").info("Writing the json file for the pangenome graph...") + logging.getLogger("PPanGGOLiN").info( + "Writing the json file for the pangenome graph..." + ) outname = output / "pangenomeGraph.json" with write_compressed_or_not(outname, compress) as json: write_json_header(json) write_json_nodes(json) write_json_edges(json) json.write("}") - logging.getLogger("PPanGGOLiN").info(f"Done writing the json file : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info( + f"Done writing the json file : '{outname.as_posix()}'" + ) def write_gexf_header(gexf: TextIO, light: bool = True): @@ -183,8 +215,10 @@ def write_gexf_header(gexf: TextIO, light: bool = True): index = None if not light: index = pan.get_org_index() # has been computed already - gexf.write('\n\n') # TODO update link + gexf.write( + '\n\n' + ) # TODO update link gexf.write(' \n') gexf.write(' \n') gexf.write(' \n') @@ -205,24 +239,37 @@ def write_gexf_header(gexf: TextIO, light: bool = True): gexf.write(' \n') shift = 14 - source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} + source_fields = { + m.source: m.fields + for f in pan.gene_families + if len(list(f.metadata)) > 0 + for m in f.metadata + } for source_metadata_families in pan.metadata_sources("families"): for field in source_fields[source_metadata_families]: - gexf.write(f' \n') + gexf.write( + f' \n' + ) shift += 1 if not light: for org, org_idx in index.items(): - gexf.write(f' \n') - gexf.write(' \n') + gexf.write( + f' \n' + ) + gexf.write(" \n") gexf.write(' \n') gexf.write(' \n') if not light: for org, org_idx in index.items(): - gexf.write(f' \n') - gexf.write(' \n') - gexf.write(' \n') - gexf.write(f' PPanGGOLiN {distribution("ppanggolin").version}\n') - gexf.write(' \n') + gexf.write( + f' \n' + ) + gexf.write(" \n") + gexf.write(" \n") + gexf.write( + f' PPanGGOLiN {distribution("ppanggolin").version}\n' + ) + gexf.write(" \n") def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): @@ -233,9 +280,12 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): :param soft_core: Soft core threshold to use """ index = None - gexf.write(' \n') - colors = {"persistent": 'a="0" b="7" g="165" r="247"', 'shell': 'a="0" b="96" g="216" r="0"', - 'cloud': 'a="0" b="255" g="222" r="121"'} + gexf.write(" \n") + colors = { + "persistent": 'a="0" b="7" g="165" r="247"', + "shell": 'a="0" b="96" g="216" r="0"', + "cloud": 'a="0" b="255" g="222" r="121"', + } if not light: index = pan.get_org_index() @@ -248,28 +298,42 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): lis = [] for gene in fam.genes: name[gene.name] += 1 - product[gene.product.replace('&', 'and')] += 1 + product[gene.product.replace("&", "and")] += 1 gtype[gene.type] += 1 lis.append(gene.stop - gene.start) gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f" \n") gexf.write(f' \n') - gexf.write(' \n') + gexf.write(" \n") gexf.write(f' \n') - gexf.write(f' \n') - gexf.write(f' \n') - gexf.write(f' \n') + gexf.write( + f' \n' + ) + gexf.write( + f' \n' + ) + gexf.write( + f' \n' + ) gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') - gexf.write(f' = (pan.number_of_organisms * soft_core) else "soft_accessory"}"' - f' />\n') - gexf.write(f' \n') + gexf.write( + f' \n' + ) + gexf.write( + f' = (pan.number_of_organisms * soft_core) else "soft_accessory"}"' + f" />\n" + ) + gexf.write( + f' \n' + ) gexf.write(f' \n') - gexf.write(f' \n') + gexf.write( + f' \n' + ) if pan.number_of_spots > 0: str_spot = "|".join([str(s) for s in list(fam.spots)]) gexf.write(f' \n') @@ -277,26 +341,34 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): str_module = str(fam.module) if fam.has_module else "" gexf.write(f' \n') shift = 14 - source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} - for source_metadata_families in pan_metadata_sources: + source_fields = { + m.source: m.fields + for f in pan.gene_families + if len(list(f.metadata)) > 0 + for m in f.metadata + } + for source_metadata_families in pan_metadata_sources: to_concat = defaultdict(list) for m in fam.metadata: if m.source == source_metadata_families: for field in m.fields: to_concat[field].append(str(getattr(m, field))) for field in source_fields[source_metadata_families]: - concatenated_fields = '|'.join(to_concat[field]) - gexf.write(f' \n') + concatenated_fields = "|".join(to_concat[field]) + gexf.write( + f' \n' + ) shift += 1 if not light: for org, genes in fam.get_org_dict().items(): gexf.write( f' \n') - gexf.write(' \n') - gexf.write(' \n') - gexf.write(' \n') + f'value="{"|".join([gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in genes])}" />\n' + ) + gexf.write(" \n") + gexf.write(" \n") + gexf.write(" \n") def write_gexf_edges(gexf: TextIO, light: bool = True): @@ -305,24 +377,28 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): :param gexf: file-like object, compressed or not :param light: save the light version of the pangenome graph """ - gexf.write(' \n') + gexf.write(" \n") edgeids = 0 index = pan.get_org_index() shift = 14 metadata_count = len(pan.metadata_sources("families")) for edge in pan.edges: - gexf.write(f' \n') + gexf.write( + f' \n' + ) gexf.write(f' \n') - gexf.write(' \n') + gexf.write(" \n") gexf.write(f' \n') if not light: for org, genes_pairs in edge.get_organisms_dict().items(): - gexf.write(f' \n') - gexf.write(' \n') - gexf.write(' \n') + gexf.write( + f' \n' + ) + gexf.write(" \n") + gexf.write(" \n") edgeids += 1 - gexf.write(' \n') + gexf.write(" \n") def write_gexf_end(gexf: TextIO): @@ -342,12 +418,16 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): :param compress: Compress the file in .gz """ txt = "Writing the " - txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." + txt += ( + "light gexf file for the pangenome graph..." + if light + else "gexf file for the pangenome graph..." + ) logging.getLogger("PPanGGOLiN").info(txt) outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf" with write_compressed_or_not(outname, compress) as gexf: - graph_type = 'ligth gexf' if light else 'gexf' + graph_type = "ligth gexf" if light else "gexf" logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} header...") write_gexf_header(gexf, light) logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} nodes...") @@ -356,10 +436,18 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): write_gexf_edges(gexf, light) logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} ends...") write_gexf_end(gexf) - logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{gexf.name}'") - - -def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): + logging.getLogger("PPanGGOLiN").info( + f"Done writing the gexf file : '{gexf.name}'" + ) + + +def write_matrix( + output: Path, + sep: str = ",", + ext: str = "csv", + compress: bool = False, + gene_names: bool = False, +): """ Write a csv file format as used by Roary, among others. The alternative gene ID will be the partition, if there is one @@ -377,25 +465,36 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool index_org = {} default_dat = [] for index, org in enumerate(pan.organisms): - default_dat.append('0') + default_dat.append("0") index_org[org] = index - matrix.write(sep.join(['"Gene"', # 1 - '"Non-unique Gene name"', # 2 - '"Annotation"', # 3 - '"No. isolates"', # 4 - '"No. sequences"', # 5 - '"Avg sequences per isolate"', # 6 - '"Accessory Fragment"', # 7 - '"Genome Fragment"', # 8 - '"Order within Fragment"', # 9 - '"Accessory Order with Fragment"', # 10 - '"QC"', # 11 - '"Min group size nuc"', # 12 - '"Max group size nuc"', # 13 - '"Avg group size nuc"'] # 14 - + ['"' + str(org) + '"' for org in pan.organisms]) + "\n") # 15 - default_genes = ['""'] * pan.number_of_organisms if gene_names else ["0"] * pan.number_of_organisms + matrix.write( + sep.join( + [ + '"Gene"', # 1 + '"Non-unique Gene name"', # 2 + '"Annotation"', # 3 + '"No. isolates"', # 4 + '"No. sequences"', # 5 + '"Avg sequences per isolate"', # 6 + '"Accessory Fragment"', # 7 + '"Genome Fragment"', # 8 + '"Order within Fragment"', # 9 + '"Accessory Order with Fragment"', # 10 + '"QC"', # 11 + '"Min group size nuc"', # 12 + '"Max group size nuc"', # 13 + '"Avg group size nuc"', + ] # 14 + + ['"' + str(org) + '"' for org in pan.organisms] + ) + + "\n" + ) # 15 + default_genes = ( + ['""'] * pan.number_of_organisms + if gene_names + else ["0"] * pan.number_of_organisms + ) org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() @@ -403,8 +502,11 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool genenames = Counter() product = Counter() for org, gene_list in fam.get_org_dict().items(): - genes[org_index[org]] = " ".join(['"' + str(gene) + - '"' for gene in gene_list]) if gene_names else str(len(gene_list)) + genes[org_index[org]] = ( + " ".join(['"' + str(gene) + '"' for gene in gene_list]) + if gene_names + else str(len(gene_list)) + ) for gene in gene_list: lis.append(gene.stop - gene.start) product[gene.product] += 1 @@ -416,22 +518,33 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool alt = str(product.most_common(1)[0][0]) lis = [gene.stop - gene.start for gene in fam.genes] - matrix.write(sep.join(['"' + fam.name + '"', # 1 - '"' + alt + '"', # 2 - '"' + str(product.most_common(1)[0][0]) + '"', # 3 - '"' + str(fam.number_of_organisms) + '"', # 4 - '"' + str(len(fam)) + '"', # 5 - '"' + str(round(len(fam) / fam.number_of_organisms, 2)) + '"', # 6 - '"NA"', # 7 - '"NA"', # 8 - '""', # 9 - '""', # 10 - '""', # 11 - '"' + str(min(lis)) + '"', # 12 - '"' + str(max(lis)) + '"', # 13 - '"' + str(round(sum(lis) / len(lis), 2)) + '"'] # 14 - + genes) + "\n") # 15 - logging.getLogger("PPanGGOLiN").info(f"Done writing the matrix : '{outname.as_posix()}'") + matrix.write( + sep.join( + [ + '"' + fam.name + '"', # 1 + '"' + alt + '"', # 2 + '"' + str(product.most_common(1)[0][0]) + '"', # 3 + '"' + str(fam.number_of_organisms) + '"', # 4 + '"' + str(len(fam)) + '"', # 5 + '"' + + str(round(len(fam) / fam.number_of_organisms, 2)) + + '"', # 6 + '"NA"', # 7 + '"NA"', # 8 + '""', # 9 + '""', # 10 + '""', # 11 + '"' + str(min(lis)) + '"', # 12 + '"' + str(max(lis)) + '"', # 13 + '"' + str(round(sum(lis) / len(lis), 2)) + '"', + ] # 14 + + genes + ) + + "\n" + ) # 15 + logging.getLogger("PPanGGOLiN").info( + f"Done writing the matrix : '{outname.as_posix()}'" + ) def write_gene_presence_absence(output: Path, compress: bool = False): @@ -447,11 +560,12 @@ def write_gene_presence_absence(output: Path, compress: bool = False): index_org = {} default_dat = [] for index, org in enumerate(pan.organisms): - default_dat.append('0') + default_dat.append("0") index_org[org] = index - matrix.write('\t'.join(['Gene'] + # 14 - [str(org) for org in pan.organisms]) + "\n") # 15 + matrix.write( + "\t".join(["Gene"] + [str(org) for org in pan.organisms]) + "\n" # 14 + ) # 15 default_genes = ["0"] * pan.number_of_organisms org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: @@ -459,19 +573,22 @@ def write_gene_presence_absence(output: Path, compress: bool = False): for org in fam.organisms: genes[org_index[org]] = "1" - matrix.write('\t'.join([fam.name] # 14 - + genes) + "\n") # 15 - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") - - -def summarize_genome(organism: Organism, - pangenome_persistent_count: int, - pangenome_persistent_single_copy_families: Set[GeneFamily], - soft_core_families:Set[GeneFamily], - exact_core_families:Set[GeneFamily], - rgp_count: int, - spot_count: int, - module_count: int) -> Dict[str, any]: + matrix.write("\t".join([fam.name] + genes) + "\n") # 14 # 15 + logging.getLogger("PPanGGOLiN").info( + f"Done writing the gene presence absence file : '{outname.as_posix()}'" + ) + + +def summarize_genome( + organism: Organism, + pangenome_persistent_count: int, + pangenome_persistent_single_copy_families: Set[GeneFamily], + soft_core_families: Set[GeneFamily], + exact_core_families: Set[GeneFamily], + rgp_count: int, + spot_count: int, + module_count: int, +) -> Dict[str, any]: """ Summarizes genomic information of an organism. @@ -489,39 +606,51 @@ def summarize_genome(organism: Organism, partition_to_genes = organism.group_genes_by_partition() - persistent_gene_count = len(partition_to_genes['persistent']) - shell_gene_count = len(partition_to_genes['shell']) - cloud_gene_count = len(partition_to_genes['cloud']) + persistent_gene_count = len(partition_to_genes["persistent"]) + shell_gene_count = len(partition_to_genes["shell"]) + cloud_gene_count = len(partition_to_genes["cloud"]) gene_count = persistent_gene_count + shell_gene_count + cloud_gene_count - persistent_family_count = len({g.family for g in partition_to_genes['persistent']}) - shell_family_count = len({g.family for g in partition_to_genes['shell']}) - cloud_family_count = len({g.family for g in partition_to_genes['cloud']}) + persistent_family_count = len({g.family for g in partition_to_genes["persistent"]}) + shell_family_count = len({g.family for g in partition_to_genes["shell"]}) + cloud_family_count = len({g.family for g in partition_to_genes["cloud"]}) - persistent_fragmented_genes = {g for g in partition_to_genes['persistent'] if g.is_fragment} - shell_fragmented_genes = {g for g in partition_to_genes['shell'] if g.is_fragment} - cloud_fragmented_genes = {g for g in partition_to_genes['cloud'] if g.is_fragment} + persistent_fragmented_genes = { + g for g in partition_to_genes["persistent"] if g.is_fragment + } + shell_fragmented_genes = {g for g in partition_to_genes["shell"] if g.is_fragment} + cloud_fragmented_genes = {g for g in partition_to_genes["cloud"] if g.is_fragment} persistent_fragmented_genes_count = len(persistent_fragmented_genes) shell_fragmented_genes_count = len(shell_fragmented_genes) cloud_fragmented_genes_count = len(cloud_fragmented_genes) - fragmented_genes_count = persistent_fragmented_genes_count + shell_fragmented_genes_count + cloud_fragmented_genes_count + fragmented_genes_count = ( + persistent_fragmented_genes_count + + shell_fragmented_genes_count + + cloud_fragmented_genes_count + ) - persistent_fragmented_family_count = len({g.family for g in persistent_fragmented_genes}) + persistent_fragmented_family_count = len( + {g.family for g in persistent_fragmented_genes} + ) shell_fragmented_family_count = len({g.family for g in shell_fragmented_genes}) cloud_fragmented_family_count = len({g.family for g in cloud_fragmented_genes}) - families_with_fragment_count = persistent_fragmented_family_count + shell_fragmented_family_count + cloud_fragmented_family_count + families_with_fragment_count = ( + persistent_fragmented_family_count + + shell_fragmented_family_count + + cloud_fragmented_family_count + ) families_count = persistent_family_count + shell_family_count + cloud_family_count - completeness = "NA" if pangenome_persistent_count > 0: - completeness = round((persistent_family_count / pangenome_persistent_count) * 100, 2) - + completeness = round( + (persistent_family_count / pangenome_persistent_count) * 100, 2 + ) orgs_families_in_multicopy_by_part = defaultdict(set) for family in organism.families: @@ -529,24 +658,45 @@ def summarize_genome(organism: Organism, # the family has more than one gene in the genome orgs_families_in_multicopy_by_part[family.named_partition].add(family) - - orgs_persistent_families_in_multicopy_count = len(orgs_families_in_multicopy_by_part['persistent']) - orgs_shell_families_in_multicopy_count = len(orgs_families_in_multicopy_by_part['shell']) - orgs_cloud_families_in_multicopy_count = len(orgs_families_in_multicopy_by_part['cloud']) - - orgs_families_in_multicopy_count = orgs_persistent_families_in_multicopy_count + orgs_shell_families_in_multicopy_count + orgs_cloud_families_in_multicopy_count - - single_copy_families_found_in_multicopy_count = len(pangenome_persistent_single_copy_families & orgs_families_in_multicopy_by_part['persistent']) - contamination = 'NA' + orgs_persistent_families_in_multicopy_count = len( + orgs_families_in_multicopy_by_part["persistent"] + ) + orgs_shell_families_in_multicopy_count = len( + orgs_families_in_multicopy_by_part["shell"] + ) + orgs_cloud_families_in_multicopy_count = len( + orgs_families_in_multicopy_by_part["cloud"] + ) + + orgs_families_in_multicopy_count = ( + orgs_persistent_families_in_multicopy_count + + orgs_shell_families_in_multicopy_count + + orgs_cloud_families_in_multicopy_count + ) + + single_copy_families_found_in_multicopy_count = len( + pangenome_persistent_single_copy_families + & orgs_families_in_multicopy_by_part["persistent"] + ) + contamination = "NA" if len(pangenome_persistent_single_copy_families) > 0: - contamination = round(100 * single_copy_families_found_in_multicopy_count / len(pangenome_persistent_single_copy_families) , 2) - - fragmentation = 'NA' + contamination = round( + 100 + * single_copy_families_found_in_multicopy_count + / len(pangenome_persistent_single_copy_families), + 2, + ) + + fragmentation = "NA" if families_count > 0: fragmentation = round(100.0 * families_with_fragment_count / families_count, 2) - soft_core_genes = {gene for gene in organism.genes if gene.family in soft_core_families} - exact_core_genes = {gene for gene in organism.genes if gene.family in exact_core_families} + soft_core_genes = { + gene for gene in organism.genes if gene.family in soft_core_families + } + exact_core_genes = { + gene for gene in organism.genes if gene.family in exact_core_families + } soft_core_families_count = len({gene.family for gene in soft_core_genes}) exact_core_families_count = len({gene.family for gene in exact_core_genes}) @@ -561,41 +711,50 @@ def summarize_genome(organism: Organism, "Genes": gene_count, "Fragmented_genes": fragmented_genes_count, "Families": families_count, - "Families_with_fragments":families_with_fragment_count, + "Families_with_fragments": families_with_fragment_count, "Families_in_multicopy": orgs_families_in_multicopy_count, - "Soft_core": {"families":soft_core_families_count, - "genes": len(soft_core_genes) }, - 'Exact_core':{"families":exact_core_families_count, - "genes": len(exact_core_genes) }, + "Soft_core": { + "families": soft_core_families_count, + "genes": len(soft_core_genes), + }, + "Exact_core": { + "families": exact_core_families_count, + "genes": len(exact_core_genes), + }, "Persistent": { "genes": persistent_gene_count, "fragmented_genes": persistent_fragmented_genes_count, "families": persistent_family_count, "families_with_fragments": persistent_fragmented_family_count, - "families_in_multicopy": orgs_persistent_families_in_multicopy_count}, + "families_in_multicopy": orgs_persistent_families_in_multicopy_count, + }, "Shell": { "genes": shell_gene_count, "fragmented_genes": shell_fragmented_genes_count, "families": shell_family_count, "families_with_fragments": shell_fragmented_family_count, - "families_in_multicopy": orgs_shell_families_in_multicopy_count}, + "families_in_multicopy": orgs_shell_families_in_multicopy_count, + }, "Cloud": { "genes": cloud_gene_count, "fragmented_genes": cloud_fragmented_genes_count, "families": cloud_family_count, "families_with_fragments": cloud_fragmented_family_count, - "families_in_multicopy": orgs_cloud_families_in_multicopy_count}, + "families_in_multicopy": orgs_cloud_families_in_multicopy_count, + }, "Completeness": completeness, "Contamination": contamination, "Fragmentation": fragmentation, "RGPs": rgp_count, "Spots": spot_count, - "Modules": module_count + "Modules": module_count, } return summary_info -def write_persistent_duplication_statistics(pangenome: Pangenome, output: Path, dup_margin: float, compress: bool) -> Set[GeneFamily]: +def write_persistent_duplication_statistics( + pangenome: Pangenome, output: Path, dup_margin: float, compress: bool +) -> Set[GeneFamily]: """ Writes statistics on persistent duplications in gene families to a specified output file. @@ -606,13 +765,22 @@ def write_persistent_duplication_statistics(pangenome: Pangenome, output: Path, :return : """ - logging.getLogger("PPanGGOLiN").info("Writing statistics on persistent duplication...") + logging.getLogger("PPanGGOLiN").info( + "Writing statistics on persistent duplication..." + ) single_copy_persistent = set() # Could use bitarrays if speed is needed - with write_compressed_or_not(output / "mean_persistent_duplication.tsv", compress) as outfile: - fieldnames = ["persistent_family", "duplication_ratio", "mean_presence", "is_single_copy_marker"] - writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t') + with write_compressed_or_not( + output / "mean_persistent_duplication.tsv", compress + ) as outfile: + fieldnames = [ + "persistent_family", + "duplication_ratio", + "mean_presence", + "is_single_copy_marker", + ] + writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter="\t") writer.writeheader() for fam in pangenome.gene_families: @@ -624,18 +792,26 @@ def write_persistent_duplication_statistics(pangenome: Pangenome, output: Path, if is_scm: single_copy_persistent.add(fam) - writer.writerow({ - "persistent_family": fam.name, - "duplication_ratio": round(dup_ratio, 3), - "mean_presence": round(mean_pres, 3), - "is_single_copy_marker": is_scm - }) + writer.writerow( + { + "persistent_family": fam.name, + "duplication_ratio": round(dup_ratio, 3), + "mean_presence": round(mean_pres, 3), + "is_single_copy_marker": is_scm, + } + ) logging.getLogger("PPanGGOLiN").info("Done writing stats on persistent duplication") return single_copy_persistent -def write_summaries_in_tsv(summaries: List[Dict[str, Any]], output_file: Path, - dup_margin:float, soft_core:float, compress:bool = False): + +def write_summaries_in_tsv( + summaries: List[Dict[str, Any]], + output_file: Path, + dup_margin: float, + soft_core: float, + compress: bool = False, +): """ Writes summaries of organisms stored in a dictionary into a Tab-Separated Values (TSV) file. @@ -656,9 +832,15 @@ def write_summaries_in_tsv(summaries: List[Dict[str, Any]], output_file: Path, flout.write(f"#duplication_margin={round(dup_margin, 3)}\n") # Write the DataFrame to a TSV file - df_summary.to_csv(flout, sep='\t', index=False) + df_summary.to_csv(flout, sep="\t", index=False) + -def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): +def write_stats( + output: Path, + soft_core: float = 0.95, + dup_margin: float = 0.05, + compress: bool = False, +): """ Write pangenome statistics for each genomes @@ -671,43 +853,60 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, """ logging.getLogger("PPanGGOLiN").info("Writing pangenome statistics...") - single_copy_persistent = write_persistent_duplication_statistics(pangenome=pan, output=output, - dup_margin=dup_margin, compress=compress) + single_copy_persistent = write_persistent_duplication_statistics( + pangenome=pan, output=output, dup_margin=dup_margin, compress=compress + ) - logging.getLogger("PPanGGOLiN").info("Writing genome per genome statistics (completeness and counts)...") + logging.getLogger("PPanGGOLiN").info( + "Writing genome per genome statistics (completeness and counts)..." + ) soft_core_families = pan.soft_core_families(soft_core) exact_core_families = pan.exact_core_families() - pangenome_persistent_single_copy_families = pan.get_single_copy_persistent_families(dup_margin = dup_margin, exclude_fragments=True) + pangenome_persistent_single_copy_families = pan.get_single_copy_persistent_families( + dup_margin=dup_margin, exclude_fragments=True + ) assert pangenome_persistent_single_copy_families == single_copy_persistent - pangenome_persistent_count = len([fam for fam in pan.gene_families if fam.named_partition == "persistent"]) + pangenome_persistent_count = len( + [fam for fam in pan.gene_families if fam.named_partition == "persistent"] + ) summaries = [] for organism in pan.organisms: - - rgp_count = organism.number_of_regions if pan.status["predictedRGP"] != "No" else None + rgp_count = ( + organism.number_of_regions if pan.status["predictedRGP"] != "No" else None + ) spot_count = organism.number_of_spots if pan.status["spots"] != "No" else None - module_count = organism.number_of_modules if pan.status["modules"] != "No" else None - - organism_summary = summarize_genome(organism=organism, - pangenome_persistent_count=pangenome_persistent_count, - pangenome_persistent_single_copy_families=pangenome_persistent_single_copy_families, - soft_core_families=soft_core_families, - exact_core_families=exact_core_families, - rgp_count=rgp_count, - spot_count=spot_count, - module_count=module_count) + module_count = ( + organism.number_of_modules if pan.status["modules"] != "No" else None + ) + + organism_summary = summarize_genome( + organism=organism, + pangenome_persistent_count=pangenome_persistent_count, + pangenome_persistent_single_copy_families=pangenome_persistent_single_copy_families, + soft_core_families=soft_core_families, + exact_core_families=exact_core_families, + rgp_count=rgp_count, + spot_count=spot_count, + module_count=module_count, + ) summaries.append(organism_summary) - write_summaries_in_tsv(summaries, output_file= output / "genomes_statistics.tsv", dup_margin=dup_margin, soft_core=soft_core, compress=compress) + write_summaries_in_tsv( + summaries, + output_file=output / "genomes_statistics.tsv", + dup_margin=dup_margin, + soft_core=soft_core, + compress=compress, + ) logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") - def write_partitions(output: Path, soft_core: float = 0.95): """ Write the list of gene families for each partition @@ -715,14 +914,23 @@ def write_partitions(output: Path, soft_core: float = 0.95): :param output: Path to output directory :param soft_core: Soft core threshold to use """ - logging.getLogger("PPanGGOLiN").info("Writing the list of gene families for each partition ...") + logging.getLogger("PPanGGOLiN").info( + "Writing the list of gene families for each partition ..." + ) if not os.path.exists(output / "partitions"): os.makedirs(output / "partitions") part_sets = defaultdict(set) # initializing key, value pairs so that files exist even if they are empty - for needed_key in ["soft_core", "exact_core", "exact_accessory", - "soft_accessory", "persistent", "shell", "cloud"]: + for needed_key in [ + "soft_core", + "exact_core", + "exact_accessory", + "soft_accessory", + "persistent", + "shell", + "cloud", + ]: part_sets[needed_key] = set() for fam in pan.gene_families: @@ -745,12 +953,16 @@ def write_partitions(output: Path, soft_core: float = 0.95): for key, val in part_sets.items(): with open(output / f"partitions/{key}.txt", "w") as curr_key_file: if len(val) > 0: - curr_key_file.write('\n'.join(val) + "\n") + curr_key_file.write("\n".join(val) + "\n") - logging.getLogger("PPanGGOLiN").info("Done writing the list of gene families for each partition") + logging.getLogger("PPanGGOLiN").info( + "Done writing the list of gene families for each partition" + ) -def write_gene_families_tsv(output: Path, compress: bool = False, disable_bar: bool = False): +def write_gene_families_tsv( + output: Path, compress: bool = False, disable_bar: bool = False +): """ Write the file providing the association between genes and gene families @@ -759,22 +971,47 @@ def write_gene_families_tsv(output: Path, compress: bool = False, disable_bar: b :param disable_bar: Flag to disable progress bar """ logging.getLogger("PPanGGOLiN").info( - "Writing the file providing the association between genes and gene families...") + "Writing the file providing the association between genes and gene families..." + ) outname = output / f"gene_families.tsv{'.gz' if compress else ''}" out_list = [] - for fam in tqdm(pan.gene_families, total=pan.number_of_gene_families, unit='family', disable=disable_bar): + for fam in tqdm( + pan.gene_families, + total=pan.number_of_gene_families, + unit="family", + disable=disable_bar, + ): for gene in fam.genes: - out_list.append([fam.name, gene.ID, gene.local_identifier, "F" if gene.is_fragment else ""]) + out_list.append( + [ + fam.name, + gene.ID, + gene.local_identifier, + "F" if gene.is_fragment else "", + ] + ) out_df = pd.DataFrame(out_list, columns=["GeneFam", "Gene", "local_id", "is_frag"]) - out_df["count"] = out_df.groupby("GeneFam")["GeneFam"].transform('count') - out_df = out_df.sort_values(by=["count", "Gene", "local_id", "is_frag"], ascending=[False, True, True, True]) - out_df = out_df.drop(columns=['count']) - out_df.to_csv(outname, sep="\t", index=False, header=False, compression='infer' if compress else None) - logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and " - f"gene families: '{outname}'") + out_df["count"] = out_df.groupby("GeneFam")["GeneFam"].transform("count") + out_df = out_df.sort_values( + by=["count", "Gene", "local_id", "is_frag"], ascending=[False, True, True, True] + ) + out_df = out_df.drop(columns=["count"]) + out_df.to_csv( + outname, + sep="\t", + index=False, + header=False, + compression="infer" if compress else None, + ) + logging.getLogger("PPanGGOLiN").info( + "Done writing the file providing the association between genes and " + f"gene families: '{outname}'" + ) -def summarize_spots(spots: set, output: Path, compress: bool = False, file_name="summarize_spots.tsv"): +def summarize_spots( + spots: set, output: Path, compress: bool = False, file_name="summarize_spots.tsv" +): """ Write a file providing summarize information about hotspots @@ -788,12 +1025,13 @@ def r_and_s(value: float): """rounds to dp figures and returns a str of the provided value""" return str(round(value, 3)) if isinstance(value, float) else str(value) - file_path = output / file_name with write_compressed_or_not(file_path, compress) as fout: - fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" - "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n") + fout.write( + "spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" + "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n" + ) for spot in sorted(spots, key=lambda x: len(x), reverse=True): tot_fams = set() len_uniq_content = len(spot.get_uniq_content()) @@ -805,8 +1043,24 @@ def r_and_s(value: float): stdev_size = stdev(size_list) if len(size_list) > 1 else 0 max_size = max(size_list) min_size = min(size_list) - fout.write("\t".join(map(r_and_s, [f"{str(spot)}", len(spot), len(tot_fams), len_uniq_content, - mean_size, stdev_size, max_size, min_size])) + "\n") + fout.write( + "\t".join( + map( + r_and_s, + [ + f"{str(spot)}", + len(spot), + len(tot_fams), + len_uniq_content, + mean_size, + stdev_size, + max_size, + min_size, + ], + ) + ) + + "\n" + ) logging.getLogger("PPanGGOLiN").info(f"Done writing spots in '{file_path}'") @@ -821,9 +1075,7 @@ def write_regions(output: Path, compress: bool = False): write_rgp_table(pan.regions, output, compress) - -def write_rgp_table(regions: Set[Region], - output: Path, compress: bool = False): +def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False): """ Write the file providing information about regions of genomic plasticity. @@ -833,14 +1085,25 @@ def write_rgp_table(regions: Set[Region], """ fname = output / "regions_of_genomic_plasticity.tsv" with write_compressed_or_not(fname, compress) as tab: - fieldnames = ["region", "genome", "contig", "genes", "first_gene", "last_gene", - "start", "stop", "length", "coordinates", "contigBorder", "wholeContig"] - - writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') + fieldnames = [ + "region", + "genome", + "contig", + "genes", + "first_gene", + "last_gene", + "start", + "stop", + "length", + "coordinates", + "contigBorder", + "wholeContig", + ] + + writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter="\t") writer.writeheader() - regions = sorted(regions, key=lambda x: ( - x.organism.name, x.contig.name, x.ID)) + regions = sorted(regions, key=lambda x: (x.organism.name, x.contig.name, x.ID)) for region in regions: row = { @@ -855,7 +1118,7 @@ def write_rgp_table(regions: Set[Region], "length": region.length, "coordinates": region.string_coordinates(), "contigBorder": region.is_contig_border, - "wholeContig": region.is_whole_contig + "wholeContig": region.is_whole_contig, } writer.writerow(row) @@ -875,7 +1138,7 @@ def spot2rgp(spots: set, output: Path, compress: bool = False): def write_spots(output: Path, compress: bool = False): - """ Write tsv files providing spots information and association with RGP + """Write tsv files providing spots information and association with RGP :param output: Path to output directory :param compress: Compress the file in .gz @@ -905,7 +1168,9 @@ def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False all_fams |= set(border[1]) fout.write(f"{spot.ID}\t{c}\t{famstring1}\t{famstring2}\n") - with write_compressed_or_not(output / "border_protein_genes.fasta", compress) as fout: + with write_compressed_or_not( + output / "border_protein_genes.fasta", compress + ) as fout: for fam in all_fams: fout.write(f">{fam.name}\n") fout.write(f"{fam.sequence}\n") @@ -920,7 +1185,9 @@ def write_module_summary(output: Path, compress: bool = False): """ logging.getLogger("PPanGGOLiN").info("Writing functional modules summary...") with write_compressed_or_not(output / "modules_summary.tsv", compress) as fout: - fout.write("module_id\tnb_families\tnb_genomes\tpartition\tmean_number_of_occurrence\n") + fout.write( + "module_id\tnb_families\tnb_genomes\tpartition\tmean_number_of_occurrence\n" + ) for mod in pan.modules: org_dict = defaultdict(set) partition_counter = Counter() @@ -930,10 +1197,13 @@ def write_module_summary(output: Path, compress: bool = False): org_dict[gene.organism].add(gene) fout.write( f"module_{mod.ID}\t{len(mod)}\t{len(org_dict)}\t{partition_counter.most_common(1)[0][0]}\t" - f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod), 3)}\n") + f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod), 3)}\n" + ) fout.close() - logging.getLogger("PPanGGOLiN").info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") + logging.getLogger("PPanGGOLiN").info( + f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'" + ) def write_modules(output: Path, compress: bool = False): @@ -951,7 +1221,8 @@ def write_modules(output: Path, compress: bool = False): fout.close() logging.getLogger("PPanGGOLiN").info( - f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") + f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'" + ) def write_org_modules(output: Path, compress: bool = False): @@ -972,7 +1243,8 @@ def write_org_modules(output: Path, compress: bool = False): fout.write(f"module_{mod.ID}\t{org.name}\t{completion:.2}\n") fout.close() logging.getLogger("PPanGGOLiN").info( - f"Done writing modules to genomes associations to: '{output.as_posix() + '/modules_in_genomes.tsv'}'") + f"Done writing modules to genomes associations to: '{output.as_posix() + '/modules_in_genomes.tsv'}'" + ) def write_spot_modules(output: Path, compress: bool = False): @@ -1000,7 +1272,8 @@ def write_spot_modules(output: Path, compress: bool = False): fout.write(f"module_{module.ID}\tspot_{spot.ID}\n") logging.getLogger("PPanGGOLiN").info( - f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") + f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}" + ) def write_rgp_modules(output: Path, compress: bool = False): @@ -1040,21 +1313,39 @@ def write_rgp_modules(output: Path, compress: bool = False): myspot = region2spot.get(region) if myspot is not None: spot_list.add(region2spot[region]) - lists.write(f"{regions[0].name}\t{len(spot_list)}\t{','.join(['module_' + str(mod.ID) for mod in mod_list])}\t" - f"{','.join([reg.name for reg in regions])}\n") + lists.write( + f"{regions[0].name}\t{len(spot_list)}\t{','.join(['module_' + str(mod.ID) for mod in mod_list])}\t" + f"{','.join([reg.name for reg in regions])}\n" + ) lists.close() logging.getLogger("PPanGGOLiN").info( - f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") - - -def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, - dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, - light_gexf: bool = False, - stats: bool = False, json: bool = False, - partitions: bool = False, families_tsv: bool = False, regions: bool = False, spots: bool = False, - borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, - disable_bar: bool = False): + f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}" + ) + + +def write_pangenome_flat_files( + pangenome: Pangenome, + output: Path, + cpu: int = 1, + soft_core: float = 0.95, + dup_margin: float = 0.05, + csv: bool = False, + gene_pa: bool = False, + gexf: bool = False, + light_gexf: bool = False, + stats: bool = False, + json: bool = False, + partitions: bool = False, + families_tsv: bool = False, + regions: bool = False, + spots: bool = False, + borders: bool = False, + modules: bool = False, + spot_modules: bool = False, + compress: bool = False, + disable_bar: bool = False, +): """ Main function to write flat files from pangenome @@ -1080,8 +1371,24 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, :param disable_bar: Disable progress bar """ # TODO Add force parameter to check if output already exist - if not any(x for x in [csv, gene_pa, gexf, light_gexf, stats, json, partitions, spots, borders, - families_tsv, modules, spot_modules, regions]): + if not any( + x + for x in [ + csv, + gene_pa, + gexf, + light_gexf, + stats, + json, + partitions, + spots, + borders, + families_tsv, + modules, + spot_modules, + regions, + ] + ): raise Exception("You did not indicate what file you wanted to write.") processes = [] @@ -1099,8 +1406,21 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, pan = pangenome - if csv or gene_pa or gexf or light_gexf or stats or json or partitions or spots or \ - families_tsv or borders or modules or spot_modules or regions: + if ( + csv + or gene_pa + or gexf + or light_gexf + or stats + or json + or partitions + or spots + or families_tsv + or borders + or modules + or spot_modules + or regions + ): needAnnotations = True needFamilies = True if stats or partitions or spots or borders: @@ -1122,41 +1442,81 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, if modules or spot_modules: # or projection: needModules = True - check_pangenome_info(pangenome, need_annotations=needAnnotations, need_families=needFamilies, need_graph=needGraph, - need_partitions=needPartitions, need_rgp=needRegions, need_spots=needSpots, - need_modules=needModules, need_metadata=needMetadata, metatypes=[metatype], sources=None, - disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=needAnnotations, + need_families=needFamilies, + need_graph=needGraph, + need_partitions=needPartitions, + need_rgp=needRegions, + need_spots=needSpots, + need_modules=needModules, + need_metadata=needMetadata, + metatypes=[metatype], + sources=None, + disable_bar=disable_bar, + ) pan.get_org_index() # make the index because it will be used most likely - with get_context('fork').Pool(processes=cpu) as p: + with get_context("fork").Pool(processes=cpu) as p: if csv: - processes.append(p.apply_async(func=write_matrix, args=(output, ',', "csv", compress, True))) + processes.append( + p.apply_async( + func=write_matrix, args=(output, ",", "csv", compress, True) + ) + ) if gene_pa: - processes.append(p.apply_async(func=write_gene_presence_absence, args=(output, compress))) + processes.append( + p.apply_async(func=write_gene_presence_absence, args=(output, compress)) + ) if gexf: - processes.append(p.apply_async(func=write_gexf, args=(output, False, compress))) + processes.append( + p.apply_async(func=write_gexf, args=(output, False, compress)) + ) if light_gexf: - processes.append(p.apply_async(func=write_gexf, args=(output, True, compress))) + processes.append( + p.apply_async(func=write_gexf, args=(output, True, compress)) + ) if stats: - processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) + processes.append( + p.apply_async( + func=write_stats, args=(output, soft_core, dup_margin, compress) + ) + ) if json: processes.append(p.apply_async(func=write_json, args=(output, compress))) if partitions: - processes.append(p.apply_async(func=write_partitions, args=(output, soft_core))) + processes.append( + p.apply_async(func=write_partitions, args=(output, soft_core)) + ) if families_tsv: - processes.append(p.apply_async(func=write_gene_families_tsv, args=(output, compress, disable_bar))) + processes.append( + p.apply_async( + func=write_gene_families_tsv, args=(output, compress, disable_bar) + ) + ) if spots: processes.append(p.apply_async(func=write_spots, args=(output, compress))) if regions: processes.append(p.apply_async(func=write_regions, args=(output, compress))) if borders: - processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) + processes.append( + p.apply_async(func=write_borders, args=(output, dup_margin, compress)) + ) if modules: processes.append(p.apply_async(func=write_modules, args=(output, compress))) - processes.append(p.apply_async(func=write_module_summary, args=(output, compress))) - processes.append(p.apply_async(func=write_org_modules, args=(output, compress))) + processes.append( + p.apply_async(func=write_module_summary, args=(output, compress)) + ) + processes.append( + p.apply_async(func=write_org_modules, args=(output, compress)) + ) if spot_modules: - processes.append(p.apply_async(func=write_spot_modules, args=(output, compress))) - processes.append(p.apply_async(func=write_rgp_modules, args=(output, compress))) + processes.append( + p.apply_async(func=write_spot_modules, args=(output, compress)) + ) + processes.append( + p.apply_async(func=write_rgp_modules, args=(output, compress)) + ) for process in processes: process.get() # get all the results @@ -1171,11 +1531,28 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) global pan pan.add_file(args.pangenome) - write_pangenome_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv, - gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, - stats=args.stats, json=args.json, partitions=args.partitions, - families_tsv=args.families_tsv, regions=args.regions, spots=args.spots, borders=args.borders, modules=args.modules, - spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) + write_pangenome_flat_files( + pan, + args.output, + cpu=args.cpu, + soft_core=args.soft_core, + dup_margin=args.dup_margin, + csv=args.csv, + gene_pa=args.Rtab, + gexf=args.gexf, + light_gexf=args.light_gexf, + stats=args.stats, + json=args.json, + partitions=args.partitions, + families_tsv=args.families_tsv, + regions=args.regions, + spots=args.spots, + borders=args.borders, + modules=args.modules, + spot_modules=args.spot_modules, + compress=args.compress, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -1186,7 +1563,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("write_pangenome", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "write_pangenome", formatter_class=argparse.RawTextHelpFormatter + ) parser_flat(parser) return parser @@ -1197,65 +1576,150 @@ def parser_flat(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold to use") - - optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of genomes in which the family must have multiple genes " - "for it to be considered 'duplicated'") - - - optional.add_argument("--gexf", required=False, action="store_true", - help="write a gexf file with all the annotations and all the genes of each gene family") - optional.add_argument("--light_gexf", required=False, action="store_true", - help="write a gexf file with the gene families and basic information about them") - - optional.add_argument("--json", required=False, action="store_true", help="Writes the graph in a json file format") - - optional.add_argument("--csv", required=False, action="store_true", - help="csv file format as used by Roary, among others. " - "The alternative gene ID will be the partition, if there is one") - optional.add_argument("--Rtab", required=False, action="store_true", - help="tabular file for the gene binary presence absence matrix") - - optional.add_argument("--stats", required=False, action="store_true", - help="tsv files with some statistics for each each gene family") - - optional.add_argument("--partitions", required=False, action="store_true", - help="list of families belonging to each partition, with one file per partitions and " - "one family per line") - - optional.add_argument("--families_tsv", required=False, action="store_true", - help="Write a tsv file providing the association between genes and gene families") - - optional.add_argument("--regions", required=False, action="store_true", - help="Writes the predicted RGP and descriptive metrics in 'plastic_regions.tsv'") - optional.add_argument("--spots", required=False, action="store_true", - help="Write spot summary and a list of all RGP in each spot") - optional.add_argument("--borders", required=False, action="store_true", help="List all borders of each spot") - optional.add_argument("--modules", required=False, action="store_true", - help="Write a tsv file listing functional modules and the families that belong to them") - optional.add_argument("--spot_modules", required=False, action="store_true", - help="writes 2 files comparing the presence of modules within spots") - - optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - - -if __name__ == '__main__': + optional.add_argument( + "--soft_core", + required=False, + type=restricted_float, + default=0.95, + help="Soft core threshold to use", + ) + + optional.add_argument( + "--dup_margin", + required=False, + type=restricted_float, + default=0.05, + help="minimum ratio of genomes in which the family must have multiple genes " + "for it to be considered 'duplicated'", + ) + + optional.add_argument( + "--gexf", + required=False, + action="store_true", + help="write a gexf file with all the annotations and all the genes of each gene family", + ) + optional.add_argument( + "--light_gexf", + required=False, + action="store_true", + help="write a gexf file with the gene families and basic information about them", + ) + + optional.add_argument( + "--json", + required=False, + action="store_true", + help="Writes the graph in a json file format", + ) + + optional.add_argument( + "--csv", + required=False, + action="store_true", + help="csv file format as used by Roary, among others. " + "The alternative gene ID will be the partition, if there is one", + ) + optional.add_argument( + "--Rtab", + required=False, + action="store_true", + help="tabular file for the gene binary presence absence matrix", + ) + + optional.add_argument( + "--stats", + required=False, + action="store_true", + help="tsv files with some statistics for each each gene family", + ) + + optional.add_argument( + "--partitions", + required=False, + action="store_true", + help="list of families belonging to each partition, with one file per partitions and " + "one family per line", + ) + + optional.add_argument( + "--families_tsv", + required=False, + action="store_true", + help="Write a tsv file providing the association between genes and gene families", + ) + + optional.add_argument( + "--regions", + required=False, + action="store_true", + help="Writes the predicted RGP and descriptive metrics in 'plastic_regions.tsv'", + ) + optional.add_argument( + "--spots", + required=False, + action="store_true", + help="Write spot summary and a list of all RGP in each spot", + ) + optional.add_argument( + "--borders", + required=False, + action="store_true", + help="List all borders of each spot", + ) + optional.add_argument( + "--modules", + required=False, + action="store_true", + help="Write a tsv file listing functional modules and the families that belong to them", + ) + optional.add_argument( + "--spot_modules", + required=False, + action="store_true", + help="writes 2 files comparing the presence of modules within spots", + ) + + optional.add_argument( + "--compress", + required=False, + action="store_true", + help="Compress the files in .gz", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_flat(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index c4b37414..50bf9925 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -21,8 +21,13 @@ from ppanggolin.genetic_codes import genetic_codes -def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", soft_core: float = 0.95, - dup_margin: float = 0.95, single_copy: bool = True) -> Set[GeneFamily]: +def get_families_to_write( + pangenome: Pangenome, + partition_filter: str = "core", + soft_core: float = 0.95, + dup_margin: float = 0.95, + single_copy: bool = True, +) -> Set[GeneFamily]: """ Get families corresponding to the given partition @@ -43,7 +48,9 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", for family in pangenome.gene_families: if family.named_partition == partition_filter: if single_copy: - if family.is_single_copy(dup_margin=dup_margin, exclude_fragment=True): + if family.is_single_copy( + dup_margin=dup_margin, exclude_fragment=True + ): families.add(family) else: families.add(family) @@ -52,7 +59,9 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", for family in pangenome.gene_families: if family.number_of_organisms == nb_org: if single_copy: - if family.is_single_copy(dup_margin=dup_margin, exclude_fragment=False): + if family.is_single_copy( + dup_margin=dup_margin, exclude_fragment=False + ): families.add(family) else: families.add(family) @@ -60,7 +69,9 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", for family in pangenome.gene_families: if family.number_of_organisms < nb_org: if single_copy: - if family.is_single_copy(dup_margin=dup_margin, exclude_fragment=False): + if family.is_single_copy( + dup_margin=dup_margin, exclude_fragment=False + ): families.add(family) else: families.add(family) @@ -68,7 +79,9 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", for family in pangenome.gene_families: if family.number_of_organisms >= nb_org * soft_core: if single_copy: - if family.is_single_copy(dup_margin=dup_margin, exclude_fragment=False): + if family.is_single_copy( + dup_margin=dup_margin, exclude_fragment=False + ): families.add(family) else: families.add(family) @@ -92,20 +105,26 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]: partial = True msg = ( f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" - f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") + f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0." + ) logging.getLogger("PPANGGOLIN").debug(msg) - protein = start_table[gene.dna[0: 3]] + protein = start_table[gene.dna[0:3]] for i in range(3, len(gene.dna) - mod, 3): - codon = gene.dna[i: i + 3] + codon = gene.dna[i : i + 3] try: protein += table[codon] except KeyError: # codon was not planned for. Probably can't determine it. - protein += 'X' # X is for unknown + protein += "X" # X is for unknown return protein, partial -def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]], - source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]: +def write_fasta_families( + family: GeneFamily, + tmpdir: tempfile.TemporaryDirectory, + code_table: Dict[str, Dict[str, str]], + source: str = "protein", + use_gene_id: bool = False, +) -> Tuple[Path, bool]: """Write fasta files for each gene family :param family: gene family to write @@ -134,14 +153,16 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory else: f_obj.write(f">{gene.organism.name}\n") if source == "dna": - f_obj.write(gene.dna + '\n') + f_obj.write(gene.dna + "\n") elif source == "protein": protein, part = translate(gene, code_table) if not partial: partial = part f_obj.write(protein + "\n") else: - raise ValueError(f"Unknown sequence source '{source}' provided. Expected 'dna' or 'protein'.") + raise ValueError( + f"Unknown sequence source '{source}' provided. Expected 'dna' or 'protein'." + ) return f_name, partial @@ -161,7 +182,7 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): - """ Allow to launch mafft in multiprocessing + """Allow to launch mafft in multiprocessing :param args: Pack of argument for launch_mafft @@ -170,8 +191,16 @@ def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): launch_mafft(*args) -def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int = 1, source: str = "protein", - use_gene_id: bool = False, code: str = "11", disable_bar: bool = False): +def compute_msa( + families: Set[GeneFamily], + output: Path, + tmpdir: Path, + cpu: int = 1, + source: str = "protein", + use_gene_id: bool = False, + code: str = "11", + disable_bar: bool = False, +): """ Compute MSA between pangenome gene families @@ -194,25 +223,34 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int partial = False for family in tqdm(families, unit="family", disable=disable_bar): start_write = time.time() - fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) + fname, part = write_fasta_families( + family, newtmpdir, code_table, source, use_gene_id + ) if not partial: partial = part write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) if partial: - logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. " - "Last nucleotides were removed to translate. " - "Use --verbose 2 to see genes that are partial") + logging.getLogger("PPanGGOLiN").warning( + "Partial gene was found during translation. " + "Last nucleotides were removed to translate. " + "Use --verbose 2 to see genes that are partial" + ) logging.getLogger("PPanGGOLiN").info("Computing the MSA ...") - with get_context('fork').Pool(cpu) as p: + with get_context("fork").Pool(cpu) as p: with tqdm(total=len(families), unit="family", disable=disable_bar) as bar: for _ in p.imap_unordered(launch_multi_mafft, args): bar.update() -def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path, outdir: Path, - use_gene_id: bool = False): +def write_whole_genome_msa( + pangenome: Pangenome, + families: set, + phylo_name: Path, + outdir: Path, + use_gene_id: bool = False, +): """ Writes a whole genome msa file for additional phylogenetic analysis @@ -238,7 +276,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path curr_phylo_dict = {} for line in fin: - if line.startswith('>'): + if line.startswith(">"): # Save sequence of previous record if genome_id != "": if genome_id in observed_genomes: @@ -267,7 +305,9 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path observed_genomes.add(genome_id) # write gaps for all missing genomes - missing_genomes = [g for g in set(phylo_dict.keys()) if g not in observed_genomes] + missing_genomes = [ + g for g in set(phylo_dict.keys()) if g not in observed_genomes + ] for genome in missing_genomes: curr_phylo_dict[genome] = "-" * curr_len @@ -280,10 +320,22 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path fout.write(val + "\n") -def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: str = "core", tmpdir: Path = None, - source: str = "protein", soft_core: float = 0.95, phylo: bool = False, use_gene_id: bool = False, - translation_table: str = "11", dup_margin: float = 0.95, single_copy: bool = True, - force: bool = False, disable_bar: bool = False): +def write_msa_files( + pangenome: Pangenome, + output: Path, + cpu: int = 1, + partition: str = "core", + tmpdir: Path = None, + source: str = "protein", + soft_core: float = 0.95, + phylo: bool = False, + use_gene_id: bool = False, + translation_table: str = "11", + dup_margin: float = 0.95, + single_copy: bool = True, + force: bool = False, + disable_bar: bool = False, +): """ Main function to write MSA files @@ -311,22 +363,47 @@ def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: outdir = output / f"msa_{partition}_{source}/" mk_outdir(outdir, force=force) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=need_partitions, - need_gene_sequences=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_partitions=need_partitions, + need_gene_sequences=True, + disable_bar=disable_bar, + ) logging.getLogger("PPanGGOLiN").info(f"Doing MSA for {partition} families...") - families = get_families_to_write(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, - single_copy=single_copy) + families = get_families_to_write( + pangenome, + partition_filter=partition, + soft_core=soft_core, + dup_margin=dup_margin, + single_copy=single_copy, + ) # check that the code is similar than the one used previously, if there is one - if 'translation_table' in pangenome.parameters["cluster"]: - if pangenome.parameters["cluster"]["translation_table"] != str(translation_table): - logging.getLogger("PPanGGOLiN").warning("The translation table used during clustering " - f"('{pangenome.parameters['cluster']['translation_table']}') " - f"is different than the one provided now ('{translation_table}')") - - compute_msa(families, outdir, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, - code=str(translation_table), disable_bar=disable_bar) - logging.getLogger("PPanGGOLiN").info(f"Done writing all {partition} MSA in: {outdir}") + if "translation_table" in pangenome.parameters["cluster"]: + if pangenome.parameters["cluster"]["translation_table"] != str( + translation_table + ): + logging.getLogger("PPanGGOLiN").warning( + "The translation table used during clustering " + f"('{pangenome.parameters['cluster']['translation_table']}') " + f"is different than the one provided now ('{translation_table}')" + ) + + compute_msa( + families, + outdir, + cpu=cpu, + tmpdir=tmpdir, + source=source, + use_gene_id=use_gene_id, + code=str(translation_table), + disable_bar=disable_bar, + ) + logging.getLogger("PPanGGOLiN").info( + f"Done writing all {partition} MSA in: {outdir}" + ) if phylo: logging.getLogger("PPanGGOLiN").info("Writing the whole genome msa file") @@ -334,8 +411,12 @@ def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: phylo_name = output / f"{partition}_{soft_core}_genome_alignment.aln" else: phylo_name = output / f"{partition}_genome_alignment.aln" - write_whole_genome_msa(pangenome, families, phylo_name, outdir, use_gene_id=use_gene_id) - logging.getLogger("PPanGGOLiN").info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") + write_whole_genome_msa( + pangenome, families, phylo_name, outdir, use_gene_id=use_gene_id + ) + logging.getLogger("PPanGGOLiN").info( + f"Done writing the {partition} genome alignment in: '{phylo_name}'" + ) def launch(args: argparse.Namespace): @@ -347,10 +428,22 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - write_msa_files(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, - source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, - translation_table=args.translation_table, dup_margin=args.dup_margin, single_copy=args.single_copy, - force=args.force, disable_bar=args.disable_prog_bar) + write_msa_files( + pangenome, + args.output, + cpu=args.cpu, + partition=args.partition, + tmpdir=args.tmpdir, + source=args.source, + soft_core=args.soft_core, + phylo=args.phylo, + use_gene_id=args.use_gene_id, + translation_table=args.translation_table, + dup_margin=args.dup_margin, + single_copy=args.single_copy, + force=args.force, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -372,48 +465,116 @@ def parser_msa(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="The following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") - - optional = parser.add_argument_group(title="Optional arguments. Indicating 'all' writes all elements. " - "Writing a partition ('persistent', 'shell', 'cloud', 'core' or " - "'accessory') write the elements associated to said partition.") + required = parser.add_argument_group( + title="Required arguments", description="The following arguments are required :" + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) + + optional = parser.add_argument_group( + title="Optional arguments. Indicating 'all' writes all elements. " + "Writing a partition ('persistent', 'shell', 'cloud', 'core' or " + "'accessory') write the elements associated to said partition." + ) # could make choice to allow customization - optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold to use if 'softcore' partition is chosen") - optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of genomes in which the family must have multiple genes " - "for it to be considered 'duplicated'") - optional.add_argument("--single_copy", required=False, action="store_true", default=False, - help="Use report gene families that are considered 'single copy', for details see " - "option --dup_margin") - optional.add_argument("--partition", required=False, default="core", - choices=["all", "persistent", "shell", "cloud", "core", "accessory", 'softcore'], - help="compute Multiple Sequence Alignment of the gene families in the given partition") - optional.add_argument("--source", required=False, default="protein", choices=["dna", "protein"], - help="indicates whether to use protein or dna sequences to compute the msa") - optional.add_argument("--phylo", required=False, action='store_true', - help="Writes a whole genome msa file for additional phylogenetic analysis") - optional.add_argument("--use_gene_id", required=False, action='store_true', - help="Use gene identifiers rather than genome names for sequences in the family MSA" - " (genome names are used by default)") - optional.add_argument("--translation_table", required=False, default=11, type=int, - help="Translation table (genetic code) to use.") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - - -if __name__ == '__main__': + optional.add_argument( + "--soft_core", + required=False, + type=restricted_float, + default=0.95, + help="Soft core threshold to use if 'softcore' partition is chosen", + ) + optional.add_argument( + "--dup_margin", + required=False, + type=restricted_float, + default=0.05, + help="minimum ratio of genomes in which the family must have multiple genes " + "for it to be considered 'duplicated'", + ) + optional.add_argument( + "--single_copy", + required=False, + action="store_true", + default=False, + help="Use report gene families that are considered 'single copy', for details see " + "option --dup_margin", + ) + optional.add_argument( + "--partition", + required=False, + default="core", + choices=[ + "all", + "persistent", + "shell", + "cloud", + "core", + "accessory", + "softcore", + ], + help="compute Multiple Sequence Alignment of the gene families in the given partition", + ) + optional.add_argument( + "--source", + required=False, + default="protein", + choices=["dna", "protein"], + help="indicates whether to use protein or dna sequences to compute the msa", + ) + optional.add_argument( + "--phylo", + required=False, + action="store_true", + help="Writes a whole genome msa file for additional phylogenetic analysis", + ) + optional.add_argument( + "--use_gene_id", + required=False, + action="store_true", + help="Use gene identifiers rather than genome names for sequences in the family MSA" + " (genome names are used by default)", + ) + optional.add_argument( + "--translation_table", + required=False, + default=11, + type=int, + help="Translation table (genetic code) to use.", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_msa(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 32d4e602..e21cb7b5 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -16,7 +16,9 @@ from ppanggolin.region import Region, Spot, Module -def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group: tables.Group) -> bool: +def write_metadata_status( + pangenome: Pangenome, h5f: tables.File, status_group: tables.Group +) -> bool: """Write status of metadata in pangenome file :param pangenome: pangenome with metadata @@ -26,14 +28,18 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group: """ metastatus = pangenome.status["metadata"] metasources = pangenome.status["metasources"] - if 'metastatus' in status_group: + if "metastatus" in status_group: metadata_group = status_group.metastatus else: - metadata_group = h5f.create_group(status_group, "metastatus", "Statuses of the pangenome metadata") - if 'metasources' in status_group: + metadata_group = h5f.create_group( + status_group, "metastatus", "Statuses of the pangenome metadata" + ) + if "metasources" in status_group: metasources_group = status_group.metasources else: - metasources_group = h5f.create_group(status_group, "metasources", "Sources of the pangenome metadata") + metasources_group = h5f.create_group( + status_group, "metasources", "Sources of the pangenome metadata" + ) if metastatus["families"] in ["Computed", "Loaded", "inFile"]: metadata_group._v_attrs.families = True @@ -68,28 +74,37 @@ def write_metadata_group(h5f: tables.File, metatype: str) -> tables.Group: :return: Metadata group of the corresponding metatype """ - if '/metadata' not in h5f: + if "/metadata" not in h5f: metadata_group = h5f.create_group("/", "metadata", "Pangenome metadata") else: metadata_group = h5f.root.metadata if metatype not in metadata_group: - metatype_group = h5f.create_group(metadata_group, metatype, f"{metatype} metadata") + metatype_group = h5f.create_group( + metadata_group, metatype, f"{metatype} metadata" + ) else: metatype_group = metadata_group._f_get_child(metatype) return metatype_group -def desc_metadata(max_len_dict: Dict[str, int], type_dict: Dict[str, tables.Col]) -> dict: +def desc_metadata( + max_len_dict: Dict[str, int], type_dict: Dict[str, tables.Col] +) -> dict: """Create a formatted table for metadata description :return: Formatted table """ - desc_dict = {attr: tables.StringCol(itemsize=max_value) for attr, max_value in max_len_dict.items()} + desc_dict = { + attr: tables.StringCol(itemsize=max_value) + for attr, max_value in max_len_dict.items() + } desc_dict.update(dict(type_dict.items())) return desc_dict -def get_metadata_contig_len(select_ctg: List[Contig], source: str) -> Tuple[Dict[str, int], Dict[str, tables.Col], int]: +def get_metadata_contig_len( + select_ctg: List[Contig], source: str +) -> Tuple[Dict[str, int], Dict[str, tables.Col], int]: """Get maximum size of contig metadata information :param select_ctg: selected elements from source @@ -97,16 +112,17 @@ def get_metadata_contig_len(select_ctg: List[Contig], source: str) -> Tuple[Dict :return: Maximum type and size of each element """ - type_dict = {"metadata_id": tables.Int64Col(), - "ID": tables.Int64Col()} + type_dict = {"metadata_id": tables.Int64Col(), "ID": tables.Int64Col()} max_len_dict = {} expected_rows = 0 for contig in select_ctg: for metadata in contig.get_metadata_by_source(source).values(): - for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): + for attr, value in ( + (k, v) for k, v in metadata.__dict__.items() if k != "source" + ): if isinstance(value, bytes): - value = value.decode('UTF-8') + value = value.decode("UTF-8") if isinstance(value, float) or isinstance(value, int): if attr in type_dict: if type_dict[attr] != type(value): @@ -124,14 +140,21 @@ def get_metadata_contig_len(select_ctg: List[Contig], source: str) -> Tuple[Dict else: max_len_dict[attr] = len(value) else: - logging.getLogger("PPanGGOLiN").debug(f"attr: {attr}, value: {value}") + logging.getLogger("PPanGGOLiN").debug( + f"attr: {attr}, value: {value}" + ) raise TypeError(f"{type(value)} is not acceptable") expected_rows += 1 return max_len_dict, type_dict, expected_rows -def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Contig], disable_bar: bool = False): +def write_metadata_contig( + h5f: tables.File, + source: str, + select_contigs: List[Contig], + disable_bar: bool = False, +): """Writing a table containing the metadata associated to contig :param h5f: HDF5 file to write gene families @@ -142,9 +165,13 @@ def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Co metatype_group = write_metadata_group(h5f, "contigs") meta_len = get_metadata_contig_len(select_contigs, source) # h5f.remove_node(metatype_group, source) - source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) + source_table = h5f.create_table( + metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1] + ) meta_row = source_table.row - for contig in tqdm(select_contigs, unit="contigs", desc=f'Source = {source}', disable=disable_bar): + for contig in tqdm( + select_contigs, unit="contigs", desc=f"Source = {source}", disable=disable_bar + ): for meta_id, metadata in contig.get_metadata_by_source(source).items(): meta_row["metadata_id"] = meta_id for desc in source_table.colnames: @@ -156,7 +183,7 @@ def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Co if hasattr(metadata, desc): value = metadata.__getattribute__(desc) if isinstance(value, bytes): - value = value.decode('UTF-8') + value = value.decode("UTF-8") meta_row[desc] = value else: meta_row[desc] = None @@ -164,9 +191,17 @@ def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Co source_table.flush() -def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFamily], - List[Region], List[Spot], List[Module]], - source: str) -> Tuple[Dict[str, int], Dict[str, tables.Col], int]: +def get_metadata_len( + select_elem: Union[ + List[Gene], + List[Organism], + List[GeneFamily], + List[Region], + List[Spot], + List[Module], + ], + source: str, +) -> Tuple[Dict[str, int], Dict[str, tables.Col], int]: """Get maximum size of metadata information :param select_elem: selected elements from source @@ -179,12 +214,14 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam expected_rows = 0 for element in select_elem: - if hasattr(element, 'name') and element.name: - max_len_dict['ID'] = max(max_len_dict.get('ID', 0), len(element.name)) - elif hasattr(element, 'ID'): + if hasattr(element, "name") and element.name: + max_len_dict["ID"] = max(max_len_dict.get("ID", 0), len(element.name)) + elif hasattr(element, "ID"): if isinstance(element.ID, str): - max_len_dict['ID'] = max(max_len_dict.get('ID', 0), len(element.ID)) - elif isinstance(element.ID, (int, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64)): + max_len_dict["ID"] = max(max_len_dict.get("ID", 0), len(element.ID)) + elif isinstance( + element.ID, (int, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64) + ): type_dict["ID"] = tables.Int64Col() else: raise TypeError( @@ -198,13 +235,17 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam ) for metadata in element.get_metadata_by_source(source).values(): - for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): + for attr, value in ( + (k, v) for k, v in metadata.__dict__.items() if k != "source" + ): if isinstance(value, bytes): - value = value.decode('UTF-8') + value = value.decode("UTF-8") if isinstance(value, float) or isinstance(value, int): if attr in type_dict: - if isinstance(type_dict[attr] , type(value)): - if isinstance(value, float) and isinstance(type_dict[attr], int): + if isinstance(type_dict[attr], type(value)): + if isinstance(value, float) and isinstance( + type_dict[attr], int + ): type_dict[attr] = tables.Float64Col() else: if isinstance(value, float): @@ -218,11 +259,13 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam else: max_len_dict[attr] = len(value) else: - logging.getLogger("PPanGGOLiN").debug(f"attr: {attr}, value: {value}") + logging.getLogger("PPanGGOLiN").debug( + f"attr: {attr}, value: {value}" + ) raise TypeError( - f"Invalid metadata type: The attribute '{attr}' from the pangenome element '{element}' " - f"has an unexpected value '{value}' of type '{type(value).__name__}'." - ) + f"Invalid metadata type: The attribute '{attr}' from the pangenome element '{element}' " + f"has an unexpected value '{value}' of type '{type(value).__name__}'." + ) expected_rows += 1 @@ -232,14 +275,23 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam f"Metadata attribute '{attribute}' has a length of 0, which is not allowed." ) - return max_len_dict, type_dict, expected_rows -def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, - select_elements: Union[List[Gene], List[Organism], List[GeneFamily], - List[Region], List[Spot], List[Module]], - disable_bar: bool = False): +def write_metadata_metatype( + h5f: tables.File, + source: str, + metatype: str, + select_elements: Union[ + List[Gene], + List[Organism], + List[GeneFamily], + List[Region], + List[Spot], + List[Module], + ], + disable_bar: bool = False, +): """Writing a table containing the metadata associated to element from the metatype :param h5f: HDF5 file to write gene families @@ -253,13 +305,20 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, desc_metadata(max_len_dict, type_dict) - source_table = h5f.create_table(metatype_group, source, desc_metadata(max_len_dict, type_dict), expectedrows=expected_rows) + source_table = h5f.create_table( + metatype_group, + source, + desc_metadata(max_len_dict, type_dict), + expectedrows=expected_rows, + ) meta_row = source_table.row - for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): + for element in tqdm( + select_elements, unit=metatype, desc=f"Source = {source}", disable=disable_bar + ): for meta_id, metadata in element.get_metadata_by_source(source).items(): for desc in source_table.colnames: if desc == "ID": - if hasattr(element, 'name') and len(element.name) > 0: + if hasattr(element, "name") and len(element.name) > 0: meta_row[desc] = element.name else: meta_row[desc] = element.ID @@ -269,14 +328,19 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, if hasattr(metadata, desc): value = metadata.__getattribute__(desc) if isinstance(value, bytes): - value = value.decode('UTF-8') + value = value.decode("UTF-8") meta_row[desc] = value meta_row.append() source_table.flush() -def erase_metadata(pangenome: Pangenome, h5f: tables.File, status_group: tables.Group, - metatype: str = None, source: str = None): +def erase_metadata( + pangenome: Pangenome, + h5f: tables.File, + status_group: tables.Group, + metatype: str = None, + source: str = None, +): """ Erase metadata in pangenome @@ -293,18 +357,24 @@ def erase_metadata(pangenome: Pangenome, h5f: tables.File, status_group: tables. if metatype in metadata_group: metatype_group = metadata_group._f_get_child(metatype) if source in metatype_group: - logging.getLogger("PPanGGOLiN").info(f"Erasing metadata assign to {metatype} from source {source}...") + logging.getLogger("PPanGGOLiN").info( + f"Erasing metadata assign to {metatype} from source {source}..." + ) metasources[metatype].remove(source) status_group.metasources._v_attrs[metatype].remove(source) h5f.remove_node(metatype_group, source) if metatype_group._v_nchildren == 0: - logging.getLogger("PPanGGOLiN").debug(f"No more source of metadata in {metatype}. " - f"Erasing node {metatype} in metadata") - metastatus[metatype] = 'No' + logging.getLogger("PPanGGOLiN").debug( + f"No more source of metadata in {metatype}. " + f"Erasing node {metatype} in metadata" + ) + metastatus[metatype] = "No" status_group.metastatus.families = False h5f.remove_node(metadata_group, metatype) if metadata_group._v_nchildren == 0: - logging.getLogger("PPanGGOLiN").debug("No more metadata in pangenome. Erasing node metadata") + logging.getLogger("PPanGGOLiN").debug( + "No more metadata in pangenome. Erasing node metadata" + ) status_group._v_attrs.metadata = False h5f.remove_node("/", "metadata") h5f.remove_node(status_group, "metasources") @@ -319,56 +389,118 @@ def write_metadata(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = F :param disable_bar: Disable progress bar """ if pangenome.status["metadata"]["families"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing gene families metadata in pangenome") - select_gf = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["families"][-1], - metatype="families")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["families"][-1], - "families", select_gf, disable_bar) + logging.getLogger("PPanGGOLiN").info( + "Writing gene families metadata in pangenome" + ) + select_gf = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["families"][-1], + metatype="families", + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["families"][-1], + "families", + select_gf, + disable_bar, + ) pangenome.status["metadata"]["families"] = "Loaded" if pangenome.status["metadata"]["genomes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genomes metadata in pangenome") - select_genomes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genomes"][-1], - metatype="genomes")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["genomes"][-1], - "genomes", select_genomes, disable_bar) + select_genomes = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["genomes"][-1], + metatype="genomes", + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["genomes"][-1], + "genomes", + select_genomes, + disable_bar, + ) pangenome.status["metadata"]["genomes"] = "Loaded" if pangenome.status["metadata"]["contigs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing contigs metadata in pangenome") - select_contigs = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["contigs"][-1], - metatype="contigs")) - write_metadata_contig(h5f, pangenome.status["metasources"]["contigs"][-1], select_contigs, disable_bar) + select_contigs = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["contigs"][-1], + metatype="contigs", + ) + ) + write_metadata_contig( + h5f, + pangenome.status["metasources"]["contigs"][-1], + select_contigs, + disable_bar, + ) pangenome.status["metadata"]["contigs"] = "Loaded" if pangenome.status["metadata"]["genes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genes metadata in pangenome") - select_genes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genes"][-1], - metatype="genes")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["genes"][-1], - "genes", select_genes, disable_bar) + select_genes = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["genes"][-1], metatype="genes" + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["genes"][-1], + "genes", + select_genes, + disable_bar, + ) pangenome.status["metadata"]["genes"] = "Loaded" if pangenome.status["metadata"]["RGPs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing RGPs metadata in pangenome") - select_rgps = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["RGPs"][-1], - metatype="RGPs")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["RGPs"][-1], - "RGPs", select_rgps, disable_bar) + select_rgps = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["RGPs"][-1], metatype="RGPs" + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["RGPs"][-1], + "RGPs", + select_rgps, + disable_bar, + ) pangenome.status["metadata"]["RGPs"] = "Loaded" if pangenome.status["metadata"]["spots"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing spots metadata in pangenome") - select_spots = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["spots"][-1], - metatype="spots")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["spots"][-1], - "spots", select_spots, disable_bar) + select_spots = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["spots"][-1], metatype="spots" + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["spots"][-1], + "spots", + select_spots, + disable_bar, + ) pangenome.status["metadata"]["spots"] = "Loaded" if pangenome.status["metadata"]["modules"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing modules metadata in pangenome") - select_modules = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["modules"][-1], - metatype="modules")) - write_metadata_metatype(h5f, pangenome.status["metasources"]["modules"][-1], - "modules", select_modules, disable_bar) + select_modules = list( + pangenome.get_elem_by_source( + source=pangenome.status["metasources"]["modules"][-1], + metatype="modules", + ) + ) + write_metadata_metatype( + h5f, + pangenome.status["metasources"]["modules"][-1], + "modules", + select_modules, + disable_bar, + ) pangenome.status["metadata"]["modules"] = "Loaded" diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index e69f3a5a..734ec9dc 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -16,12 +16,33 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, - restricted_float, detect_filetype, run_subprocess) -from ppanggolin.formats.readBinaries import check_pangenome_info, write_genes_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file - -module_regex = re.compile(r'^module_\d+') # \d == [0-9] -poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] +from ppanggolin.utils import ( + write_compressed_or_not, + mk_outdir, + create_tmpdir, + read_compressed_or_not, + restricted_float, + detect_filetype, + run_subprocess, +) +from ppanggolin.formats.readBinaries import ( + check_pangenome_info, + write_genes_from_pangenome_file, + write_fasta_gene_fam_from_pangenome_file, + write_fasta_prot_fam_from_pangenome_file, +) + +module_regex = re.compile(r"^module_\d+") # \d == [0-9] +poss_values = [ + "all", + "persistent", + "shell", + "cloud", + "rgp", + "softcore", + "core", + module_regex, +] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -33,13 +54,20 @@ def check_write_sequences_args(args: argparse.Namespace) -> None: :raises argparse.ArgumentTypeError: if region is given but neither fasta nor anno is given """ if args.regions is not None and args.fasta is None and args.anno is None: - raise argparse.ArgumentError(argument=None, - message="The --regions options requires the use of --anno or --fasta " - "(You need to provide the same file used to compute the pangenome)") - - -def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '', - compress: bool = False, disable_bar: bool = False): + raise argparse.ArgumentError( + argument=None, + message="The --regions options requires the use of --anno or --fasta " + "(You need to provide the same file used to compute the pangenome)", + ) + + +def write_gene_sequences_from_annotations( + genes_to_write: Iterable[Gene], + output: Path, + add: str = "", + compress: bool = False, + disable_bar: bool = False, +): """ Writes the CDS sequences to a File object, and adds the string provided through `add` in front of it. @@ -51,15 +79,23 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output :param compress: Compress the file in .gz :param disable_bar: Disable progress bar. """ - logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {output.absolute()}") + logging.getLogger("PPanGGOLiN").info( + f"Writing all CDS sequences in {output.absolute()}" + ) with write_compressed_or_not(output, compress) as file_obj: for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): if gene.type == "CDS": - file_obj.write(f'>{add}{gene.ID}\n') - file_obj.write(f'{gene.dna}\n') + file_obj.write(f">{add}{gene.ID}\n") + file_obj.write(f"{gene.dna}\n") -def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: +def create_mmseqs_db( + sequences: Iterable[Path], + db_name: str, + tmpdir: Path, + db_mode: int = 0, + db_type: int = 0, +) -> Path: """Create a MMseqs2 database from a sequences file. :param sequences: File with the sequences @@ -82,8 +118,13 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m return seq_nucdb -def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, - is_single_line_fasta: bool = False, code: int = 11) -> Path: +def translate_genes( + sequences: Union[Path, Iterable[Path]], + tmpdir: Path, + cpu: int = 1, + is_single_line_fasta: bool = False, + code: int = 11, +) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences @@ -94,20 +135,47 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: i :return: Path to the MMSeqs2 database """ - seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, 'nucleotides_db', - tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2) + seq_nucdb = create_mmseqs_db( + [sequences] if isinstance(sequences, Path) else sequences, + "nucleotides_db", + tmpdir, + db_mode=1 if is_single_line_fasta else 0, + db_type=2, + ) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / 'translate_db' - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) + seqdb = tmpdir / "translate_db" + cmd = list( + map( + str, + [ + "mmseqs", + "translatenucs", + seq_nucdb, + seqdb, + "--threads", + cpu, + "--translation-table", + code, + ], + ) + ) run_subprocess(cmd, msg="MMSeqs translatenucs failed with the following error:\n") return seqdb - -def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_filter: str, soft_core: float = 0.95, - compress: bool = False, keep_tmp: bool = False, tmp: Path = None, - cpu: int = 1, code: int = 11, disable_bar: bool = False): - """ Write all amino acid sequences from given genes in pangenome +def write_gene_protein_sequences( + pangenome_filename: str, + output: Path, + gene_filter: str, + soft_core: float = 0.95, + compress: bool = False, + keep_tmp: bool = False, + tmp: Path = None, + cpu: int = 1, + code: int = 11, + disable_bar: bool = False, +): + """Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences :param output: Path to output directory @@ -120,31 +188,53 @@ def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_fil :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ - with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()), - basename="translateGenes", keep_tmp=keep_tmp) as tmpdir: + with create_tmpdir( + tmp if tmp is not None else Path(tempfile.gettempdir()), + basename="translateGenes", + keep_tmp=keep_tmp, + ) as tmpdir: + + write_genes_from_pangenome_file( + pangenome_filename=pangenome_filename, + gene_filter=gene_filter, + output=tmpdir, + compress=compress, + disable_bar=disable_bar, + ) + + genes_sequence_tmp_file = ( + tmpdir / f"{gene_filter}_genes.fna{'.gz' if compress else ''}" + ) + translate_db = translate_genes( + sequences=genes_sequence_tmp_file, + tmpdir=tmpdir, + cpu=cpu, + is_single_line_fasta=True, + code=code, + ) - write_genes_from_pangenome_file(pangenome_filename=pangenome_filename, gene_filter = gene_filter, - output=tmpdir, compress=compress, disable_bar=disable_bar) - - genes_sequence_tmp_file = tmpdir / f"{gene_filter}_genes.fna{'.gz' if compress else ''}" - translate_db = translate_genes(sequences=genes_sequence_tmp_file, tmpdir=tmpdir, - cpu=cpu, is_single_line_fasta=True, code=code) - outpath = output / f"{gene_filter}_protein_genes.faa" - - logging.getLogger("PPanGGOLiN").info("Translating nucleotide gene sequence in protein sequences with mmseqs convert2fasta") + + logging.getLogger("PPanGGOLiN").info( + "Translating nucleotide gene sequence in protein sequences with mmseqs convert2fasta" + ) cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) - run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + run_subprocess( + cmd, msg="MMSeqs convert2fasta failed with the following error:\n" + ) if compress: with write_compressed_or_not(outpath, compress) as compress_file: with open(outpath) as sequence_file: shutil.copyfileobj(sequence_file, compress_file) outpath.unlink() - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}.gz'") + logging.getLogger("PPanGGOLiN").info( + f"Done writing the gene protein sequences : '{outpath}.gz'" + ) else: - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}'") - + logging.getLogger("PPanGGOLiN").info( + f"Done writing the gene protein sequences : '{outpath}'" + ) def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: @@ -164,7 +254,7 @@ def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: if line.startswith(">"): in_fasta_part = True if in_fasta_part: - if line.startswith('>'): + if line.startswith(">"): if seq != "": sequence_dict[seqname] = seq seq = "" @@ -191,12 +281,12 @@ def read_fasta_gbk(file_path: Path) -> Dict[str, str]: while len(lines) != 0: line = lines.pop() # beginning of contig - if line.startswith('LOCUS'): + if line.startswith("LOCUS"): contig_locus_id = line.split()[1] # If contig_id is not specified in VERSION afterward like with Prokka, # in that case we use the one in LOCUS. - while not line.startswith('FEATURES'): - if line.startswith('VERSION'): + while not line.startswith("FEATURES"): + if line.startswith("VERSION"): contig_id = line[12:].strip() line = lines.pop() if contig_id == "": @@ -205,7 +295,7 @@ def read_fasta_gbk(file_path: Path) -> Dict[str, str]: line = lines.pop() # stuff line = lines.pop() # first sequence line. sequence = "" - while not line.startswith('//'): + while not line.startswith("//"): sequence += line[10:].replace(" ", "").strip().upper() line = lines.pop() # get each gene's sequence. @@ -236,8 +326,10 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: # check_contig_names if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: - raise KeyError(f"Contig name inconsistency detected in genome '{organism.name}' between the " - f"information stored in the pangenome file and the contigs found in '{genome_file}'.") + raise KeyError( + f"Contig name inconsistency detected in genome '{organism.name}' between the " + f"information stored in the pangenome file and the contigs found in '{genome_file}'." + ) return contig_to_sequence @@ -254,13 +346,20 @@ def write_spaced_fasta(sequence: str, space: int = 60) -> str: seq = "" j = 0 while j < len(sequence): - seq += sequence[j:j + space] + "\n" + seq += sequence[j : j + space] + "\n" j += space return seq -def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fasta: Path = None, anno: Path = None, - compress: bool = False, disable_bar: bool = False): +def write_regions_sequences( + pangenome: Pangenome, + output: Path, + regions: str, + fasta: Path = None, + anno: Path = None, + compress: bool = False, + disable_bar: bool = False, +): """ Write representative amino acid sequences of gene families. @@ -274,23 +373,32 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa :raises SyntaxError: if no tabulation are found in list genomes file """ - assert fasta is not None or anno is not None, "Write regions requires to use anno or fasta, not any provided" + assert ( + fasta is not None or anno is not None + ), "Write regions requires to use anno or fasta, not any provided" organisms_file = fasta if fasta is not None else anno org_dict = {} for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise ValueError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise ValueError( + f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'" + ) org_dict[elements[0]] = Path(elements[1]) - if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other - org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) + if not org_dict[ + elements[0] + ].exists(): # Check tsv sanity test if it's not one it's the other + org_dict[elements[0]] = organisms_file.parent.joinpath( + org_dict[elements[0]] + ) logging.getLogger("PPanGGOLiN").info(f"Writing {regions} rgp genomic sequences...") if regions == "complete": - regions_to_write = (region for region in pangenome.regions - if not region.is_contig_border) + regions_to_write = ( + region for region in pangenome.regions if not region.is_contig_border + ) else: regions_to_write = pangenome.regions @@ -306,15 +414,31 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa genome_sequence = read_genome_file(org_dict[organism.name], organism) fasta.write(f">{region.name}\n") fasta.write( - write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: " - f"'{outname}{'.gz' if compress else ''}'") - - -def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, - soft_core: float = 0.95, regions: str = None, genes: str = None, proteins: str = None, - gene_families: str = None, prot_families: str = None, compress: bool = False, - disable_bar: bool = False, **translate_kwgs): + write_spaced_fasta( + genome_sequence[region.contig.name][region.start : region.stop], 60 + ) + ) + logging.getLogger("PPanGGOLiN").info( + f"Done writing the regions nucleotide sequences: " + f"'{outname}{'.gz' if compress else ''}'" + ) + + +def write_sequence_files( + pangenome: Pangenome, + output: Path, + fasta: Path = None, + anno: Path = None, + soft_core: float = 0.95, + regions: str = None, + genes: str = None, + proteins: str = None, + gene_families: str = None, + prot_families: str = None, + compress: bool = False, + disable_bar: bool = False, + **translate_kwgs, +): """ Main function to write sequence file from pangenome @@ -332,49 +456,87 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param disable_bar: Disable progress bar """ - if gene_families is not None: - logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " - "of the gene families by reading the pangenome file directly.") - - write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter= gene_families, soft_core=soft_core, - output=output, compress=compress, disable_bar=disable_bar) + logging.getLogger("PPanGGOLiN").info( + "Writing the representative nucleotide sequences " + "of the gene families by reading the pangenome file directly." + ) + + write_fasta_gene_fam_from_pangenome_file( + pangenome_filename=pangenome.file, + family_filter=gene_families, + soft_core=soft_core, + output=output, + compress=compress, + disable_bar=disable_bar, + ) gene_families = None - + if prot_families is not None: - logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " - "of the gene families by reading the pangenome file directly.") - write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families, soft_core=soft_core, - output=output, compress=compress, disable_bar=disable_bar) + logging.getLogger("PPanGGOLiN").info( + "Writing the representative protein sequences " + "of the gene families by reading the pangenome file directly." + ) + write_fasta_prot_fam_from_pangenome_file( + pangenome_filename=pangenome.file, + family_filter=prot_families, + soft_core=soft_core, + output=output, + compress=compress, + disable_bar=disable_bar, + ) prot_families = None - if genes is not None: - logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") - write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes, soft_core=soft_core, - output=output, compress=compress, disable_bar=disable_bar) + logging.getLogger("PPanGGOLiN").info( + "Writing gene nucleotide sequences by reading the pangenome file directly." + ) + write_genes_from_pangenome_file( + pangenome_filename=pangenome.file, + gene_filter=genes, + soft_core=soft_core, + output=output, + compress=compress, + disable_bar=disable_bar, + ) genes = None if proteins is not None: - - logging.getLogger("PPanGGOLiN").info("Writing gene protein sequences by reading the pangenome file directly.") - write_gene_protein_sequences(pangenome_filename=pangenome.file, output=output, gene_filter=proteins, soft_core=soft_core, compress=compress, disable_bar=disable_bar, - **translate_kwgs) - + logging.getLogger("PPanGGOLiN").info( + "Writing gene protein sequences by reading the pangenome file directly." + ) + write_gene_protein_sequences( + pangenome_filename=pangenome.file, + output=output, + gene_filter=proteins, + soft_core=soft_core, + compress=compress, + disable_bar=disable_bar, + **translate_kwgs, + ) + proteins = None if regions is not None: # load pangenome when writing region sequence - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_rgp=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_rgp=True, + disable_bar=disable_bar, + ) + + write_regions_sequences( + pangenome, output, regions, fasta, anno, compress, disable_bar + ) - write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) - def launch(args: argparse.Namespace): """ @@ -383,19 +545,31 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ check_write_sequences_args(args) - translate_kwgs = {"code": args.translation_table, - "cpu": args.cpu, - "tmp": args.tmpdir, - "keep_tmp": args.keep_tmp} + translate_kwgs = { + "code": args.translation_table, + "cpu": args.cpu, + "tmp": args.tmpdir, + "keep_tmp": args.keep_tmp, + } mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - - write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, proteins=args.proteins, - gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, - disable_bar=args.disable_prog_bar, **translate_kwgs) + write_sequence_files( + pangenome, + args.output, + fasta=args.fasta, + anno=args.anno, + soft_core=args.soft_core, + regions=args.regions, + genes=args.genes, + proteins=args.proteins, + gene_families=args.gene_families, + prot_families=args.prot_families, + compress=args.compress, + disable_bar=args.disable_prog_bar, + **translate_kwgs, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -406,7 +580,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("fasta", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "fasta", formatter_class=argparse.RawTextHelpFormatter + ) parser_seq(parser) return parser @@ -424,7 +600,9 @@ def filter_values(arg_value: str): if arg_value in poss_values or module_regex.match(arg_value): return arg_value else: - raise argparse.ArgumentTypeError(f"Invalid choice '{arg_value}'. {poss_values_log}") + raise argparse.ArgumentTypeError( + f"Invalid choice '{arg_value}'. {poss_values_log}" + ) def parser_seq(parser: argparse.ArgumentParser): @@ -434,61 +612,129 @@ def parser_seq(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=Path, - help="Output directory where the file(s) will be written") - - context = parser.add_argument_group(title="Contextually required arguments", - description="With --regions, the following arguments are required:") - context.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per genome.") - context.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per genome. " - "If this is provided, those annotations will be used.") - - onereq = parser.add_argument_group(title="Output file", - description="At least one of the following argument is required. " - "Indicating 'all' writes all elements. Writing a partition " - "('persistent', 'shell' or 'cloud') write the elements associated " - "to said partition. Writing 'rgp' writes elements associated to RGPs" - ) - onereq.add_argument("--genes", required=False, type=filter_values, - help=f"Write all nucleotide CDS sequences. {poss_values_log}") - onereq.add_argument("--proteins", required=False, type=filter_values, - help=f"Write representative amino acid sequences of genes. {poss_values_log}") - onereq.add_argument("--prot_families", required=False, type=filter_values, - help=f"Write representative amino acid sequences of gene families. {poss_values_log}") - onereq.add_argument("--gene_families", required=False, type=filter_values, - help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-o", + "--output", + required=True, + type=Path, + help="Output directory where the file(s) will be written", + ) + + context = parser.add_argument_group( + title="Contextually required arguments", + description="With --regions, the following arguments are required:", + ) + context.add_argument( + "--fasta", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per genome.", + ) + context.add_argument( + "--anno", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " + "If this is provided, those annotations will be used.", + ) + + onereq = parser.add_argument_group( + title="Output file", + description="At least one of the following argument is required. " + "Indicating 'all' writes all elements. Writing a partition " + "('persistent', 'shell' or 'cloud') write the elements associated " + "to said partition. Writing 'rgp' writes elements associated to RGPs", + ) + onereq.add_argument( + "--genes", + required=False, + type=filter_values, + help=f"Write all nucleotide CDS sequences. {poss_values_log}", + ) + onereq.add_argument( + "--proteins", + required=False, + type=filter_values, + help=f"Write representative amino acid sequences of genes. {poss_values_log}", + ) + onereq.add_argument( + "--prot_families", + required=False, + type=filter_values, + help=f"Write representative amino acid sequences of gene families. {poss_values_log}", + ) + onereq.add_argument( + "--gene_families", + required=False, + type=filter_values, + help=f"Write representative nucleotide sequences of gene families. {poss_values_log}", + ) optional = parser.add_argument_group(title="Optional arguments") # could make choice to allow customization - optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"], - help="Write the RGP nucleotide sequences (requires --anno or --fasta used to compute " - "the pangenome to be given)") - optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold to use if 'softcore' partition is chosen") - optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") - optional.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - optional.add_argument("--cpu", required=False, default=1, type=int, help="Number of available threads") - optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") - - -if __name__ == '__main__': + optional.add_argument( + "--regions", + required=False, + type=str, + choices=["all", "complete"], + help="Write the RGP nucleotide sequences (requires --anno or --fasta used to compute " + "the pangenome to be given)", + ) + optional.add_argument( + "--soft_core", + required=False, + type=restricted_float, + default=0.95, + help="Soft core threshold to use if 'softcore' partition is chosen", + ) + optional.add_argument( + "--compress", + required=False, + action="store_true", + help="Compress the files in .gz", + ) + optional.add_argument( + "--translation_table", + required=False, + default="11", + help="Translation table (genetic code) to use.", + ) + optional.add_argument( + "--cpu", required=False, default=1, type=int, help="Number of available threads" + ) + optional.add_argument( + "--tmpdir", + required=False, + type=Path, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + optional.add_argument( + "--keep_tmp", + required=False, + default=False, + action="store_true", + help="Keeping temporary files (useful for debugging).", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_seq(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index b183932c..6e90fe75 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -38,22 +38,49 @@ def write_legend_items(features: List[str], module_to_color: Dict[Module, str] = "dark red": "#ca5c55", } - legend_data = {"items": [ - {"name": "persistent", "swatchColor": main_colors['orange'], "decoration": "arrow"}, - {"name": "shell", "swatchColor": main_colors['light green'], "decoration": "arrow"}, - {"name": "cloud", "swatchColor": main_colors['light blue'], "decoration": "arrow"}, - {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"}, - ] + legend_data = { + "items": [ + { + "name": "persistent", + "swatchColor": main_colors["orange"], + "decoration": "arrow", + }, + { + "name": "shell", + "swatchColor": main_colors["light green"], + "decoration": "arrow", + }, + { + "name": "cloud", + "swatchColor": main_colors["light blue"], + "decoration": "arrow", + }, + { + "name": "RNA", + "swatchColor": main_colors["purple"], + "decoration": "arrow", + }, + ] } if "rgp" in features or "all" in features: - legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}), + legend_data["items"].append( + { + "name": "RGP", + "swatchColor": main_colors["dark green"], + "decoration": "arc", + } + ), if module_to_color is not None and ("modules" in features or "all" in features): for mod, color in sorted(module_to_color.items(), key=lambda x: x[0].ID): - legend_data["items"].append({"name": str(mod), - "decoration": "arc", - "swatchColor": color, - "visible": False}) + legend_data["items"].append( + { + "name": str(mod), + "decoration": "arc", + "swatchColor": color, + "visible": False, + } + ) return legend_data @@ -74,36 +101,42 @@ def write_tracks(features: List[str]): "thicknessRatio": 1, "dataType": "feature", "dataMethod": "source", - "dataKeys": "Gene" + "dataKeys": "Gene", } ] if "rgp" in features or "all" in features: - tracks.append({ - "name": "RGP", - "separateFeaturesBy": "None", - "position": "inside", - "thicknessRatio": 1, - "dataType": "feature", - "dataMethod": "source", - "dataKeys": "RGP" - }) + tracks.append( + { + "name": "RGP", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "RGP", + } + ) if "modules" in features or "all" in features: - tracks.append({ - "name": "Module", - "separateFeaturesBy": "None", - "position": "inside", - "thicknessRatio": 1, - "dataType": "feature", - "dataMethod": "source", - "dataKeys": "Module" - }) + tracks.append( + { + "name": "Module", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "Module", + } + ) return tracks -def initiate_proksee_data(features: List[str], organism: Organism, module_to_color: Dict[Module, str] = None): +def initiate_proksee_data( + features: List[str], organism: Organism, module_to_color: Dict[Module, str] = None +): """ Initializes ProkSee data structure with legends, tracks, and captions. @@ -120,24 +153,26 @@ def initiate_proksee_data(features: List[str], organism: Organism, module_to_col "name": f"{organism.name} annotated with PPanGGOLiN", "position": "bottom-center", "font": "sans-serif,plain,18", - "backgroundColor": "rgba(255,255,255,0.4)" + "backgroundColor": "rgba(255,255,255,0.4)", } cgview_data = { "name": "PPanGGOLiN annotation at genome level", "version": "1.5.0", - 'settings': {}, + "settings": {}, "legend": proksee_legends, "tracks": proksee_tracks, "sequence": {}, - 'captions': [proksee_captions], - "meta": organism.formatted_metadata_dict() # metadata + "captions": [proksee_captions], + "meta": organism.formatted_metadata_dict(), # metadata } return {"cgview": cgview_data} -def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None, metadata_sep: str = "|") -> List[Dict]: +def write_contig( + organism: Organism, genome_sequences: Dict[str, str] = None, metadata_sep: str = "|" +) -> List[Dict]: """ Writes contig data for a given organism in proksee format. @@ -157,7 +192,11 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None, me metadata_for_proksee.update(genome_metadata) metadata_for_proksee.update(contig.formatted_metadata_dict(metadata_sep)) - metadata_for_proksee = {key:val for key,val in metadata_for_proksee.items() if val not in ["None", '']} + metadata_for_proksee = { + key: val + for key, val in metadata_for_proksee.items() + if val not in ["None", ""] + } contig_info = { "name": contig.name, @@ -167,110 +206,142 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None, me } if genome_sequences: - contig_info['seq'] = genome_sequences.get(contig.name, "") + contig_info["seq"] = genome_sequences.get(contig.name, "") contigs_data_list.append(contig_info) return contigs_data_list -def write_genes(organism: Organism, multigenics: Set[GeneFamily], metadata_sep: str = "|", disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]: + +def write_genes( + organism: Organism, + multigenics: Set[GeneFamily], + metadata_sep: str = "|", + disable_bar: bool = True, +) -> Tuple[List[Dict], Dict[str, List[Gene]]]: """ Writes gene data for a given organism, including both protein-coding genes and RNA genes. :param organism: The organism for which gene data will be written. :param metadata_sep: The separator used to join multiple metadata values :param disable_bar: A flag to disable the progress bar when processing genes (default: True). - + :return: List of gene data in a structured format and a dictionary mapping gene families to genes. """ genes_data_list = [] gf2gene = defaultdict(list) # Process protein-coding genes - for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=disable_bar): + for gene in tqdm( + organism.genes, + total=organism.number_of_genes(), + unit="genes", + disable=disable_bar, + ): gf = gene.family gf2gene[gf.name].append(gene) # Add gene info in meta of proksee - metadata_for_proksee = {"ID":gene.ID , - "family":gene.family.name} + metadata_for_proksee = {"ID": gene.ID, "family": gene.family.name} if multigenics and gf in multigenics: - metadata_for_proksee['multigenic'] = True + metadata_for_proksee["multigenic"] = True if gene.name: - metadata_for_proksee['name'] = gene.name + metadata_for_proksee["name"] = gene.name if gene.product: - metadata_for_proksee['product'] = gene.product + metadata_for_proksee["product"] = gene.product if gene.spot: - metadata_for_proksee['spot'] = gene.spot.ID + metadata_for_proksee["spot"] = gene.spot.ID if gene.module: - metadata_for_proksee['module'] = gene.module.ID + metadata_for_proksee["module"] = gene.module.ID if gene.has_joined_coordinates: - metadata_for_proksee['coordinates'] = gene.string_coordinates() + metadata_for_proksee["coordinates"] = gene.string_coordinates() if gene.overlaps_contig_edge: - metadata_for_proksee['overlaps_contig_edge'] = gene.overlaps_contig_edge - - metadata_for_proksee.update({f"gene_{k}": v for k, v in gene.formatted_metadata_dict(metadata_sep).items()}) - metadata_for_proksee.update({f"family_{k}": v for k, v in gene.family.formatted_metadata_dict(metadata_sep).items()}) - + metadata_for_proksee["overlaps_contig_edge"] = gene.overlaps_contig_edge + + metadata_for_proksee.update( + { + f"gene_{k}": v + for k, v in gene.formatted_metadata_dict(metadata_sep).items() + } + ) + metadata_for_proksee.update( + { + f"family_{k}": v + for k, v in gene.family.formatted_metadata_dict(metadata_sep).items() + } + ) # Proksee handles circularity effectively. When a gene extends beyond the edge of the contig, # Proksee correctly displays the gene with its initial start (at the end of the contig) and final stop (at the beginning of the contig). # However, this only applies when there's a single contig. If there are multiple contigs, the feature overlaps all contigs, causing confusion. - #In case of frameshift we don't want to split the gene by its coordinates + # In case of frameshift we don't want to split the gene by its coordinates # When the gene overlaps_contig_edge the gene is split in two piece for correct visualisation - coordinates_to_display = gene.coordinates if gene.overlaps_contig_edge else [(gene.start, gene.stop)] + coordinates_to_display = ( + gene.coordinates if gene.overlaps_contig_edge else [(gene.start, gene.stop)] + ) for start, stop in coordinates_to_display: - genes_data_list.append({ - "name": gene.name, - "type": "Gene", - "contig": gene.contig.name, - "start": start, - "stop": stop, - "strand": 1 if gene.strand == "+" else -1, - "product": gene.product, - "tags": [gene.family.named_partition], - "source": "Gene", - "legend": gene.family.named_partition, - "meta": metadata_for_proksee - }) + genes_data_list.append( + { + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": start, + "stop": stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [gene.family.named_partition], + "source": "Gene", + "legend": gene.family.named_partition, + "meta": metadata_for_proksee, + } + ) # Process RNA genes - for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=disable_bar): - - metadata_for_proksee = {"ID":gene.ID} + for gene in tqdm( + organism.rna_genes, + total=organism.number_of_rnas(), + unit="rnas", + disable=disable_bar, + ): + + metadata_for_proksee = {"ID": gene.ID} if gene.product: - metadata_for_proksee['product'] = gene.product + metadata_for_proksee["product"] = gene.product metadata_for_proksee.update(gene.formatted_metadata_dict(metadata_sep)) - coordinates_to_display = gene.coordinates if gene.overlaps_contig_edge else [(gene.start, gene.stop)] + coordinates_to_display = ( + gene.coordinates if gene.overlaps_contig_edge else [(gene.start, gene.stop)] + ) for start, stop in coordinates_to_display: - genes_data_list.append({ - "name": gene.name, - "type": "Gene", - "contig": gene.contig.name, - "start": start, - "stop": stop, - "strand": 1 if gene.strand == "+" else -1, - "product": gene.product, - "tags": [], - "source": "Gene", - "legend": "RNA", - "meta": metadata_for_proksee - }) + genes_data_list.append( + { + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": start, + "stop": stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [], + "source": "Gene", + "legend": "RNA", + "meta": metadata_for_proksee, + } + ) return genes_data_list, gf2gene -def write_rgp(organism: Organism, metadata_sep:str = "|"): +def write_rgp(organism: Organism, metadata_sep: str = "|"): """ Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format. :param organism: The specific organism for which RGP data will be written. @@ -283,28 +354,32 @@ def write_rgp(organism: Organism, metadata_sep:str = "|"): # Iterate through each RGP in the pangenome for rgp in organism.regions: # Create an entry for the RGP in the data list - metadata_for_proksee = {"spot":f"{rgp.spot.ID}" if rgp.spot else "No_spot"} + metadata_for_proksee = {"spot": f"{rgp.spot.ID}" if rgp.spot else "No_spot"} if rgp.overlaps_contig_edge: - metadata_for_proksee['overlaps_contig_edge'] = rgp.overlaps_contig_edge + metadata_for_proksee["overlaps_contig_edge"] = rgp.overlaps_contig_edge metadata_for_proksee.update(rgp.formatted_metadata_dict(metadata_sep)) for start, stop in rgp.coordinates: - rgp_data_list.append({ - "name": rgp.name, - "contig": rgp.contig.name, - "start": start, - "stop": stop, - "legend": "RGP", - "source": "RGP", - "tags": [f"spot_{rgp.spot.ID}" if rgp.spot else "No_spot"], - "meta": metadata_for_proksee - }) + rgp_data_list.append( + { + "name": rgp.name, + "contig": rgp.contig.name, + "start": start, + "stop": stop, + "legend": "RGP", + "source": "RGP", + "tags": [f"spot_{rgp.spot.ID}" if rgp.spot else "No_spot"], + "meta": metadata_for_proksee, + } + ) return rgp_data_list -def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]], metadata_sep:str = "|"): +def write_modules( + organism: Organism, gf2genes: Dict[str, List[Gene]], metadata_sep: str = "|" +): """ Writes module data in proksee format for a list of modules associated with a given organism. @@ -322,37 +397,44 @@ def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]], metadata_ if gf_intersection: # Calculate the completion percentage - metadata_for_proksee = {'completion': round(100 * len(gf_intersection) / len(set(module.families)), 1)} + metadata_for_proksee = { + "completion": round( + 100 * len(gf_intersection) / len(set(module.families)), 1 + ) + } metadata_for_proksee.update(module.formatted_metadata_dict(metadata_sep)) # Create module data entries for genes within intersecting gene families for gf in gf_intersection: for gene in gf2genes[gf.name]: for start, stop in gene.coordinates: - modules_data_list.append({ - "name": str(module), - "presence": "Module", - "start": start, - "stop": stop, - "contig": gene.contig.name, - "legend": str(module), - "source": "Module", - "tags": [], - "meta": metadata_for_proksee - }) - - + modules_data_list.append( + { + "name": str(module), + "presence": "Module", + "start": start, + "stop": stop, + "contig": gene.contig.name, + "legend": str(module), + "source": "Module", + "tags": [], + "meta": metadata_for_proksee, + } + ) return modules_data_list -def write_proksee_organism(organism: Organism, output_file: Path, - features: List[str] = None, - module_to_colors: Dict[Module, str] = None, - genome_sequences: Dict[str, str] = None, - multigenics: Set[GeneFamily] = [], - metadata_sep: str = "|", - compress: bool = False): +def write_proksee_organism( + organism: Organism, + output_file: Path, + features: List[str] = None, + module_to_colors: Dict[Module, str] = None, + genome_sequences: Dict[str, str] = None, + multigenics: Set[GeneFamily] = [], + metadata_sep: str = "|", + compress: bool = False, +): """ Writes ProkSee data for a given organism, including contig information, genes colored by partition, RGPs, and modules. The resulting data is saved as a JSON file in the specified output file. @@ -367,17 +449,25 @@ def write_proksee_organism(organism: Organism, output_file: Path, """ proksee_data = initiate_proksee_data(features, organism, module_to_colors) - proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences, metadata_sep=metadata_sep) + proksee_data["cgview"]["sequence"]["contigs"] = write_contig( + organism, genome_sequences, metadata_sep=metadata_sep + ) - genes_features, gf2genes = write_genes(organism, multigenics=multigenics, metadata_sep=metadata_sep) + genes_features, gf2genes = write_genes( + organism, multigenics=multigenics, metadata_sep=metadata_sep + ) proksee_data["cgview"]["features"] = genes_features if ("rgp" in features or "all" in features) and organism.regions is not None: - proksee_data["cgview"]["features"] += write_rgp(organism=organism, metadata_sep=metadata_sep) + proksee_data["cgview"]["features"] += write_rgp( + organism=organism, metadata_sep=metadata_sep + ) if module_to_colors is not None and ("modules" in features or "all" in features): - proksee_data["cgview"]["features"] += write_modules(organism=organism, gf2genes=gf2genes, metadata_sep=metadata_sep) + proksee_data["cgview"]["features"] += write_modules( + organism=organism, gf2genes=gf2genes, metadata_sep=metadata_sep + ) logging.debug(f"Write ProkSee for {organism.name}") with write_compressed_or_not(output_file, compress=compress) as out_json: diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 6f1504d0..e9f843db 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -59,7 +59,7 @@ def __init__(self, family_id: int, name: str): """ assert isinstance(family_id, int), "GeneFamily object id should be an integer" assert isinstance(name, str), "GeneFamily object name should be a string" - assert name != '', "GeneFamily object cannot be created with an empty name" + assert name != "", "GeneFamily object cannot be created with an empty name" super().__init__() self.name = str(name) @@ -76,12 +76,9 @@ def __init__(self, family_id: int, name: str): self.bitarray = None def __repr__(self) -> str: - """Family representation - """ + """Family representation""" return f"{self.ID}: {self.name}" - - def __len__(self) -> int: """Get the number of genes in the family @@ -90,7 +87,7 @@ def __len__(self) -> int: return len(self._genes_getter) def __setitem__(self, identifier: str, gene: Gene): - """ Set gene to Gene Family + """Set gene to Gene Family :param identifier: ID of the gene :param gene: Gene object to add @@ -102,11 +99,17 @@ def __setitem__(self, identifier: str, gene: Gene): # TODO look at change start for position if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + raise TypeError( + f"'Gene' type was expected but you provided a '{type(gene)}' type object" + ) if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) if identifier in self._genes_getter: - raise KeyError(f"Gene with name {identifier} already exists in the gene family") + raise KeyError( + f"Gene with name {identifier} already exists in the gene family" + ) self._genes_getter[identifier] = gene # TODO define eq function @@ -123,11 +126,15 @@ def __getitem__(self, identifier: str) -> Gene: :raises KeyError: Gene with the given identifier does not exist in the contig """ if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) try: gene = self._genes_getter[identifier] except KeyError: - raise KeyError(f"Gene with the ID: {identifier} does not exist in the family") + raise KeyError( + f"Gene with the ID: {identifier} does not exist in the family" + ) else: return gene @@ -140,11 +147,15 @@ def __delitem__(self, identifier: str): :raises KeyError: Gene with the given identifier does not exist in the contig """ if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) try: del self._genes_getter[identifier] except KeyError: - raise KeyError(f"Gene with the name: {identifier} does not exist in the family") + raise KeyError( + f"Gene with the name: {identifier} does not exist in the family" + ) def add(self, gene: Gene): """Add a gene to the gene family, and sets the gene's :attr:family accordingly. @@ -154,7 +165,9 @@ def add(self, gene: Gene): :raises TypeError: If the provided `gene` is of the wrong type """ if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") + raise TypeError( + f"'Gene' type object was expected, but '{type(gene)}' type object was provided." + ) self[gene.ID] = gene gene.family = self if gene.organism is not None and gene.organism in self._genePerOrg: @@ -171,7 +184,9 @@ def get(self, identifier: str) -> Gene: :raises TypeError: If the identifier is not instance string """ if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) return self[identifier] def remove(self, identifier): @@ -184,10 +199,11 @@ def remove(self, identifier): :raises TypeError: If the identifier is not instance string """ if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) del self[identifier] - @property def representative(self) -> Gene: """Get the representative gene of the family @@ -200,13 +216,13 @@ def representative(self) -> Gene: @representative.setter def representative(self, gene: Gene) -> None: - """Set the representative gene of the family - """ + """Set the representative gene of the family""" if not isinstance(gene, Gene): - raise TypeError(f"Representative gene should be a Gene. Found a '{type(gene)}' type object") + raise TypeError( + f"Representative gene should be a Gene. Found a '{type(gene)}' type object" + ) self._representative = gene - def contains_gene_id(self, identifier): """ Check if the family contains already a gene id @@ -218,12 +234,13 @@ def contains_gene_id(self, identifier): :raises TypeError: If the identifier is not instance string """ if not isinstance(identifier, str): - raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + raise TypeError( + f"Gene ID should be a string. You provided a '{type(identifier)}' type object" + ) return identifier in self._genes_getter - - #TODO define __eq__ + # TODO define __eq__ @property def partition(self): return self._partition if self._partition is not None else "" @@ -251,7 +268,6 @@ def named_partition(self) -> str: else: return "undefined" - @property def edges(self) -> Generator[Edge, None, None]: """Returns all Edges that are linked to this gene family @@ -313,34 +329,29 @@ def module(self, module: Module): @property def number_of_neighbors(self) -> int: - """Get the number of neighbor for the current gene family - """ + """Get the number of neighbor for the current gene family""" return len(self._edges_getter.keys()) @property def number_of_edges(self) -> int: - """Get the number of edges for the current gene family - """ + """Get the number of edges for the current gene family""" return len(self._edges_getter.values()) @property def number_of_genes(self) -> int: - """Get the number of genes for the current gene family - """ + """Get the number of genes for the current gene family""" return len(self._genes_getter) @property def number_of_organisms(self) -> int: - """Get the number of organisms for the current gene family - """ + """Get the number of organisms for the current gene family""" if len(self._genePerOrg) == 0: _ = self.get_org_dict() return len(self._genePerOrg.keys()) @property def number_of_spots(self) -> int: - """Get the number of spots for the current gene family - """ + """Get the number of spots for the current gene family""" return len(self._spots) @property @@ -361,8 +372,7 @@ def set_edge(self, target: GeneFamily, edge: Edge): self._edges_getter[target] = edge def get_edge(self, target: GeneFamily) -> Edge: - """Get the edge by the target gene family neighbor - """ + """Get the edge by the target gene family neighbor""" return self._edges_getter[target] def add_sequence(self, seq: str): @@ -379,7 +389,8 @@ def add_spot(self, spot: Spot): :param spot: Spot belonging to the family """ - from ppanggolin.region import Spot # prevent circular import error + from ppanggolin.region import Spot # prevent circular import error + if not isinstance(spot, Spot): raise TypeError(f"A spot object is expected, you give a {type(spot)}") self._spots.add(spot) @@ -389,12 +400,13 @@ def set_module(self, module: Module): :param module: Module belonging to the family """ - from ppanggolin.region import Module # prevent circular import error + from ppanggolin.region import Module # prevent circular import error + if not isinstance(module, Module): raise TypeError(f"A module object is expected, you give a {type(module)}") self._module = module - def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): + def mk_bitarray(self, index: Dict[Organism, int], partition: str = "all"): """Produces a bitarray representing the presence/absence of the family in the pangenome using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. @@ -402,18 +414,18 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): :param partition: partition used to compute bitarray """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member - if partition == 'all': + if partition == "all": logging.getLogger("PPanGGOLiN").debug("all") for org in self.organisms: self.bitarray[index[org]] = 1 - elif partition in ['shell', 'cloud']: + elif partition in ["shell", "cloud"]: logging.getLogger("PPanGGOLiN").debug("shell, cloud") if self.named_partition == partition: for org in self.organisms: self.bitarray[index[org]] = 1 - elif partition == 'accessory': + elif partition == "accessory": logging.getLogger("PPanGGOLiN").debug("accessory") - if self.named_partition in ['shell', 'cloud']: + if self.named_partition in ["shell", "cloud"]: for org in self.organisms: self.bitarray[index[org]] = 1 @@ -439,10 +451,11 @@ def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: if len(self._genePerOrg) == 0: _ = self.get_org_dict() if org not in self._genePerOrg: - raise KeyError(f"Genome {org.name} does not have the gene family: {self.name}") + raise KeyError( + f"Genome {org.name} does not have the gene family: {self.name}" + ) yield from self._genePerOrg[org] - def is_single_copy(self, dup_margin: float, exclude_fragment: bool) -> bool: """ Checks if the gene family is considered single copy based on the provided criteria. @@ -467,7 +480,9 @@ def duplication_ratio(self, exclude_fragment: bool) -> bool: # Check if the family is in multicopy in all organisms for fam_genes_in_org in self.get_org_dict().values(): if exclude_fragment: - genes_count = len([gene for gene in fam_genes_in_org if not gene.is_fragment]) + genes_count = len( + [gene for gene in fam_genes_in_org if not gene.is_fragment] + ) else: genes_count = len(fam_genes_in_org) diff --git a/ppanggolin/genetic_codes.py b/ppanggolin/genetic_codes.py index af69157a..cdbf7ac2 100644 --- a/ppanggolin/genetic_codes.py +++ b/ppanggolin/genetic_codes.py @@ -1,6 +1,8945 @@ #!/usr/bin/env python3 -def genetic_codes(code): - return {'1': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'HTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'MTG': 'M', 'YTG': 'M', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '2': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': '*', 'AGG': '*', 'AGA': '*', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': '*', 'AGG': '*', 'AGA': '*', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATB': 'M', 'ATD': 'M', 'ATH': 'M', 'ATK': 'M', 'ATN': 'M', 'ATW': 'M', 'ATC': 'M', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATM': 'M', 'ATV': 'M', 'ATY': 'M', 'ATS': 'M', 'ATT': 'M', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '3': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'T', 'CTD': 'T', 'CTH': 'T', 'CTK': 'T', 'CTN': 'T', 'CTW': 'T', 'CTC': 'T', 'CTR': 'T', 'CTG': 'T', 'CTA': 'T', 'CTM': 'T', 'CTV': 'T', 'CTY': 'T', 'CTS': 'T', 'CTT': 'T', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'T', 'CTD': 'T', 'CTH': 'T', 'CTK': 'T', 'CTN': 'T', 'CTW': 'T', 'CTC': 'T', 'CTR': 'T', 'CTG': 'T', 'CTA': 'T', 'CTM': 'T', 'CTV': 'T', 'CTY': 'T', 'CTS': 'T', 'CTT': 'T', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '4': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'BTG': 'M', 'DTG': 'M', 'HTG': 'M', 'KTG': 'M', 'NTG': 'M', 'WTR': 'M', 'WTG': 'M', 'WTA': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATB': 'M', 'ATD': 'M', 'ATH': 'M', 'ATK': 'M', 'ATN': 'M', 'ATW': 'M', 'ATC': 'M', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATM': 'M', 'ATV': 'M', 'ATY': 'M', 'ATS': 'M', 'ATT': 'M', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'MTG': 'M', 'VTG': 'M', 'YTG': 'M', 'STG': 'M', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'M', 'TTG': 'M', 'TTA': 'M', 'TTY': 'F', 'TTT': 'F'}}, '5': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'DTG': 'M', 'KTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATB': 'M', 'ATD': 'M', 'ATH': 'M', 'ATK': 'M', 'ATN': 'M', 'ATW': 'M', 'ATC': 'M', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATM': 'M', 'ATV': 'M', 'ATY': 'M', 'ATS': 'M', 'ATT': 'M', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '6': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAR': 'Q', 'YAG': 'Q', 'YAA': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'Q', 'TAG': 'Q', 'TAA': 'Q', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAR': 'Q', 'YAG': 'Q', 'YAA': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'Q', 'TAG': 'Q', 'TAA': 'Q', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '9': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '10': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGH': 'C', 'TGW': 'C', 'TGC': 'C', 'TGG': 'W', 'TGA': 'C', 'TGM': 'C', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGH': 'C', 'TGW': 'C', 'TGC': 'C', 'TGG': 'W', 'TGA': 'C', 'TGM': 'C', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '11': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'BTG': 'M', 'DTG': 'M', 'HTG': 'M', 'KTG': 'M', 'NTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATB': 'M', 'ATD': 'M', 'ATH': 'M', 'ATK': 'M', 'ATN': 'M', 'ATW': 'M', 'ATC': 'M', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATM': 'M', 'ATV': 'M', 'ATY': 'M', 'ATS': 'M', 'ATT': 'M', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'MTG': 'M', 'VTG': 'M', 'YTG': 'M', 'YTA': 'L', 'STG': 'M', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '12': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'S', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'MTG': 'M', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '13': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RGR': 'G', 'RGG': 'G', 'RGA': 'G', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'G', 'AGG': 'G', 'AGA': 'G', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'DTG': 'M', 'KTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RGR': 'G', 'RGG': 'G', 'RGA': 'G', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'G', 'AGG': 'G', 'AGA': 'G', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '14': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAH': 'Y', 'TAW': 'Y', 'TAC': 'Y', 'TAG': '*', 'TAA': 'Y', 'TAM': 'Y', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAH': 'Y', 'TAW': 'Y', 'TAC': 'Y', 'TAG': '*', 'TAA': 'Y', 'TAM': 'Y', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '15': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAG': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'Q', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAG': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'Q', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '16': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TWG': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'L', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TWG': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'L', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '21': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGB': 'S', 'AGD': 'S', 'AGH': 'S', 'AGK': 'S', 'AGN': 'S', 'AGW': 'S', 'AGC': 'S', 'AGR': 'S', 'AGG': 'S', 'AGA': 'S', 'AGM': 'S', 'AGV': 'S', 'AGY': 'S', 'AGS': 'S', 'AGT': 'S', 'AAH': 'N', 'AAW': 'N', 'AAC': 'N', 'AAG': 'K', 'AAA': 'N', 'AAM': 'N', 'AAY': 'N', 'AAT': 'N', 'ATC': 'I', 'ATR': 'M', 'ATG': 'M', 'ATA': 'M', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '22': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TWG': 'L', 'TCB': 'S', 'TCK': 'S', 'TCC': 'S', 'TCG': 'S', 'TCA': '*', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'L', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TMA': '*', 'TVA': '*', 'TSA': '*', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TWG': 'L', 'TCB': 'S', 'TCK': 'S', 'TCC': 'S', 'TCG': 'S', 'TCA': '*', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAG': 'L', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TMA': '*', 'TVA': '*', 'TSA': '*', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '23': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTG': 'L', 'TDA': '*', 'TKA': '*', 'TWA': '*', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'L', 'TTA': '*', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATK': 'M', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATT': 'M', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTG': 'L', 'TDA': '*', 'TKA': '*', 'TWA': '*', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'L', 'TTA': '*', 'TTY': 'F', 'TTT': 'F'}}, '24': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'ARG': 'K', 'AGH': 'S', 'AGW': 'S', 'AGC': 'S', 'AGG': 'K', 'AGA': 'S', 'AGM': 'S', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'BTG': 'M', 'DTG': 'M', 'HTG': 'M', 'KTG': 'M', 'NTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'ARG': 'K', 'AGH': 'S', 'AGW': 'S', 'AGC': 'S', 'AGG': 'K', 'AGA': 'S', 'AGM': 'S', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MTG': 'M', 'VTG': 'M', 'YTG': 'M', 'YTA': 'L', 'STG': 'M', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '25': {'trans_table': {'KGA': 'G', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': 'G', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'DTG': 'M', 'KGA': 'G', 'KTG': 'M', 'WTG': 'M', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'RTG': 'M', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTH': 'V', 'GTW': 'V', 'GTC': 'V', 'GTG': 'M', 'GTA': 'V', 'GTM': 'V', 'GTY': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': 'G', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTG': 'M', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '26': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'A', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTH': 'L', 'CTW': 'L', 'CTC': 'L', 'CTG': 'M', 'CTA': 'L', 'CTM': 'L', 'CTY': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'MTG': 'M', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '27': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAR': 'Q', 'YAG': 'Q', 'YAA': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'Q', 'TAG': 'Q', 'TAA': 'Q', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAR': 'Q', 'YAG': 'Q', 'YAA': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'Q', 'TAG': 'Q', 'TAA': 'Q', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '28': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YAR': 'Q', 'YAG': 'Q', 'YAA': 'Q', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'Q', 'TAG': 'Q', 'TAA': 'Q', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TRA': '*', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '29': {'trans_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAB': 'Y', 'TAD': 'Y', 'TAH': 'Y', 'TAK': 'Y', 'TAN': 'Y', 'TAW': 'Y', 'TAC': 'Y', 'TAR': 'Y', 'TAG': 'Y', 'TAA': 'Y', 'TAM': 'Y', 'TAV': 'Y', 'TAY': 'Y', 'TAS': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAB': 'Y', 'TAD': 'Y', 'TAH': 'Y', 'TAK': 'Y', 'TAN': 'Y', 'TAW': 'Y', 'TAC': 'Y', 'TAR': 'Y', 'TAG': 'Y', 'TAA': 'Y', 'TAM': 'Y', 'TAV': 'Y', 'TAY': 'Y', 'TAS': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '30': {'trans_table': {'KAR': 'E', 'KAG': 'E', 'KAA': 'E', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'E', 'TAG': 'E', 'TAA': 'E', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'KAR': 'E', 'KAG': 'E', 'KAA': 'E', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGG': 'W', 'TGA': '*', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'E', 'TAG': 'E', 'TAA': 'E', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}, '31': {'trans_table': {'KAR': 'E', 'KAG': 'E', 'KAA': 'E', 'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': 'E', 'TAG': 'E', 'TAA': 'E', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}, 'start_table': {'CCB': 'P', 'CCD': 'P', 'CCH': 'P', 'CCK': 'P', 'CCN': 'P', 'CCW': 'P', 'CCC': 'P', 'CCR': 'P', 'CCG': 'P', 'CCA': 'P', 'CCM': 'P', 'CCV': 'P', 'CCY': 'P', 'CCS': 'P', 'CCT': 'P', 'CGB': 'R', 'CGD': 'R', 'CGH': 'R', 'CGK': 'R', 'CGN': 'R', 'CGW': 'R', 'CGC': 'R', 'CGR': 'R', 'CGG': 'R', 'CGA': 'R', 'CGM': 'R', 'CGV': 'R', 'CGY': 'R', 'CGS': 'R', 'CGT': 'R', 'CAC': 'H', 'CAR': 'Q', 'CAG': 'Q', 'CAA': 'Q', 'CAY': 'H', 'CAT': 'H', 'CTB': 'L', 'CTD': 'L', 'CTH': 'L', 'CTK': 'L', 'CTN': 'L', 'CTW': 'L', 'CTC': 'L', 'CTR': 'L', 'CTG': 'L', 'CTA': 'L', 'CTM': 'L', 'CTV': 'L', 'CTY': 'L', 'CTS': 'L', 'CTT': 'L', 'GCB': 'A', 'GCD': 'A', 'GCH': 'A', 'GCK': 'A', 'GCN': 'A', 'GCW': 'A', 'GCC': 'A', 'GCR': 'A', 'GCG': 'A', 'GCA': 'A', 'GCM': 'A', 'GCV': 'A', 'GCY': 'A', 'GCS': 'A', 'GCT': 'A', 'GGB': 'G', 'GGD': 'G', 'GGH': 'G', 'GGK': 'G', 'GGN': 'G', 'GGW': 'G', 'GGC': 'G', 'GGR': 'G', 'GGG': 'G', 'GGA': 'G', 'GGM': 'G', 'GGV': 'G', 'GGY': 'G', 'GGS': 'G', 'GGT': 'G', 'GAC': 'D', 'GAR': 'E', 'GAG': 'E', 'GAA': 'E', 'GAY': 'D', 'GAT': 'D', 'GTB': 'V', 'GTD': 'V', 'GTH': 'V', 'GTK': 'V', 'GTN': 'V', 'GTW': 'V', 'GTC': 'V', 'GTR': 'V', 'GTG': 'V', 'GTA': 'V', 'GTM': 'V', 'GTV': 'V', 'GTY': 'V', 'GTS': 'V', 'GTT': 'V', 'ACB': 'T', 'ACD': 'T', 'ACH': 'T', 'ACK': 'T', 'ACN': 'T', 'ACW': 'T', 'ACC': 'T', 'ACR': 'T', 'ACG': 'T', 'ACA': 'T', 'ACM': 'T', 'ACV': 'T', 'ACY': 'T', 'ACS': 'T', 'ACT': 'T', 'AGC': 'S', 'AGR': 'R', 'AGG': 'R', 'AGA': 'R', 'AGY': 'S', 'AGT': 'S', 'AAC': 'N', 'AAR': 'K', 'AAG': 'K', 'AAA': 'K', 'AAY': 'N', 'AAT': 'N', 'ATH': 'I', 'ATW': 'I', 'ATC': 'I', 'ATG': 'M', 'ATA': 'I', 'ATM': 'I', 'ATY': 'I', 'ATT': 'I', 'MGR': 'R', 'MGG': 'R', 'MGA': 'R', 'YTR': 'L', 'YTG': 'L', 'YTA': 'L', 'TCB': 'S', 'TCD': 'S', 'TCH': 'S', 'TCK': 'S', 'TCN': 'S', 'TCW': 'S', 'TCC': 'S', 'TCR': 'S', 'TCG': 'S', 'TCA': 'S', 'TCM': 'S', 'TCV': 'S', 'TCY': 'S', 'TCS': 'S', 'TCT': 'S', 'TGC': 'C', 'TGR': 'W', 'TGG': 'W', 'TGA': 'W', 'TGY': 'C', 'TGT': 'C', 'TAC': 'Y', 'TAR': '*', 'TAG': '*', 'TAA': '*', 'TAY': 'Y', 'TAT': 'Y', 'TTC': 'F', 'TTR': 'L', 'TTG': 'L', 'TTA': 'L', 'TTY': 'F', 'TTT': 'F'}}}[code] - +def genetic_codes(code): + return { + "1": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "HTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "MTG": "M", + "YTG": "M", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "2": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "*", + "AGG": "*", + "AGA": "*", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "*", + "AGG": "*", + "AGA": "*", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATB": "M", + "ATD": "M", + "ATH": "M", + "ATK": "M", + "ATN": "M", + "ATW": "M", + "ATC": "M", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATM": "M", + "ATV": "M", + "ATY": "M", + "ATS": "M", + "ATT": "M", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "3": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "T", + "CTD": "T", + "CTH": "T", + "CTK": "T", + "CTN": "T", + "CTW": "T", + "CTC": "T", + "CTR": "T", + "CTG": "T", + "CTA": "T", + "CTM": "T", + "CTV": "T", + "CTY": "T", + "CTS": "T", + "CTT": "T", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "T", + "CTD": "T", + "CTH": "T", + "CTK": "T", + "CTN": "T", + "CTW": "T", + "CTC": "T", + "CTR": "T", + "CTG": "T", + "CTA": "T", + "CTM": "T", + "CTV": "T", + "CTY": "T", + "CTS": "T", + "CTT": "T", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "4": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "BTG": "M", + "DTG": "M", + "HTG": "M", + "KTG": "M", + "NTG": "M", + "WTR": "M", + "WTG": "M", + "WTA": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATB": "M", + "ATD": "M", + "ATH": "M", + "ATK": "M", + "ATN": "M", + "ATW": "M", + "ATC": "M", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATM": "M", + "ATV": "M", + "ATY": "M", + "ATS": "M", + "ATT": "M", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "MTG": "M", + "VTG": "M", + "YTG": "M", + "STG": "M", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "M", + "TTG": "M", + "TTA": "M", + "TTY": "F", + "TTT": "F", + }, + }, + "5": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "DTG": "M", + "KTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATB": "M", + "ATD": "M", + "ATH": "M", + "ATK": "M", + "ATN": "M", + "ATW": "M", + "ATC": "M", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATM": "M", + "ATV": "M", + "ATY": "M", + "ATS": "M", + "ATT": "M", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "6": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAR": "Q", + "YAG": "Q", + "YAA": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "Q", + "TAG": "Q", + "TAA": "Q", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAR": "Q", + "YAG": "Q", + "YAA": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "Q", + "TAG": "Q", + "TAA": "Q", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "9": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "10": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGH": "C", + "TGW": "C", + "TGC": "C", + "TGG": "W", + "TGA": "C", + "TGM": "C", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGH": "C", + "TGW": "C", + "TGC": "C", + "TGG": "W", + "TGA": "C", + "TGM": "C", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "11": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "BTG": "M", + "DTG": "M", + "HTG": "M", + "KTG": "M", + "NTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATB": "M", + "ATD": "M", + "ATH": "M", + "ATK": "M", + "ATN": "M", + "ATW": "M", + "ATC": "M", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATM": "M", + "ATV": "M", + "ATY": "M", + "ATS": "M", + "ATT": "M", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "MTG": "M", + "VTG": "M", + "YTG": "M", + "YTA": "L", + "STG": "M", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "12": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "S", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "MTG": "M", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "13": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RGR": "G", + "RGG": "G", + "RGA": "G", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "G", + "AGG": "G", + "AGA": "G", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "DTG": "M", + "KTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RGR": "G", + "RGG": "G", + "RGA": "G", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "G", + "AGG": "G", + "AGA": "G", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "14": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAH": "Y", + "TAW": "Y", + "TAC": "Y", + "TAG": "*", + "TAA": "Y", + "TAM": "Y", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAH": "Y", + "TAW": "Y", + "TAC": "Y", + "TAG": "*", + "TAA": "Y", + "TAM": "Y", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "15": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAG": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "Q", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAG": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "Q", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "16": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TWG": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "L", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TWG": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "L", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "21": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGB": "S", + "AGD": "S", + "AGH": "S", + "AGK": "S", + "AGN": "S", + "AGW": "S", + "AGC": "S", + "AGR": "S", + "AGG": "S", + "AGA": "S", + "AGM": "S", + "AGV": "S", + "AGY": "S", + "AGS": "S", + "AGT": "S", + "AAH": "N", + "AAW": "N", + "AAC": "N", + "AAG": "K", + "AAA": "N", + "AAM": "N", + "AAY": "N", + "AAT": "N", + "ATC": "I", + "ATR": "M", + "ATG": "M", + "ATA": "M", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "22": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TWG": "L", + "TCB": "S", + "TCK": "S", + "TCC": "S", + "TCG": "S", + "TCA": "*", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "L", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TMA": "*", + "TVA": "*", + "TSA": "*", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TWG": "L", + "TCB": "S", + "TCK": "S", + "TCC": "S", + "TCG": "S", + "TCA": "*", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAG": "L", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TMA": "*", + "TVA": "*", + "TSA": "*", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "23": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTG": "L", + "TDA": "*", + "TKA": "*", + "TWA": "*", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "L", + "TTA": "*", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATK": "M", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATT": "M", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTG": "L", + "TDA": "*", + "TKA": "*", + "TWA": "*", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "L", + "TTA": "*", + "TTY": "F", + "TTT": "F", + }, + }, + "24": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "ARG": "K", + "AGH": "S", + "AGW": "S", + "AGC": "S", + "AGG": "K", + "AGA": "S", + "AGM": "S", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "BTG": "M", + "DTG": "M", + "HTG": "M", + "KTG": "M", + "NTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "ARG": "K", + "AGH": "S", + "AGW": "S", + "AGC": "S", + "AGG": "K", + "AGA": "S", + "AGM": "S", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MTG": "M", + "VTG": "M", + "YTG": "M", + "YTA": "L", + "STG": "M", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "25": { + "trans_table": { + "KGA": "G", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "G", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "DTG": "M", + "KGA": "G", + "KTG": "M", + "WTG": "M", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "RTG": "M", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTH": "V", + "GTW": "V", + "GTC": "V", + "GTG": "M", + "GTA": "V", + "GTM": "V", + "GTY": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "G", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTG": "M", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "26": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "A", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTH": "L", + "CTW": "L", + "CTC": "L", + "CTG": "M", + "CTA": "L", + "CTM": "L", + "CTY": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "MTG": "M", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "27": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAR": "Q", + "YAG": "Q", + "YAA": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "Q", + "TAG": "Q", + "TAA": "Q", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAR": "Q", + "YAG": "Q", + "YAA": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "Q", + "TAG": "Q", + "TAA": "Q", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "28": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YAR": "Q", + "YAG": "Q", + "YAA": "Q", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "Q", + "TAG": "Q", + "TAA": "Q", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TRA": "*", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "29": { + "trans_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAB": "Y", + "TAD": "Y", + "TAH": "Y", + "TAK": "Y", + "TAN": "Y", + "TAW": "Y", + "TAC": "Y", + "TAR": "Y", + "TAG": "Y", + "TAA": "Y", + "TAM": "Y", + "TAV": "Y", + "TAY": "Y", + "TAS": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAB": "Y", + "TAD": "Y", + "TAH": "Y", + "TAK": "Y", + "TAN": "Y", + "TAW": "Y", + "TAC": "Y", + "TAR": "Y", + "TAG": "Y", + "TAA": "Y", + "TAM": "Y", + "TAV": "Y", + "TAY": "Y", + "TAS": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "30": { + "trans_table": { + "KAR": "E", + "KAG": "E", + "KAA": "E", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "E", + "TAG": "E", + "TAA": "E", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "KAR": "E", + "KAG": "E", + "KAA": "E", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGG": "W", + "TGA": "*", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "E", + "TAG": "E", + "TAA": "E", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + "31": { + "trans_table": { + "KAR": "E", + "KAG": "E", + "KAA": "E", + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "E", + "TAG": "E", + "TAA": "E", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + "start_table": { + "CCB": "P", + "CCD": "P", + "CCH": "P", + "CCK": "P", + "CCN": "P", + "CCW": "P", + "CCC": "P", + "CCR": "P", + "CCG": "P", + "CCA": "P", + "CCM": "P", + "CCV": "P", + "CCY": "P", + "CCS": "P", + "CCT": "P", + "CGB": "R", + "CGD": "R", + "CGH": "R", + "CGK": "R", + "CGN": "R", + "CGW": "R", + "CGC": "R", + "CGR": "R", + "CGG": "R", + "CGA": "R", + "CGM": "R", + "CGV": "R", + "CGY": "R", + "CGS": "R", + "CGT": "R", + "CAC": "H", + "CAR": "Q", + "CAG": "Q", + "CAA": "Q", + "CAY": "H", + "CAT": "H", + "CTB": "L", + "CTD": "L", + "CTH": "L", + "CTK": "L", + "CTN": "L", + "CTW": "L", + "CTC": "L", + "CTR": "L", + "CTG": "L", + "CTA": "L", + "CTM": "L", + "CTV": "L", + "CTY": "L", + "CTS": "L", + "CTT": "L", + "GCB": "A", + "GCD": "A", + "GCH": "A", + "GCK": "A", + "GCN": "A", + "GCW": "A", + "GCC": "A", + "GCR": "A", + "GCG": "A", + "GCA": "A", + "GCM": "A", + "GCV": "A", + "GCY": "A", + "GCS": "A", + "GCT": "A", + "GGB": "G", + "GGD": "G", + "GGH": "G", + "GGK": "G", + "GGN": "G", + "GGW": "G", + "GGC": "G", + "GGR": "G", + "GGG": "G", + "GGA": "G", + "GGM": "G", + "GGV": "G", + "GGY": "G", + "GGS": "G", + "GGT": "G", + "GAC": "D", + "GAR": "E", + "GAG": "E", + "GAA": "E", + "GAY": "D", + "GAT": "D", + "GTB": "V", + "GTD": "V", + "GTH": "V", + "GTK": "V", + "GTN": "V", + "GTW": "V", + "GTC": "V", + "GTR": "V", + "GTG": "V", + "GTA": "V", + "GTM": "V", + "GTV": "V", + "GTY": "V", + "GTS": "V", + "GTT": "V", + "ACB": "T", + "ACD": "T", + "ACH": "T", + "ACK": "T", + "ACN": "T", + "ACW": "T", + "ACC": "T", + "ACR": "T", + "ACG": "T", + "ACA": "T", + "ACM": "T", + "ACV": "T", + "ACY": "T", + "ACS": "T", + "ACT": "T", + "AGC": "S", + "AGR": "R", + "AGG": "R", + "AGA": "R", + "AGY": "S", + "AGT": "S", + "AAC": "N", + "AAR": "K", + "AAG": "K", + "AAA": "K", + "AAY": "N", + "AAT": "N", + "ATH": "I", + "ATW": "I", + "ATC": "I", + "ATG": "M", + "ATA": "I", + "ATM": "I", + "ATY": "I", + "ATT": "I", + "MGR": "R", + "MGG": "R", + "MGA": "R", + "YTR": "L", + "YTG": "L", + "YTA": "L", + "TCB": "S", + "TCD": "S", + "TCH": "S", + "TCK": "S", + "TCN": "S", + "TCW": "S", + "TCC": "S", + "TCR": "S", + "TCG": "S", + "TCA": "S", + "TCM": "S", + "TCV": "S", + "TCY": "S", + "TCS": "S", + "TCT": "S", + "TGC": "C", + "TGR": "W", + "TGG": "W", + "TGA": "W", + "TGY": "C", + "TGT": "C", + "TAC": "Y", + "TAR": "*", + "TAG": "*", + "TAA": "*", + "TAY": "Y", + "TAT": "Y", + "TTC": "F", + "TTR": "L", + "TTG": "L", + "TTA": "L", + "TTY": "F", + "TTT": "F", + }, + }, + }[code] diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 83bf971c..331db45c 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -42,7 +42,7 @@ def __init__(self, identifier: str): :param identifier: Identifier of the feature """ assert isinstance(identifier, str), "Expected identifier should be a string" - if identifier == '': + if identifier == "": raise ValueError("Identifier should not be empty") super().__init__() self.ID = identifier @@ -78,7 +78,8 @@ def __len__(self) -> int: return sum([(stop - start + 1) for start, stop in self.coordinates]) except TypeError: raise ValueError( - f"Coordinates of gene {self} have not been defined. Getting its length is then impossible.") + f"Coordinates of gene {self} have not been defined. Getting its length is then impossible." + ) @property def has_joined_coordinates(self) -> bool: @@ -121,7 +122,7 @@ def organism(self, organism: Organism): :param organism: Organism belonging to the feature """ if not isinstance(organism, Organism): - raise TypeError(f'Expected type Organism, got {type(organism)}') + raise TypeError(f"Expected type Organism, got {type(organism)}") self._organism = organism @property @@ -139,11 +140,20 @@ def contig(self, contig: Contig): :param contig: Contig linked to the feature """ if not isinstance(contig, Contig): - raise TypeError(f'Expected type Contig, got {type(contig)}') + raise TypeError(f"Expected type Contig, got {type(contig)}") self._contig = contig - def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", - product: str = "", local_identifier: str = "", coordinates: List[Tuple[int, int]] = None): + def fill_annotations( + self, + start: int, + stop: int, + strand: str, + gene_type: str = "", + name: str = "", + product: str = "", + local_identifier: str = "", + coordinates: List[Tuple[int, int]] = None, + ): """ Fill general annotation for child classes @@ -163,37 +173,59 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = coordinates = [(start, stop)] if not isinstance(start, int): - raise TypeError(f"Start should be int. Got {type(start)} instead in {self} from {self.organism}.") + raise TypeError( + f"Start should be int. Got {type(start)} instead in {self} from {self.organism}." + ) if not isinstance(stop, int): - raise TypeError(f"Stop should be int. Got {type(stop)} instead in {self} from {self.organism}.") + raise TypeError( + f"Stop should be int. Got {type(stop)} instead in {self} from {self.organism}." + ) if not isinstance(strand, str): - raise TypeError(f"Strand should be str. Got {type(strand)} instead in {self} from {self.organism}.") + raise TypeError( + f"Strand should be str. Got {type(strand)} instead in {self} from {self.organism}." + ) if not isinstance(gene_type, str): - raise TypeError(f"Gene type should be str. Got {type(gene_type)} instead in {self} from {self.organism}.") + raise TypeError( + f"Gene type should be str. Got {type(gene_type)} instead in {self} from {self.organism}." + ) if not isinstance(name, str): - raise TypeError(f"Name should be str. Got {type(name)} instead in {self} from {self.organism}.") + raise TypeError( + f"Name should be str. Got {type(name)} instead in {self} from {self.organism}." + ) if not isinstance(product, str): - raise TypeError(f"Product should be str. Got {type(product)} instead in {self} from {self.organism}.") + raise TypeError( + f"Product should be str. Got {type(product)} instead in {self} from {self.organism}." + ) if not isinstance(local_identifier, str): raise TypeError( - f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}.") + f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}." + ) if strand not in ["+", "-"]: - raise ValueError(f"Strand should be '+' or '-'. Got {strand} instead in {self} from {self.organism}.") + raise ValueError( + f"Strand should be '+' or '-'. Got {strand} instead in {self} from {self.organism}." + ) if not isinstance(coordinates, list): raise TypeError( - f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}.") + f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}." + ) for start_i, stop_i in coordinates: if not isinstance(start_i, int): - raise TypeError(f"Start should be int. Got {type(start_i)} instead in {self} from {self.organism}.") + raise TypeError( + f"Start should be int. Got {type(start_i)} instead in {self} from {self.organism}." + ) if not isinstance(stop_i, int): - raise TypeError(f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}.") + raise TypeError( + f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}." + ) if stop_i < start_i: raise ValueError( - f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.") + f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}." + ) if start_i < 1 or stop_i < 1: raise ValueError( - f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}.") + f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}." + ) self.start = start self.stop = stop @@ -205,7 +237,7 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = self.coordinates = coordinates def fill_parents(self, organism: Organism = None, contig: Contig = None): - """ Associate object to an organism and a contig + """Associate object to an organism and a contig :param organism: Parent organism :param contig: Parent contig @@ -228,8 +260,9 @@ def add_sequence(self, sequence): :raise AssertionError: Sequence must be a string """ - assert isinstance(sequence, - str), f"'str' type was expected for dna sequence but you provided a '{type(sequence)}' type object" + assert isinstance( + sequence, str + ), f"'str' type was expected for dna sequence but you provided a '{type(sequence)}' type object" self.dna = sequence @@ -237,19 +270,17 @@ def string_coordinates(self) -> str: """ Return a string representation of the coordinates """ - return ','.join(f'{start}..{stop}' for start, stop in self.coordinates) + return ",".join(f"{start}..{stop}" for start, stop in self.coordinates) def start_relative_to(self, gene): - """ - """ + """ """ if gene.start <= self.start: return self.start if gene.start > self.start: return self.start + self.contig.length def stop_relative_to(self, gene): - """ - """ + """ """ if gene.start <= self.stop: return self.stop @@ -278,7 +309,7 @@ class Gene(Feature): Fields: - position: the position of the gene in the genome. - family: the family that the gene belongs to. - - RGP: A putative Region of Plasticity that contains the gene. + - RGP: A putative Region of Plasticity that contains the gene. - genetic_code: the genetic code associated with the gene. - Protein: the protein sequence corresponding to the translated gene. """ @@ -313,8 +344,9 @@ def family(self, family): :param family: Gene family linked to the gene """ from ppanggolin.geneFamily import GeneFamily + if not isinstance(family, GeneFamily): - raise TypeError(f'Expected type GeneFamily, got {type(family)}') + raise TypeError(f"Expected type GeneFamily, got {type(family)}") self._family = family @property @@ -333,8 +365,9 @@ def RGP(self, region): :param region: Region linked to the gene """ from ppanggolin.region import Region + if not isinstance(region, Region): - raise TypeError(f'Expected type Organism, got {type(region)}') + raise TypeError(f"Expected type Organism, got {type(region)}") self._RGP = region @property @@ -359,14 +392,20 @@ def module(self): """ return self.family.module - def fill_annotations(self, position: int = None, genetic_code: int = 11, is_partial: bool = False, frame: int = 0, - **kwargs): + def fill_annotations( + self, + position: int = None, + genetic_code: int = 11, + is_partial: bool = False, + frame: int = 0, + **kwargs, + ): """Fill Gene annotation provide by PPanGGOLiN dependencies :param position: Gene localization in genome :param genetic_code: Genetic code associated to gene :param is_partial: is the gene a partial gene - :param frame: One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, + :param frame: One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on.. :param kwargs: look at Feature.fill_annotations methods @@ -394,7 +433,9 @@ def add_protein(self, protein: str): :raise TypeError: Protein sequence must be a string """ if not isinstance(protein, str): - raise TypeError(f"'str' type was expected but you provided a '{type(protein)}' type object") + raise TypeError( + f"'str' type was expected but you provided a '{type(protein)}' type object" + ) self.protein = protein @property @@ -403,7 +444,9 @@ def frame(self) -> int: Get the frame of the gene """ - assert self._frame is not None, "frame is already set and should not be set another time." + assert ( + self._frame is not None + ), "frame is already set and should not be set another time." return self._frame @@ -413,7 +456,9 @@ def frame(self, frame: int): :param contig_len: length of the contig """ - assert self._frame is None, "frame is already set and should not be set another time." + assert ( + self._frame is None + ), "frame is already set and should not be set another time." if frame not in [0, 1, 2]: raise ValueError("Frame should be equal to 0, 1 or 2.") @@ -434,8 +479,8 @@ class Contig(MetaFeatures): - is_circular: Boolean value indicating whether the contig is circular or not. - RNAs: Set of RNA annotations present in the contig. - TODO: Getter gene should be based on gene ID, and 2 other attributes should exist to get them by start or position. - Also, when set a new gene in contig, start, stop and strand should be check to check difference, maybe define __eq__ method in gene class. + TODO: Getter gene should be based on gene ID, and 2 other attributes should exist to get them by start or position. + Also, when set a new gene in contig, start, stop and strand should be check to check difference, maybe define __eq__ method in gene class. """ def __init__(self, identifier: int, name: str, is_circular: bool = False): @@ -448,7 +493,9 @@ def __init__(self, identifier: int, name: str, is_circular: bool = False): self.ID = identifier self.name = name self.is_circular = is_circular - self._rna_getter = set() # Saving the rna annotations. We're not using them in the vast majority of cases. + self._rna_getter = ( + set() + ) # Saving the rna annotations. We're not using them in the vast majority of cases. self._genes_getter = {} self._genes_position = [] self._organism = None @@ -462,10 +509,10 @@ def __str__(self) -> str: return self.name def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene): - """ + """ Set gene to Contig - Check if a gene with the same coordinate exists already in the contig. + Check if a gene with the same coordinate exists already in the contig. :param coordinate: Tuple containing start, stop and strand of the gene :param gene: Gene object to add @@ -476,21 +523,28 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene): """ if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + raise TypeError( + f"'Gene' type was expected but you provided a '{type(gene)}' type object" + ) if coordinate in self._genes_getter: raise ValueError( f"Gene '{self._genes_getter[coordinate].ID}' with coordinate {coordinate} already exists in the " f"contig '{self.name}' {f'from genome {self.organism}' if self.organism else ''}, " - f"cannot add gene '{gene.ID}' {f'from genome {gene.organism}' if gene.organism else ''}") + f"cannot add gene '{gene.ID}' {f'from genome {gene.organism}' if gene.organism else ''}" + ) if gene.position is None: - raise AttributeError("The gene object needs to have its position in the contig filled before adding it") + raise AttributeError( + "The gene object needs to have its position in the contig filled before adding it" + ) # Adding empty values. # They should be filled by the end of the parsing. # Doing this because genes are not always met in order. - self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1)) + self._genes_position.extend( + [None] * (gene.position - len(self._genes_position) + 1) + ) self._genes_position[gene.position] = gene self._genes_getter[coordinate] = gene @@ -498,11 +552,7 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene): @property def length(self) -> Union[int, None]: - """Get the length of the contig - - """ - if self._length is None: - logging.getLogger("PPanGGOLiN").warning("Contig length is unknown") + """Get the length of the contig""" return self._length @length.setter @@ -519,8 +569,12 @@ def length(self, contig_len: int): if self._length is None: self._length = contig_len elif self.length != contig_len: - logging.getLogger("PPanGGOLiN").debug(f"Known contig length = {self.length}, new length = {contig_len}") - raise ValueError('Attempting to define a contig length different from the previously defined value.') + logging.getLogger("PPanGGOLiN").debug( + f"Known contig length = {self.length}, new length = {contig_len}" + ) + raise ValueError( + "Attempting to define a contig length different from the previously defined value." + ) def __len__(self) -> int: """Get the length of the contig @@ -572,15 +626,18 @@ def add(self, gene: Gene): :raises TypeError: Region is not an instance Region """ if not isinstance(gene, Gene): - raise TypeError(f"Unexpected class / type for {type(gene)} when adding it to a contig") + raise TypeError( + f"Unexpected class / type for {type(gene)} when adding it to a contig" + ) - for attr in ['start', 'stop', 'position', 'strand']: + for attr in ["start", "stop", "position", "strand"]: if getattr(gene, attr) is None: - raise AttributeError(f'Gene {gene.name} is not fill with {attr}') + raise AttributeError(f"Gene {gene.name} is not fill with {attr}") - if gene.strand not in ['+', '-']: + if gene.strand not in ["+", "-"]: raise AttributeError( - f"Strand of Gene {gene.name} does not have the expected format. Expect '-' or '+' got {gene.strand}") + f"Strand of Gene {gene.name} does not have the expected format. Expect '-' or '+' got {gene.strand}" + ) self[(gene.start, gene.stop, gene.strand)] = gene @@ -595,11 +652,15 @@ def get_by_coordinate(self, coordinate: Tuple[int, int, str]) -> Gene: :raises TypeError: Position is not an integer """ if not isinstance(coordinate, Tuple): - raise TypeError(f"Coordinate to get gene must be a tuple. The provided type was {type(coordinate)}") + raise TypeError( + f"Coordinate to get gene must be a tuple. The provided type was {type(coordinate)}" + ) gene = self[coordinate] if gene is None: - logging.getLogger("PPanGGOLiN").debug("Given position result with a None Gene") + logging.getLogger("PPanGGOLiN").debug( + "Given position result with a None Gene" + ) return gene def remove(self, position): @@ -610,10 +671,14 @@ def remove(self, position): :raises TypeError: Position is not an integer """ if not isinstance(position, int): - raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + raise TypeError( + f"Position to get gene must be an integer. The provided type was {type(position)}" + ) del self[position] - def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) -> List[Gene]: + def get_genes( + self, begin: int = 0, end: int = None, outrange_ok: bool = False + ) -> List[Gene]: """ Gets a list of genes within a range of gene position. If no arguments are given it return all genes. @@ -632,11 +697,15 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) end = self._genes_position[-1].position if not isinstance(begin, int) or not isinstance(end, int): - raise TypeError(f"Expected type int for 'begin' and 'end', " - f"but received types '{type(begin)}' and '{type(end)}'.") + raise TypeError( + f"Expected type int for 'begin' and 'end', " + f"but received types '{type(begin)}' and '{type(end)}'." + ) if begin > end: - raise ValueError("The 'begin' position must be less than the 'end' position.") + raise ValueError( + "The 'begin' position must be less than the 'end' position." + ) if end > self._genes_position[-1].position: if outrange_ok: @@ -650,7 +719,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) if begin == end: return self._genes_position[begin] else: - return self._genes_position[begin: end] + return self._genes_position[begin:end] @property def number_of_genes(self) -> int: @@ -662,7 +731,7 @@ def number_of_genes(self) -> int: @property def genes(self) -> Generator[Gene, None, None]: - """ Give the gene content of the contig + """Give the gene content of the contig :return: Generator of genes in contig """ @@ -687,11 +756,11 @@ def organism(self, organism: Organism): :raises TypeError: Given organism is not an instance Organism """ if not isinstance(organism, Organism): - raise TypeError(f'Expected type Organism, got {type(organism)}') + raise TypeError(f"Expected type Organism, got {type(organism)}") self._organism = organism def add_rna(self, rna: RNA): - """ Add RNA to contig + """Add RNA to contig :param rna: RNA object to add @@ -699,9 +768,13 @@ def add_rna(self, rna: RNA): :raises KeyError: Another RNA with the same ID already exists in the contig """ if not isinstance(rna, RNA): - raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") + raise TypeError( + f"'RNA' type was expected but you provided a '{type(rna)}' type object" + ) if rna in self._rna_getter: - raise KeyError(f"RNA with the id: {rna.ID} already exist in contig {self.name}") + raise KeyError( + f"RNA with the id: {rna.ID} already exist in contig {self.name}" + ) self._rna_getter.add(rna) @property @@ -714,8 +787,7 @@ def RNAs(self) -> Generator[RNA, None, None]: @property def number_of_rnas(self) -> int: - """Get the number of RNA in the contig - """ + """Get the number of RNA in the contig""" return len(self._rna_getter) def add_contig_length(self, contig_length: int): @@ -729,7 +801,9 @@ def add_contig_length(self, contig_length: int): self.length = contig_length elif self.length != contig_length: - raise ValueError('Attempting to define a contig length different from the previously defined value.') + raise ValueError( + "Attempting to define a contig length different from the previously defined value." + ) @property def regions(self): @@ -767,9 +841,11 @@ def families(self): families = set() for gene in self.genes: if gene.family is None: - raise ValueError("Gene has no family, that should not happen. " - "Check if you're families has been computed or loaded." - "If it's the case, you can report an issue on our GitHub.") + raise ValueError( + "Gene has no family, that should not happen. " + "Check if you're families has been computed or loaded." + "If it's the case, you can report an issue on our GitHub." + ) families.add(gene.family) yield from families @@ -789,18 +865,21 @@ def modules(self): def get_ordered_consecutive_genes(self, genes: Iterable[Gene]) -> List[List[Gene]]: """ Order the given genes considering the circularity of the contig. - + :param genes: An iterable containing genes supposed to be consecutive along the contig. :return: A list of lists containing ordered consecutive genes considering circularity. """ gene_positions = [gene.position for gene in genes] # Determine consecutive region positions - consecutive_region_positions = get_consecutive_region_positions(region_positions=gene_positions, - contig_gene_count=self.number_of_genes) + consecutive_region_positions = get_consecutive_region_positions( + region_positions=gene_positions, contig_gene_count=self.number_of_genes + ) - consecutive_genes_lists = [[self[position] for position in consecutive_positions] for consecutive_positions in - consecutive_region_positions] + consecutive_genes_lists = [ + [self[position] for position in consecutive_positions] + for consecutive_positions in consecutive_region_positions + ] return consecutive_genes_lists @@ -846,12 +925,11 @@ def __str__(self) -> str: return self.name def _set_families(self): - """Set the set of gene families belonging to organism - """ + """Set the set of gene families belonging to organism""" self._families = {gene.family for gene in self.genes} def __setitem__(self, name: str, contig: Contig): - """ Set contig to the organism + """Set contig to the organism :param name: Name of the contig :param contig: Contig object to add in the organism @@ -862,9 +940,13 @@ def __setitem__(self, name: str, contig: Contig): """ if not isinstance(name, str): - raise TypeError(f"Contig name should be a string. You provided a '{type(name)}' type object") + raise TypeError( + f"Contig name should be a string. You provided a '{type(name)}' type object" + ) if not isinstance(contig, Contig): - raise TypeError(f"'Contig' type was expected but you provided a '{type(contig)}' type object") + raise TypeError( + f"'Contig' type was expected but you provided a '{type(contig)}' type object" + ) if name in self._contigs_getter: # Add test if contig are equivalent when __eq__ method will be defined in Contig raise KeyError(f"Contig {contig.name} already in genome {self.name}") @@ -906,7 +988,7 @@ def __delitem__(self, name): raise KeyError("Position of the gene in the contig does not exist") def __len__(self): - """ Get number of contigs in organism + """Get number of contigs in organism :return: Number of contigs in organism """ @@ -951,14 +1033,14 @@ def rna_genes(self) -> Generator[RNA, None, None]: yield from contig.RNAs def number_of_genes(self) -> int: - """ Get number of genes in the organism + """Get number of genes in the organism :return: Number of genes """ return sum(contig.number_of_genes for contig in self.contigs) def number_of_rnas(self) -> int: - """ Get number of genes in the organism + """Get number of genes in the organism :return: Number of genes """ @@ -966,7 +1048,7 @@ def number_of_rnas(self) -> int: @property def contigs(self) -> Generator[Contig, None, None]: - """ Generator of contigs in the organism + """Generator of contigs in the organism :return: Values in contig dictionary from organism """ @@ -974,7 +1056,7 @@ def contigs(self) -> Generator[Contig, None, None]: @property def number_of_contigs(self) -> int: - """ Get number of contigs in organism + """Get number of contigs in organism :return: Number of contigs in organism """ @@ -987,7 +1069,9 @@ def add(self, contig: Contig): :raises KeyError: Contig with the given name already exist in the organism """ - assert isinstance(contig, Contig), f"Contig object is expected, given type was {type(contig)}" + assert isinstance( + contig, Contig + ), f"Contig object is expected, given type was {type(contig)}" try: _ = self.get(contig.name) except KeyError: @@ -1077,7 +1161,7 @@ def number_of_spots(self) -> int: """ return len(list(self.spots)) - def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): + def mk_bitarray(self, index: Dict[Organism, int], partition: str = "all"): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. @@ -1087,26 +1171,28 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): :raises Exception: Partition is not recognized """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member - if partition == 'all': + if partition == "all": logging.getLogger("PPanGGOLiN").debug("all") for fam in self.families: self.bitarray[index[fam]] = 1 - elif partition in ['shell', 'cloud']: + elif partition in ["shell", "cloud"]: logging.getLogger("PPanGGOLiN").debug("shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 - elif partition == 'accessory': + elif partition == "accessory": logging.getLogger("PPanGGOLiN").debug("accessory") for fam in self.families: - if fam.named_partition in ['shell', 'cloud']: + if fam.named_partition in ["shell", "cloud"]: self.bitarray[index[fam]] = 1 else: - raise ValueError("There is not any partition corresponding please report a github issue") + raise ValueError( + "There is not any partition corresponding please report a github issue" + ) def group_genes_by_partition(self) -> Dict[str, Set]: """ - Groups genes based on their family's named partition and returns a dictionary + Groups genes based on their family's named partition and returns a dictionary mapping partition names to sets of genes belonging to each partition. :return: A dictionary containing sets of genes grouped by their family's named partition. diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 11d25ec6..ae877013 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -21,9 +21,11 @@ def check_pangenome_former_graph(pangenome: Pangenome, force: bool = False): :param force: Allow to force write on Pangenome file """ if pangenome.status["neighborsGraph"] == "inFile" and not force: - raise AttributeError("You are trying to make a neighbors graph that is already built. " - "If you REALLY want to do that, use --force " - "(it will erase everything except annotation data !)") + raise AttributeError( + "You are trying to make a neighbors graph that is already built. " + "If you REALLY want to do that, use --force " + "(it will erase everything except annotation data !)" + ) elif pangenome.status["neighborsGraph"] == "inFile" and force: erase_pangenome(pangenome, graph=True) @@ -38,21 +40,35 @@ def check_pangenome_for_neighbors_graph(pangenome, force, disable_bar=False): """ check_pangenome_former_graph(pangenome, force) # TODO Check if possible to change for check_pangenome_info - if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] and \ - pangenome.status["genesClustered"] in ["Computed", "Loaded"]: + if pangenome.status["genomesAnnotated"] in [ + "Computed", + "Loaded", + ] and pangenome.status["genesClustered"] in ["Computed", "Loaded"]: pass # nothing to do, can just continue. - elif pangenome.status["genomesAnnotated"] == "inFile" and pangenome.status["genesClustered"] == "inFile": - read_pangenome(pangenome, annotation=True, gene_families=True, disable_bar=disable_bar) - elif pangenome.status["genesClustered"] == "No" and \ - pangenome.status["genomesAnnotated"] in ['inFile', 'Computed', 'Loaded']: - raise Exception("You did not cluster the genes. See the 'ppanggolin cluster' if you want to do that.") + elif ( + pangenome.status["genomesAnnotated"] == "inFile" + and pangenome.status["genesClustered"] == "inFile" + ): + read_pangenome( + pangenome, annotation=True, gene_families=True, disable_bar=disable_bar + ) + elif pangenome.status["genesClustered"] == "No" and pangenome.status[ + "genomesAnnotated" + ] in ["inFile", "Computed", "Loaded"]: + raise Exception( + "You did not cluster the genes. See the 'ppanggolin cluster' if you want to do that." + ) else: # You probably can use readPangenome anyway. - msg = "Dev : You are probably writing a new workflow with a combination that I did not test." \ - " You can probably use readPangenome instead of raising this Error. " \ - "However please test it carefully.\n" - msg += " User : I have no idea how you got there. You probably did something unexpected. " \ - "Post an issue with what you did at https://github.com/labgem/PPanGGOLiN\n" + msg = ( + "Dev : You are probably writing a new workflow with a combination that I did not test." + " You can probably use readPangenome instead of raising this Error. " + "However please test it carefully.\n" + ) + msg += ( + " User : I have no idea how you got there. You probably did something unexpected. " + "Post an issue with what you did at https://github.com/labgem/PPanGGOLiN\n" + ) raise NotImplementedError(msg) @@ -68,8 +84,12 @@ def remove_high_copy_number(pangenome, number): fam.removed = True -def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, - force: bool = False, disable_bar: bool = False): +def compute_neighbors_graph( + pangenome: Pangenome, + remove_copy_number: int = 0, + force: bool = False, + disable_bar: bool = False, +): """ Creates the Pangenome Graph. Will either load the information from the pangenome file if they are not loaded, or use the information loaded if they are. @@ -85,7 +105,12 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, remove_high_copy_number(pangenome, remove_copy_number) logging.getLogger("PPanGGOLiN").info("Computing the neighbors graph...") - bar = tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar) + bar = tqdm( + pangenome.organisms, + total=pangenome.number_of_organisms, + unit="genome", + disable=disable_bar, + ) for org in bar: bar.set_description(f"Processing {org.name}") bar.refresh() @@ -94,12 +119,16 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, for gene in contig.genes: try: if not gene.family.removed: - if prev is not None and not (prev.family == gene.family and (prev.is_fragment or - gene.is_fragment)): + if prev is not None and not ( + prev.family == gene.family + and (prev.is_fragment or gene.is_fragment) + ): pangenome.add_edge(gene, prev) prev = gene except AttributeError: - raise AttributeError("a Gene does not have a GeneFamily object associated") + raise AttributeError( + "a Gene does not have a GeneFamily object associated" + ) except Exception: raise Exception("Unexpected error. Please report on our github.") if prev is not None and contig.is_circular and contig.number_of_genes > 0: @@ -121,8 +150,15 @@ def launch(args: argparse.Namespace): """ pangenome = Pangenome() pangenome.add_file(args.pangenome) - compute_neighbors_graph(pangenome, args.remove_high_copy_number, args.force, disable_bar=args.disable_prog_bar) - write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) + compute_neighbors_graph( + pangenome, + args.remove_high_copy_number, + args.force, + disable_bar=args.disable_prog_bar, + ) + write_pangenome( + pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -133,7 +169,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for graph command """ - parser = sub_parser.add_parser("graph", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "graph", formatter_class=argparse.RawTextHelpFormatter + ) parser_graph(parser) return parser @@ -144,23 +182,32 @@ def parser_graph(parser: argparse.ArgumentParser): :param parser: parser for graph argument """ - required = parser.add_argument_group(title="Required arguments", - description="Following arguments is required:") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", description="Following arguments is required:" + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-r', '--remove_high_copy_number', type=int, default=0, - help="Positive Number: Remove families having a number of copy of gene in a single genome " - "above or equal to this threshold in at least one genome " - "(0 or negative values are ignored).") - - -if __name__ == '__main__': + optional.add_argument( + "-r", + "--remove_high_copy_number", + type=int, + default=0, + help="Positive Number: Remove families having a number of copy of gene in a single genome " + "above or equal to this threshold in at least one genome " + "(0 or negative values are ignored).", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_graph(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index 969ceb99..26e34e71 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -13,7 +13,9 @@ def print_yaml(yaml_dict: dict) -> None: - yaml_output = yaml.dump(yaml_dict, default_flow_style=False, sort_keys=False, indent=4) + yaml_output = yaml.dump( + yaml_dict, default_flow_style=False, sort_keys=False, indent=4 + ) print(yaml_output) @@ -35,7 +37,7 @@ def read_status(h5f: tables.File): "RGP_Predicted": status_group._v_attrs.predictedRGP, "Spots_Predicted": status_group._v_attrs.predictedRGP, # Please confirm if this should be different from "RGP Predicted" - "Modules_Predicted": status_group._v_attrs.modules + "Modules_Predicted": status_group._v_attrs.modules, } status_to_print = {key: bool(val) for key, val in status_to_print.items()} @@ -56,13 +58,21 @@ def read_metadata_status(h5f: tables.File): if hasattr(status_group._v_attrs, "metadata") and status_group._v_attrs.metadata: metastatus = status_group.metastatus metasources = status_group.metasources - metadata_info = {attr: ', '.join(metasources._v_attrs[attr]) for attr in metastatus._v_attrs._f_list()} + metadata_info = { + attr: ", ".join(metasources._v_attrs[attr]) + for attr in metastatus._v_attrs._f_list() + } return {"Metadata": metadata_info} -def print_info(pangenome: str, status: bool = False, content: bool = False, parameters: bool = False, - metadata: bool = False): +def print_info( + pangenome: str, + status: bool = False, + content: bool = False, + parameters: bool = False, + metadata: bool = False, +): """ Main function to return information about pangenome @@ -92,7 +102,9 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - print_info(args.pangenome, args.status, args.content, args.parameters, args.metadata) + print_info( + args.pangenome, args.status, args.content, args.parameters, args.metadata + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -103,7 +115,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for info command """ - parser = sub_parser.add_parser("info", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "info", formatter_class=argparse.RawTextHelpFormatter + ) parser_info(parser) return parser @@ -114,28 +128,58 @@ def parser_info(parser: argparse.ArgumentParser): :param parser: Parser for the 'info' argument. """ - required = parser.add_argument_group(title="Required arguments", - description="Specify the following required argument:") - required.add_argument('-p', '--pangenome', required=True, type=Path, - help="Path to the pangenome .h5 file") - - options = parser.add_argument_group(title="Information Display Options (default: all)") - options.add_argument("-a", "--parameters", required=False, action="store_true", - help="Display the parameters used or computed for each step of pangenome generation") - options.add_argument("-c", "--content", required=False, action="store_true", - help="Display detailed information about the pangenome's content") - options.add_argument("-s", "--status", required=False, action="store_true", - help="Display information about the statuses of different elements in the pangenome, " - "indicating what has been computed or not") - options.add_argument("-m", "--metadata", required=False, action="store_true", - help="Display a summary of the metadata saved in the pangenome") - - -if __name__ == '__main__': + required = parser.add_argument_group( + title="Required arguments", + description="Specify the following required argument:", + ) + required.add_argument( + "-p", + "--pangenome", + required=True, + type=Path, + help="Path to the pangenome .h5 file", + ) + + options = parser.add_argument_group( + title="Information Display Options (default: all)" + ) + options.add_argument( + "-a", + "--parameters", + required=False, + action="store_true", + help="Display the parameters used or computed for each step of pangenome generation", + ) + options.add_argument( + "-c", + "--content", + required=False, + action="store_true", + help="Display detailed information about the pangenome's content", + ) + options.add_argument( + "-s", + "--status", + required=False, + action="store_true", + help="Display information about the statuses of different elements in the pangenome, " + "indicating what has been computed or not", + ) + options.add_argument( + "-m", + "--metadata", + required=False, + action="store_true", + help="Display a summary of the metadata saved in the pangenome", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_info(main_parser) launch(main_parser.parse_args()) diff --git a/ppanggolin/main.py b/ppanggolin/main.py index c5283977..1ded5ec9 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -4,13 +4,20 @@ import sys if sys.version_info < (3, 8): # minimum is python3.8 - raise AssertionError("Minimum python version to run PPanGGOLiN is 3.8. Your current python version is " + - ".".join(map(str, sys.version_info))) + raise AssertionError( + "Minimum python version to run PPanGGOLiN is 3.8. Your current python version is " + + ".".join(map(str, sys.version_info)) + ) import argparse # local modules import ppanggolin.pangenome -from ppanggolin.utils import check_input_files, set_verbosity_level, add_common_arguments, manage_cli_and_config_args +from ppanggolin.utils import ( + check_input_files, + set_verbosity_level, + add_common_arguments, + manage_cli_and_config_args, +) import ppanggolin.nem.partition import ppanggolin.nem.rarefaction import ppanggolin.graph @@ -28,24 +35,36 @@ import ppanggolin.meta import ppanggolin.utility -from ppanggolin import SUBCOMMAND_TO_SUBPARSER, epilog, pan_epilog, rgp_epilog, mod_epilog, version +from ppanggolin import ( + SUBCOMMAND_TO_SUBPARSER, + epilog, + pan_epilog, + rgp_epilog, + mod_epilog, + version, +) + def cmd_line() -> argparse.Namespace: - """ Manage the command line argument given by user + """Manage the command line argument given by user :return: arguments given and readable by PPanGGOLiN """ # need to manually write the description so that it's displayed into groups of subcommands .... desc = "\n" - desc += "All of the following subcommands have their own set of options. To see them for a given subcommand," \ - " use it with -h or --help, as such:\n" + desc += ( + "All of the following subcommands have their own set of options. To see them for a given subcommand," + " use it with -h or --help, as such:\n" + ) desc += " ppanggolin -h\n" desc += "\n" desc += " Basic:\n" desc += " all Easy workflow to run all possible analysis\n" desc += " workflow Easy workflow to run a pangenome analysis in one go\n" - desc += " panrgp Easy workflow to run a pangenome analysis with genomic islands and spots of" \ - " insertion detection\n" + desc += ( + " panrgp Easy workflow to run a pangenome analysis with genomic islands and spots of" + " insertion detection\n" + ) desc += " panmodule Easy workflow to run a pangenome analysis with module prediction\n" desc += " \n" desc += " Expert:\n" @@ -62,7 +81,9 @@ def cmd_line() -> argparse.Namespace: desc += " write_genomes Writes 'flat' files that represent the genomes along with their associated pangenome elements.\n" desc += " write_metadata Writes 'TSV' files that represent the metadata associated with elements of the pangenome.\n" desc += " fasta Writes fasta files for different elements of the pangenome.\n" - desc += " info Prints information about a given pangenome graph file.\n" + desc += ( + " info Prints information about a given pangenome graph file.\n" + ) desc += " metrics Compute several metrics on a given pangenome.\n" desc += " \n" desc += " Regions of Genomic Plasticity:\n" @@ -73,8 +94,10 @@ def cmd_line() -> argparse.Namespace: desc += " \n" desc += " Analysis using reference pangenomes:\n" desc += " msa Compute Multiple Sequence Alignments for pangenome gene families.\n" - desc += " align Aligns a genome or a set of proteins to the pangenome gene families and " \ + desc += ( + " align Aligns a genome or a set of proteins to the pangenome gene families and " "predicts information from it.\n" + ) desc += " context Local genomic context analysis.\n" desc += " projection Annotates external genomes with an existing pangenome.\n" desc += " \n" @@ -84,12 +107,16 @@ def cmd_line() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", formatter_class=argparse.RawTextHelpFormatter, - epilog=epilog + pan_epilog + rgp_epilog + mod_epilog) + epilog=epilog + pan_epilog + rgp_epilog + mod_epilog, + ) - parser.add_argument('-v', '--version', action='version', - version='%(prog)s ' + version) + parser.add_argument( + "-v", "--version", action="version", version="%(prog)s " + version + ) - subparsers = parser.add_subparsers(metavar="", dest="subcommand", title="subcommands", description=desc) + subparsers = parser.add_subparsers( + metavar="", dest="subcommand", title="subcommands", description=desc + ) subparsers.required = True # because python3 sent subcommands to hell apparently # print help if no subcommand is specified @@ -106,7 +133,14 @@ def cmd_line() -> argparse.Namespace: sub.epilog = epilog if sub_cmd not in ["rgp", "spot", "module", "rgp_cluster"]: sub.epilog += pan_epilog - if sub_cmd not in ["annotate", "cluster", "graph", "partition", "rarefaction", "workflow"]: + if sub_cmd not in [ + "annotate", + "cluster", + "graph", + "partition", + "rarefaction", + "workflow", + ]: if sub_cmd not in ["module", "panmodule"]: sub.epilog += rgp_epilog if sub_cmd not in ["rgp", "spot", "rgp_cluster", "panrgp"]: @@ -126,32 +160,60 @@ def cmd_line() -> argparse.Namespace: # First parse args to check that nothing is missing or not expected in cli and throw help when requested args = parser.parse_args() - if hasattr(args, "config"): + if hasattr(args, "config"): # the two subcommand with no common args does not have config parameter. so we can skip this part for them. - args = manage_cli_and_config_args(args.subcommand, args.config, SUBCOMMAND_TO_SUBPARSER) + args = manage_cli_and_config_args( + args.subcommand, args.config, SUBCOMMAND_TO_SUBPARSER + ) else: set_verbosity_level(args) if args.subcommand == "annotate" and args.fasta is None and args.anno is None: - parser.error("Please provide either a sequence file using the --fasta option or " - "an annotation file using the --anno option to enable annotation. " - "Use the command line or the config file.") + parser.error( + "Please provide either a sequence file using the --fasta option or " + "an annotation file using the --anno option to enable annotation. " + "Use the command line or the config file." + ) - cmds_pangenome_required = ["cluster", "info", "module", "graph", "align", - "context", "write_pangenome", "write_genomes", "write_metadata", "msa", "draw", "partition", - "rarefaction", "spot", "fasta", "metrics", "rgp", "projection", "metadata"] + cmds_pangenome_required = [ + "cluster", + "info", + "module", + "graph", + "align", + "context", + "write_pangenome", + "write_genomes", + "write_metadata", + "msa", + "draw", + "partition", + "rarefaction", + "spot", + "fasta", + "metrics", + "rgp", + "projection", + "metadata", + ] if args.subcommand in cmds_pangenome_required and args.pangenome is None: - parser.error("Please specify a pangenome file using the --pangenome argument, " - "either through the command line or the config file.") + parser.error( + "Please specify a pangenome file using the --pangenome argument, " + "either through the command line or the config file." + ) if args.subcommand == "align" and args.sequences is None: - parser.error("Please provide sequences (nucleotides or amino acids) for alignment " - "with the pangenome gene families using the --sequences argument, " - "either through the command line or the config file.") + parser.error( + "Please provide sequences (nucleotides or amino acids) for alignment " + "with the pangenome gene families using the --sequences argument, " + "either through the command line or the config file." + ) if args.subcommand == "projection": # check argument correctness and determine input mode (single or multiple files) and add it to args. - input_mode = ppanggolin.projection.projection.check_projection_arguments(args, parser) + input_mode = ppanggolin.projection.projection.check_projection_arguments( + args, parser + ) setattr(args, "input_mode", input_mode) if args.subcommand == "metadata": diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 4f9aaf85..9d9e6a6b 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -17,9 +17,14 @@ from ppanggolin.formats import check_pangenome_info, write_pangenome, erase_pangenome -def check_pangenome_metadata(pangenome: Pangenome, source: str, metatype: str, force: bool = False, - disable_bar: bool = False): - """ Check and load pangenome information before adding metadata +def check_pangenome_metadata( + pangenome: Pangenome, + source: str, + metatype: str, + force: bool = False, + disable_bar: bool = False, +): + """Check and load pangenome information before adding metadata :param pangenome: Pangenome object :param source: source of the metadata @@ -27,31 +32,37 @@ def check_pangenome_metadata(pangenome: Pangenome, source: str, metatype: str, f :param force: erase if a metadata for the provide source and metatype already exist :param disable_bar: Disable bar """ - need_dic = {'need_annotations': True, - 'need_families': False, - 'need_rgp': False, - 'need_spots': False, - 'need_modules': False} + need_dic = { + "need_annotations": True, + "need_families": False, + "need_rgp": False, + "need_spots": False, + "need_modules": False, + } if metatype in ["families", "RGPs", "spots", "modules"]: - need_dic['need_families'] = True + need_dic["need_families"] = True if metatype in ["RGPs", "spots"]: - need_dic['need_rgp'] = True + need_dic["need_rgp"] = True if metatype == "spots": - need_dic['need_spots'] = True + need_dic["need_spots"] = True if metatype == "modules": - need_dic['need_modules'] = True + need_dic["need_modules"] = True - if pangenome.status["metadata"][metatype] == "inFile" and source in pangenome.status["metasources"][metatype]: + if ( + pangenome.status["metadata"][metatype] == "inFile" + and source in pangenome.status["metasources"][metatype] + ): if force: erase_pangenome(pangenome, metadata=True, source=source, metatype=metatype) else: raise Exception( f"An metadata corresponding to the source : '{source}' already exist in genomes of the pangenome." - "Add the option --force to erase") + "Add the option --force to erase" + ) check_pangenome_info(pangenome, disable_bar=disable_bar, **need_dic) @@ -63,38 +74,65 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ - assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] - colname_check = re.compile('^[a-zA-Z_]\w*$') # \w = [A-Za-z0-9_] - metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, - dtype={metatype: str}) - metadata_df.replace(to_replace='-', value=pd.NA, inplace=True) + assert metatype in [ + "families", + "genomes", + "contigs", + "genes", + "RGPs", + "spots", + "modules", + ] + colname_check = re.compile("^[a-zA-Z_]\w*$") # \w = [A-Za-z0-9_] + metadata_df = pd.read_csv( + metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, dtype={metatype: str} + ) + metadata_df.replace(to_replace="-", value=pd.NA, inplace=True) if metatype not in metadata_df.columns or metadata_df.shape[1] < 2: - raise KeyError(f"You should at least provide in columns names : {metatype} and one another value. " - "Look at documentation for more information") + raise KeyError( + f"You should at least provide in columns names : {metatype} and one another value. " + "Look at documentation for more information" + ) for column in metadata_df.columns: if not colname_check.match(column): - raise ValueError(f"column name is not a valid identifier: {column}; " - f"it does not match the pattern {colname_check.pattern}") + raise ValueError( + f"column name is not a valid identifier: {column}; " + f"it does not match the pattern {colname_check.pattern}" + ) return metadata_df -def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str, metatype: str, - omit: bool = False, disable_bar: bool = False): +def assign_metadata( + metadata_df: pd.DataFrame, + pangenome: Pangenome, + source: str, + metatype: str, + omit: bool = False, + disable_bar: bool = False, +): """function assigns metadata to elements in a pangenome based on a metadata dataframe. :param metadata_df: A pandas dataframe containing metadata to be assigned to elements in the pangenome. :param pangenome: A Pangenome object representing the pangenome to which metadata will be assigned. :param source: A string representing the source of the metadata. :param metatype: A string representing the type of element to which metadata will be assigned. - :param omit: A boolean indicating whether to raise an error if metadata cannot be assigned to an element. - If True, metadata will not be assigned to elements that do not exist in the pangenome. + :param omit: A boolean indicating whether to raise an error if metadata cannot be assigned to an element. + If True, metadata will not be assigned to elements that do not exist in the pangenome. If False, an error will be raised. Default is False. :param disable_bar: A boolean indicating whether to disable the progress bar. Default is False. :raise KeyError: element name is not find in pangenome :raise AssertionError: Metatype is not recognized """ - assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] + assert metatype in [ + "families", + "genomes", + "contigs", + "genes", + "RGPs", + "spots", + "modules", + ] def check_duplicate_contig_name(): contig_names = set() @@ -102,15 +140,21 @@ def check_duplicate_contig_name(): old_len = len(contig_names) contig_names.add(contig.name) if len(contig_names) == old_len: - raise Exception("There are 2 contigs with the same name in the pangenome and " - "you did not provide the genome linked to contig. " - "Add a column 'genomes' to indicate to which genome the contig belongs to.") + raise Exception( + "There are 2 contigs with the same name in the pangenome and " + "you did not provide the genome linked to contig. " + "Add a column 'genomes' to indicate to which genome the contig belongs to." + ) if metatype == "contigs" and "genomes" not in metadata_df.columns: check_duplicate_contig_name() - for row in tqdm(metadata_df.iterrows(), unit='row', - total=metadata_df.shape[0], disable=disable_bar): + for row in tqdm( + metadata_df.iterrows(), + unit="row", + total=metadata_df.shape[0], + disable=disable_bar, + ): row = row[1] try: if metatype == "families": @@ -130,11 +174,18 @@ def check_duplicate_contig_name(): element = pangenome.get_module(row[metatype]) except KeyError: if not omit: - raise KeyError(f"{metatype} {row[metatype]} does not exist in pangenome. Check name in your file") + raise KeyError( + f"{metatype} {row[metatype]} does not exist in pangenome. Check name in your file" + ) else: logging.getLogger().debug(f"{metatype}: {row[metatype]} doesn't exist") else: - element.add_metadata(Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype})) + element.add_metadata( + Metadata( + source=source, + **{k: v for k, v in row.to_dict().items() if k != metatype}, + ) + ) pangenome.status["metadata"][metatype] = "Computed" pangenome.status["metasources"][metatype].append(source) @@ -149,15 +200,30 @@ def launch(args: argparse.Namespace): metadata_df = check_metadata_format(args.metadata, args.assign) pangenome = Pangenome() pangenome.add_file(args.pangenome) - check_pangenome_metadata(pangenome, source=args.source, metatype=args.assign, - force=args.force, disable_bar=args.disable_prog_bar) - assign_metadata(metadata_df, pangenome=pangenome, source=args.source, metatype=args.assign, - omit=args.omit, disable_bar=args.disable_prog_bar) + check_pangenome_metadata( + pangenome, + source=args.source, + metatype=args.assign, + force=args.force, + disable_bar=args.disable_prog_bar, + ) + assign_metadata( + metadata_df, + pangenome=pangenome, + source=args.source, + metatype=args.assign, + omit=args.omit, + disable_bar=args.disable_prog_bar, + ) logging.getLogger().info("Metadata assignment Done") - write_pangenome(pangenome, pangenome.file, force=args.force, disable_bar=args.disable_prog_bar) + write_pangenome( + pangenome, pangenome.file, force=args.force, disable_bar=args.disable_prog_bar + ) -def check_metadata_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser ) -> str: +def check_metadata_arguments( + args: argparse.Namespace, parser: argparse.ArgumentParser +) -> str: """ Check the arguments provided for and raise errors if they are incompatible or missing. @@ -170,8 +236,9 @@ def check_metadata_arguments(args: argparse.Namespace, parser: argparse.Argument for required_arg in ["metadata", "source", "assign"]: if getattr(args, required_arg) is None: - parser.error(f"Please specify the --{required_arg} argument, either through the command line or the config file.") - + parser.error( + f"Please specify the --{required_arg} argument, either through the command line or the config file." + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -182,7 +249,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("metadata", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "metadata", formatter_class=argparse.RawTextHelpFormatter + ) parser_meta(parser) return parser @@ -193,38 +262,93 @@ def parser_meta(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - required.add_argument('-m', '--metadata', required=False, type=Path, nargs='?', - help='Metadata in TSV file. See our github for more detail about format') - required.add_argument("-s", "--source", required=False, type=str, nargs="?", - help='Name of the metadata source') - required.add_argument("-a", "--assign", required=False, type=str, nargs="?", - choices=["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"], - help="Select to which pangenome element metadata will be assigned") + required = parser.add_argument_group( + title="Required arguments", + description="All of the following arguments are required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) + required.add_argument( + "-m", + "--metadata", + required=False, + type=Path, + nargs="?", + help="Metadata in TSV file. See our github for more detail about format", + ) + required.add_argument( + "-s", + "--source", + required=False, + type=str, + nargs="?", + help="Name of the metadata source", + ) + required.add_argument( + "-a", + "--assign", + required=False, + type=str, + nargs="?", + choices=["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"], + help="Select to which pangenome element metadata will be assigned", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--omit", required=False, action="store_true", - help="Allow to pass if a key in metadata is not find in pangenome") + optional.add_argument( + "--omit", + required=False, + action="store_true", + help="Allow to pass if a key in metadata is not find in pangenome", + ) -if __name__ == '__main__': +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import check_log, set_verbosity_level main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_meta(main_parser) common = main_parser.add_argument_group(title="Common argument") - common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], - help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") - common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", - help="disables the progress bars") - common.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - common.add_argument('-f', '--force', action="store_true", - help="Force writing in output directory and in pangenome output file.") + common.add_argument( + "--verbose", + required=False, + type=int, + default=1, + choices=[0, 1, 2], + help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)", + ) + common.add_argument( + "--log", + required=False, + type=check_log, + default="stdout", + help="log output file", + ) + common.add_argument( + "-d", + "--disable_prog_bar", + required=False, + action="store_true", + help="disables the progress bars", + ) + common.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + common.add_argument( + "-f", + "--force", + action="store_true", + help="Force writing in output directory and in pangenome output file.", + ) set_verbosity_level(main_parser.parse_args()) launch(main_parser.parse_args()) diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 361e4b5a..11ef61d3 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -33,7 +33,9 @@ def __init__(self, source: str, **kwargs): :raises Exception: Metadata is empty """ if not isinstance(source, str): - raise TypeError(f"Metadata source name must be a string. Given type {type(source)}") + raise TypeError( + f"Metadata source name must be a string. Given type {type(source)}" + ) if source == "": raise ValueError("Metadata source name should not be empty.") self.source = source @@ -85,7 +87,7 @@ def to_dict(self) -> Dict[str, Any]: @staticmethod def _join_list(attr_list: Union[str, List[str]]): - return ','.join(attr_list) + return ",".join(attr_list) class MetaFeatures: @@ -100,14 +102,12 @@ class MetaFeatures: """ def __init__(self): - """Constructor method - """ + """Constructor method""" self._metadata_getter = defaultdict(dict) @property def number_of_metadata(self) -> int: - """Get the number of metadata associated to feature - """ + """Get the number of metadata associated to feature""" return sum(len(meta_dict) for meta_dict in self._metadata_getter.values()) @property @@ -122,7 +122,7 @@ def metadata(self) -> Generator[Metadata, None, None]: @property def sources(self) -> Generator[str, None, None]: - """ Get all metadata source in gene family + """Get all metadata source in gene family :return: Metadata source """ @@ -133,7 +133,7 @@ def formatted_metadata_dict(self, separator: str = "|") -> Dict[str, str]: Format metadata by combining source and field values. Given an object with metadata, this function creates a new dictionary where the keys - are formatted as 'source_field'. In some cases, it is possible to have multiple values for the same field, + are formatted as 'source_field'. In some cases, it is possible to have multiple values for the same field, in this situation, values are concatenated with the specified separator. :param separator: The separator used to join multiple values for the same field (default is '|'). @@ -144,12 +144,17 @@ def formatted_metadata_dict(self, separator: str = "|") -> Dict[str, str]: for field in metadata.fields: value = str(getattr(metadata, field)) if separator in value: - raise ValueError(f"Metadata {field}={value} associated to {self} from source {metadata.source} " - f"contains in its value the separator character '{separator}'. " - "Please change separator in order to be able to write the metadata.") + raise ValueError( + f"Metadata {field}={value} associated to {self} from source {metadata.source} " + f"contains in its value the separator character '{separator}'. " + "Please change separator in order to be able to write the metadata." + ) source_field_2_values[f"{metadata.source}_{field}"].append(str(value)) - return {source_field: separator.join(values) for source_field, values in source_field_2_values.items()} + return { + source_field: separator.join(values) + for source_field, values in source_field_2_values.items() + } def add_metadata(self, metadata: Metadata, metadata_id: int = None) -> None: """Add metadata to metadata getter @@ -159,7 +164,9 @@ def add_metadata(self, metadata: Metadata, metadata_id: int = None) -> None: :raises AssertionError: Source or metadata is not with the correct type """ - assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" + assert isinstance( + metadata, Metadata + ), f"Metadata is not with type Metadata but with {type(metadata)}" # Metadata_id should not already exist because the metadata are added from scratch to a new source, # or they are ridden @@ -175,8 +182,10 @@ def add_metadata(self, metadata: Metadata, metadata_id: int = None) -> None: except KeyError: self._metadata_getter[metadata.source][metadata_id] = metadata else: - raise KeyError(f"A metadata with ID {metadata_id} already exist " - f"for source {metadata.source} in {str(self)}") + raise KeyError( + f"A metadata with ID {metadata_id} already exist " + f"for source {metadata.source} in {str(self)}" + ) def get_metadata(self, source: str, metadata_id: int = None) -> Metadata: """Get metadata from metadata getter by its source and identifier @@ -189,8 +198,10 @@ def get_metadata(self, source: str, metadata_id: int = None) -> Metadata: try: metadata = self._metadata_getter[source][metadata_id] except KeyError: - raise KeyError(f"No metadata exist with ID {metadata_id}" - f"for source {source} in {str(self)}") + raise KeyError( + f"No metadata exist with ID {metadata_id}" + f"for source {source} in {str(self)}" + ) else: return metadata @@ -203,8 +214,12 @@ def get_metadata_by_source(self, source: str) -> Union[Dict[int, Metadata], None :raises AssertionError: Source is not with the correct type """ - assert isinstance(source, str), f"Source is not a string but with {type(source)}" - return self._metadata_getter.get(source) # if source in _metadata_getter return value else None + assert isinstance( + source, str + ), f"Source is not a string but with {type(source)}" + return self._metadata_getter.get( + source + ) # if source in _metadata_getter return value else None def get_metadata_by_attribute(self, **kwargs) -> Generator[Metadata, None, None]: """Get metadata by one or more attribute @@ -216,7 +231,10 @@ def get_metadata_by_attribute(self, **kwargs) -> Generator[Metadata, None, None] if hasattr(metadata, attr): # BUG If value is a list, the join block detection. # It would be better to keep a list and change in writing and reading metadata to join the list - if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + if ( + getattr(metadata, attr, None) in value + or getattr(metadata, attr, None) == value + ): yield metadata def del_metadata_by_source(self, source: str): @@ -227,20 +245,26 @@ def del_metadata_by_source(self, source: str): :raises AssertionError: Source is not with the correct type :raises KeyError: Source does not belong in the MetaFeature """ - assert isinstance(source, str), f"Source is not a string but with {type(source)}" + assert isinstance( + source, str + ), f"Source is not a string but with {type(source)}" if self._metadata_getter.pop(source, None) is None: - logging.getLogger("PPanGGOLiN").warning("The source to remove does not exist") + logging.getLogger("PPanGGOLiN").warning( + "The source to remove does not exist" + ) def del_metadata_by_attribute(self, **kwargs): - """Remove a source from the feature - """ + """Remove a source from the feature""" for source, metadata_dict in self._metadata_getter.items(): for attr, value in kwargs.items(): for meta_id, metadata in metadata_dict.items(): if hasattr(metadata, attr): # BUG If value is a list, the join block detection. # It would be better to keep a list and change in writing and reading metadata to join the list - if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + if ( + getattr(metadata, attr, None) in value + or getattr(metadata, attr, None) == value + ): del self._metadata_getter[source][meta_id] def max_metadata_by_source(self) -> Tuple[str, int]: @@ -248,7 +272,9 @@ def max_metadata_by_source(self) -> Tuple[str, int]: :return: Name of the source with the maximum annotation and the number of metadata corresponding """ - max_source, max_meta = max(self._metadata_getter.items(), key=lambda x: len(x[1])) + max_source, max_meta = max( + self._metadata_getter.items(), key=lambda x: len(x[1]) + ) return max_source, len(max_meta) def has_metadata(self) -> bool: diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index b2b5b0c6..ad7ca25a 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -14,7 +14,7 @@ def compute_genomes_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: - """ Compute the genomes' fluidity from the pangenome + """Compute the genomes' fluidity from the pangenome :param pangenome: pangenome which will be used to compute the genomes' fluidity :param disable_bar: Disable the progress bar @@ -24,22 +24,36 @@ def compute_genomes_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> # check statuses and load info logging.getLogger("PPanGGOLiN").info("Check information in pangenome") - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} + check_pangenome_info( + pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar + ) + fluidity_dict = {"all": None, "shell": None, "cloud": None, "accessory": None} for subset in fluidity_dict.keys(): - logging.getLogger("PPanGGOLiN").debug(f"Compute binaries for {subset} partition") + logging.getLogger("PPanGGOLiN").debug( + f"Compute binaries for {subset} partition" + ) pangenome.compute_org_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms g_sum = 0 logging.getLogger("PPanGGOLiN").debug("Get number of families in each genomes") org2_nb_fam = nb_fam_per_org(pangenome, disable_bar) - logging.getLogger("PPanGGOLiN").info(f"Compute rate of unique family for each genome combination in {subset}") - for c_organisms in tqdm(list(combinations(pangenome.organisms, 2)), unit="combination", disable=disable_bar): - tot_fam = org2_nb_fam.get(c_organisms[0].name) + org2_nb_fam.get(c_organisms[1].name) + logging.getLogger("PPanGGOLiN").info( + f"Compute rate of unique family for each genome combination in {subset}" + ) + for c_organisms in tqdm( + list(combinations(pangenome.organisms, 2)), + unit="combination", + disable=disable_bar, + ): + tot_fam = org2_nb_fam.get(c_organisms[0].name) + org2_nb_fam.get( + c_organisms[1].name + ) common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 if tot_fam > 0 and common_fam > 0: g_sum += (tot_fam - 2 * common_fam) / tot_fam - fluidity_dict[subset] = (2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1))) * g_sum + fluidity_dict[subset] = ( + 2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1)) + ) * g_sum return fluidity_dict @@ -53,7 +67,7 @@ def nb_fam_per_org(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: Dictionary with organisms as key and number of families as value """ org2_nb_fam = dict() - for org in tqdm(pangenome.organisms, unit='genome', disable=disable_bar): + for org in tqdm(pangenome.organisms, unit="genome", disable=disable_bar): org2_nb_fam[org.name] = popcount(org.bitarray) return org2_nb_fam @@ -64,8 +78,9 @@ def nb_fam_per_org(pangenome: Pangenome, disable_bar: bool = False) -> dict: # TODO Function to compute mash distance between genome for normalization + def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: - """ Compute the family fluidity from the pangenome + """Compute the family fluidity from the pangenome :param pangenome: pangenome which will be used to compute the genomes' fluidity :param disable_bar: Disable the progress bar @@ -74,23 +89,38 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ # check statuses and load info logging.getLogger("PPanGGOLiN").info("Check information in pangenome") - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} + check_pangenome_info( + pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar + ) + fluidity_dict = {"all": None, "shell": None, "cloud": None, "accessory": None} for subset in fluidity_dict.keys(): - logging.getLogger("PPanGGOLiN").debug(f"Compute binaries for {subset} partition") + logging.getLogger("PPanGGOLiN").debug( + f"Compute binaries for {subset} partition" + ) pangenome.compute_family_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms f_sum = 0 logging.getLogger("PPanGGOLiN").debug("Get number of families in each genome") fam_2_nb_org = nb_org_per_fam(pangenome, disable_bar) - logging.getLogger("PPanGGOLiN").info("Compute rate of unique organism for each family combination") - for c_fam in tqdm(list(combinations(pangenome.gene_families, 2)), unit="combination", disable=disable_bar): + logging.getLogger("PPanGGOLiN").info( + "Compute rate of unique organism for each family combination" + ) + for c_fam in tqdm( + list(combinations(pangenome.gene_families, 2)), + unit="combination", + disable=disable_bar, + ): tot_org = fam_2_nb_org.get(c_fam[0].name) + fam_2_nb_org.get(c_fam[1].name) common_fam = popcount(c_fam[0].bitarray & c_fam[1].bitarray) - 1 if tot_org > 0 and common_fam > 0: f_sum += (tot_org - 2 * common_fam) / tot_org - fluidity_dict[subset] = (2 / (pangenome.number_of_gene_families * - (pangenome.number_of_gene_families - 1))) * f_sum + fluidity_dict[subset] = ( + 2 + / ( + pangenome.number_of_gene_families + * (pangenome.number_of_gene_families - 1) + ) + ) * f_sum return fluidity_dict @@ -104,6 +134,6 @@ def nb_org_per_fam(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: Dictionary with organisms as key and number of families as value """ fam_2_nb_org = dict() - for fam in tqdm(pangenome.gene_families, unit='gene families', disable=disable_bar): + for fam in tqdm(pangenome.gene_families, unit="gene families", disable=disable_bar): fam_2_nb_org[fam.name] = popcount(fam.bitarray) return fam_2_nb_org diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index 71d60b16..6dce8c5f 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -10,11 +10,16 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.formats.readBinaries import read_info +from ppanggolin.formats.readBinaries import read_info from ppanggolin.metrics.fluidity import compute_genomes_fluidity, fam_fluidity -def check_already_computed_metric(pangenome: Pangenome, genomes_fluidity: bool = False, print_metric:bool = True, recompute:bool = False) : +def check_already_computed_metric( + pangenome: Pangenome, + genomes_fluidity: bool = False, + print_metric: bool = True, + recompute: bool = False, +): """ Check if one of the asked metrics is not already computed @@ -25,16 +30,23 @@ def check_already_computed_metric(pangenome: Pangenome, genomes_fluidity: bool = """ with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info - if genomes_fluidity and 'genomes_fluidity' in info_group._v_attrs._f_list(): - logging.getLogger("PPanGGOLiN").warning("Genome fluidity has been already computed. " - "Use --force if you want to compute it again") + if genomes_fluidity and "genomes_fluidity" in info_group._v_attrs._f_list(): + logging.getLogger("PPanGGOLiN").warning( + "Genome fluidity has been already computed. " + "Use --force if you want to compute it again" + ) if print_metric and not recompute: - print_computed_metric(info_group._v_attrs['genomes_fluidity']) + print_computed_metric(info_group._v_attrs["genomes_fluidity"]) return True return False -def compute_metrics(pangenome: Pangenome, genomes_fluidity: bool = False, families_fluidity: bool = False, disable_bar: bool = False) -> dict: +def compute_metrics( + pangenome: Pangenome, + genomes_fluidity: bool = False, + families_fluidity: bool = False, + disable_bar: bool = False, +) -> dict: """Compute the metrics :param pangenome: pangenome which will be used to compute the genomes' fluidity @@ -47,14 +59,18 @@ def compute_metrics(pangenome: Pangenome, genomes_fluidity: bool = False, famili metrics_dict = {} if genomes_fluidity: - metrics_dict['genomes_fluidity'] = compute_genomes_fluidity(pangenome, disable_bar) + metrics_dict["genomes_fluidity"] = compute_genomes_fluidity( + pangenome, disable_bar + ) if families_fluidity: - metrics_dict['families_fluidity'] = fam_fluidity(pangenome, disable_bar) + metrics_dict["families_fluidity"] = fam_fluidity(pangenome, disable_bar) return metrics_dict -def write_metrics(pangenome: Pangenome, metrics_dict: dict, print_metrics: bool = False): +def write_metrics( + pangenome: Pangenome, metrics_dict: dict, print_metrics: bool = False +): """ Write the metrics computed in the pangenome @@ -65,13 +81,16 @@ def write_metrics(pangenome: Pangenome, metrics_dict: dict, print_metrics: bool with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info logging.getLogger("PPanGGOLiN").debug("H5f open") - if 'genomes_fluidity' in metrics_dict.keys(): - logging.getLogger("PPanGGOLiN").info("Writing genome fluidity of the pangenome.") - info_group._v_attrs.genomes_fluidity = metrics_dict['genomes_fluidity'] + if "genomes_fluidity" in metrics_dict.keys(): + logging.getLogger("PPanGGOLiN").info( + "Writing genome fluidity of the pangenome." + ) + info_group._v_attrs.genomes_fluidity = metrics_dict["genomes_fluidity"] # After all metrics have been written if print_metrics: - print_computed_metric(metrics_dict['genomes_fluidity']) + print_computed_metric(metrics_dict["genomes_fluidity"]) + def print_computed_metric(metrics_dict: dict): """ @@ -79,10 +98,15 @@ def print_computed_metric(metrics_dict: dict): :params metrics_dict: Dict of computed metrics """ - metric_dict = {"Genomes_fluidity": {key:round(val,3) for key, val in metrics_dict.items()}} - metric_yaml = yaml.dump(metric_dict, default_flow_style=False, sort_keys=False, indent=4) + metric_dict = { + "Genomes_fluidity": {key: round(val, 3) for key, val in metrics_dict.items()} + } + metric_yaml = yaml.dump( + metric_dict, default_flow_style=False, sort_keys=False, indent=4 + ) print(metric_yaml) + def launch(args: argparse.Namespace): """ Command launcher @@ -97,12 +121,23 @@ def launch(args: argparse.Namespace): print_metrics = not args.no_print_info - logging.getLogger("PPanGGOLiN").debug("Check if one of the metrics was already computed") - is_metric_already_computed = check_already_computed_metric(pangenome, genomes_fluidity=args.genome_fluidity, print_metric=print_metrics, recompute=args.recompute_metrics) + logging.getLogger("PPanGGOLiN").debug( + "Check if one of the metrics was already computed" + ) + is_metric_already_computed = check_already_computed_metric( + pangenome, + genomes_fluidity=args.genome_fluidity, + print_metric=print_metrics, + recompute=args.recompute_metrics, + ) if not is_metric_already_computed or args.recompute_metrics: logging.getLogger("PPanGGOLiN").info("Metrics computation begin") - metrics_dictionary = compute_metrics(pangenome, disable_bar=args.disable_prog_bar, genomes_fluidity=args.genome_fluidity) + metrics_dictionary = compute_metrics( + pangenome, + disable_bar=args.disable_prog_bar, + genomes_fluidity=args.genome_fluidity, + ) logging.getLogger("PPanGGOLiN").info("Metrics computation done") write_metrics(pangenome, metrics_dictionary, print_metrics=print_metrics) @@ -116,7 +151,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("metrics", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "metrics", formatter_class=argparse.RawTextHelpFormatter + ) parser_metrics(parser) return parser @@ -127,32 +164,55 @@ def parser_metrics(parser: argparse.ArgumentParser): :param parser: Argument parser for the 'metrics' command. """ - required = parser.add_argument_group(title="Required arguments", - description="Specify the required argument:") - required.add_argument('-p', '--pangenome', required=False, type=Path, - help="Path to the pangenome .h5 file") - - onereq = parser.add_argument_group(title="Input file", - description="Choose one of the following arguments:") - onereq.add_argument('--genome_fluidity', required=False, action="store_true", default=False, - help="Compute the pangenome genomic fluidity.") - - optional = parser.add_argument_group(title="Optional arguments", - description="Specify optional arguments with default values:") - optional.add_argument('--no_print_info', required=False, action="store_true", default=False, - help="Suppress printing the metrics result. " - "Metrics are saved in the pangenome and viewable using 'ppanggolin info'.") - optional.add_argument('--recompute_metrics', action="store_true", - help="Force re-computation of metrics if already computed.") - - -if __name__ == '__main__': + required = parser.add_argument_group( + title="Required arguments", description="Specify the required argument:" + ) + required.add_argument( + "-p", + "--pangenome", + required=False, + type=Path, + help="Path to the pangenome .h5 file", + ) + + onereq = parser.add_argument_group( + title="Input file", description="Choose one of the following arguments:" + ) + onereq.add_argument( + "--genome_fluidity", + required=False, + action="store_true", + default=False, + help="Compute the pangenome genomic fluidity.", + ) + + optional = parser.add_argument_group( + title="Optional arguments", + description="Specify optional arguments with default values:", + ) + optional.add_argument( + "--no_print_info", + required=False, + action="store_true", + default=False, + help="Suppress printing the metrics result. " + "Metrics are saved in the pangenome and viewable using 'ppanggolin info'.", + ) + optional.add_argument( + "--recompute_metrics", + action="store_true", + help="Force re-computation of metrics if already computed.", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_metrics(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index fe2df7a0..81dd900e 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -25,8 +25,10 @@ def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): :param force: Allow to force write on pangenome by erasing already present modules """ if pangenome.status["modules"] == "inFile" and not force: - raise Exception("You are trying to detect modules on a pangenome which already has predicted modules. " - "If you REALLY want to do that, use --force (it will erase modules previously predicted).") + raise Exception( + "You are trying to detect modules on a pangenome which already has predicted modules. " + "If you REALLY want to do that, use --force (it will erase modules previously predicted)." + ) elif pangenome.status["modules"] == "inFile" and force: erase_pangenome(pangenome, modules=True) @@ -34,32 +36,44 @@ def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): def compute_mod_graph(pangenome: Pangenome, t: int = 1, disable_bar: bool = False): """ Computes a graph using all provided genomes with a transitive closure of size t - + :param pangenome: pangenome with organisms to compute the graph :param t: the size of the transitive closure :param disable_bar: whether to show a progress bar or not """ g = nx.Graph() - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for org in tqdm( + pangenome.organisms, + total=pangenome.number_of_organisms, + unit="genome", + disable=disable_bar, + ): for contig in org.contigs: if contig.number_of_genes > 0: start_gene = contig[0] g.add_node(start_gene.family) add_gene(g.nodes[start_gene.family], start_gene, fam_split=False) for i, gene in enumerate(contig.genes): - for j, a_gene in enumerate(contig.get_genes(i + 1, i + t + 2, outrange_ok=True), start=i + 1): + for j, a_gene in enumerate( + contig.get_genes(i + 1, i + t + 2, outrange_ok=True), + start=i + 1, + ): g.add_edge(gene.family, a_gene.family) edge = g[gene.family][a_gene.family] add_gene(edge, gene) add_gene(edge, a_gene) - if j == i + t + 1 or i == 0: # if it's the last gene of the series, or the first series + if ( + j == i + t + 1 or i == 0 + ): # if it's the last gene of the series, or the first series add_gene(g.nodes[a_gene.family], a_gene, fam_split=False) return g -def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int = 2, size: int = 3): +def compute_modules( + g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int = 2, size: int = 3 +): """ Computes modules using a graph built by :func:`ppanggolin.mod.module.compute_mod_graph` and different parameters defining how restrictive the modules will be. @@ -77,8 +91,9 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int modules = set() c = 0 for comp in connected_components(g, removed, weight): - if len(comp) >= size and not any(fam.named_partition == "persistent" and - fam not in multi for fam in comp): + if len(comp) >= size and not any( + fam.named_partition == "persistent" and fam not in multi for fam in comp + ): # keep only the modules with at least 'size' non-multigenic genes and # remove 'persistent' and non-multigenic modules modules.add(Module(module_id=c, families=comp)) @@ -86,8 +101,16 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int return modules -def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = 3, min_presence: int = 2, - transitive: int = 4, jaccard: float = 0.85, force: bool = False, disable_bar: bool = False): +def predict_modules( + pangenome: Pangenome, + dup_margin: float = 0.05, + size: int = 3, + min_presence: int = 2, + transitive: int = 4, + jaccard: float = 0.85, + force: bool = False, + disable_bar: bool = False, +): """ Main function to predict module @@ -102,16 +125,24 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = """ # check statuses and load info check_pangenome_former_modules(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=True, - disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_partitions=True, + disable_bar=disable_bar, + ) # compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger("PPanGGOLiN").info("Building the graph...") g = compute_mod_graph(pangenome, t=transitive, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info( - f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") - logging.getLogger("PPanGGOLiN").info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in" + ) + logging.getLogger("PPanGGOLiN").info( + f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges" + ) start_time = time.time() # get all multigenic gene families @@ -125,8 +156,12 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = fams |= set(mod.families) pangenome.add_module(mod) - logging.getLogger("PPanGGOLiN").info(f"There are {len(fams)} families among {len(modules)} modules") - logging.getLogger("PPanGGOLiN").info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"There are {len(fams)} families among {len(modules)} modules" + ) + logging.getLogger("PPanGGOLiN").info( + f"Computing modules took {round(time.time() - start_time, 2)} seconds" + ) pangenome.status["modules"] = "Computed" pangenome.parameters["module"] = {} @@ -145,10 +180,19 @@ def launch(args: argparse.Namespace): """ pangenome = Pangenome() pangenome.add_file(args.pangenome) - predict_modules(pangenome=pangenome, dup_margin=args.dup_margin, size=args.size, - min_presence=args.min_presence, transitive=args.transitive, jaccard=args.jaccard, force=args.force, - disable_bar=args.disable_prog_bar) - write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) + predict_modules( + pangenome=pangenome, + dup_margin=args.dup_margin, + size=args.size, + min_presence=args.min_presence, + transitive=args.transitive, + jaccard=args.jaccard, + force=args.force, + disable_bar=args.disable_prog_bar, + ) + write_pangenome( + pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -159,7 +203,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("module", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "module", formatter_class=argparse.RawTextHelpFormatter + ) parser_module(parser) return parser @@ -170,36 +216,76 @@ def parser_module(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--size", required=False, type=int, default=3, - help="Minimal number of gene family in a module") - optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of genomes in which the family must have multiple genes" - " for it to be considered 'duplicated'") - optional.add_argument("-m", "--min_presence", required=False, type=int, default=2, - help="Minimum number of times the module needs to be present in the pangenome to be reported." - " Increasing it will improve precision but lower sensitivity.") - optional.add_argument("-t", "--transitive", required=False, type=int, default=4, - help="Size of the transitive closure used to build the graph. " - "This indicates the number of non related genes allowed in-between two related genes. " - "Increasing it will improve precision but lower sensitivity a little.") - optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, - help="minimum jaccard similarity used to filter edges between gene families. " - "Increasing it will improve precision but lower sensitivity a lot.") - - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - - -if __name__ == '__main__': + optional.add_argument( + "--size", + required=False, + type=int, + default=3, + help="Minimal number of gene family in a module", + ) + optional.add_argument( + "--dup_margin", + required=False, + type=restricted_float, + default=0.05, + help="minimum ratio of genomes in which the family must have multiple genes" + " for it to be considered 'duplicated'", + ) + optional.add_argument( + "-m", + "--min_presence", + required=False, + type=int, + default=2, + help="Minimum number of times the module needs to be present in the pangenome to be reported." + " Increasing it will improve precision but lower sensitivity.", + ) + optional.add_argument( + "-t", + "--transitive", + required=False, + type=int, + default=4, + help="Size of the transitive closure used to build the graph. " + "This indicates the number of non related genes allowed in-between two related genes. " + "Increasing it will improve precision but lower sensitivity a little.", + ) + optional.add_argument( + "-s", + "--jaccard", + required=False, + type=restricted_float, + default=0.85, + help="minimum jaccard similarity used to filter edges between gene families. " + "Increasing it will improve precision but lower sensitivity a lot.", + ) + + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_module(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 28c48074..f556fb49 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -32,10 +32,18 @@ samples = [] -def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_dispersion: bool = False, kval: int = 3, - seed: int = 42, init: str = "param_file", keep_files: bool = False, itermax: int = 100, - just_log_likelihood: bool = False) \ - -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: +def run_partitioning( + nem_dir_path: Path, + nb_org: int, + beta: float = 2.5, + free_dispersion: bool = False, + kval: int = 3, + seed: int = 42, + init: str = "param_file", + keep_files: bool = False, + itermax: int = 100, + just_log_likelihood: bool = False, +) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ Main function to make partitioning @@ -88,20 +96,55 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di init_random, init_param_file = range(1, 3) logging.getLogger("PPanGGOLiN").debug("Running NEM...") logging.getLogger("PPanGGOLiN").debug( - [nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, - convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, - init_param_file if init in ["param_file", "init_from_old"] else init_random, - nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", - nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), - seed]) - nem_stats.nem(Fname=nem_dir_path.as_posix().encode('ascii') + b"/nem_file", nk=kval, algo=algo, beta=beta, - convergence=convergence, convergence_th=convergence_th, format=b"fuzzy", it_max=itermax, - dolog=True, model_family=model, proportion=proportion, dispersion=variance_model, - init_mode=init_param_file if init in ["param_file", "init_from_old"] else init_random, - init_file=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode( - 'ascii') + b".m", - out_file_prefix=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), - seed=seed) + [ + nem_dir_path.as_posix().encode("ascii") + b"/nem_file", + kval, + algo, + beta, + convergence, + convergence_th, + b"fuzzy", + itermax, + True, + model, + proportion, + variance_model, + init_param_file if init in ["param_file", "init_from_old"] else init_random, + nem_dir_path.as_posix().encode("ascii") + + b"/nem_file_init_" + + str(kval).encode("ascii") + + b".m", + nem_dir_path.as_posix().encode("ascii") + + b"/nem_file_" + + str(kval).encode("ascii"), + seed, + ] + ) + nem_stats.nem( + Fname=nem_dir_path.as_posix().encode("ascii") + b"/nem_file", + nk=kval, + algo=algo, + beta=beta, + convergence=convergence, + convergence_th=convergence_th, + format=b"fuzzy", + it_max=itermax, + dolog=True, + model_family=model, + proportion=proportion, + dispersion=variance_model, + init_mode=( + init_param_file if init in ["param_file", "init_from_old"] else init_random + ), + init_file=nem_dir_path.as_posix().encode("ascii") + + b"/nem_file_init_" + + str(kval).encode("ascii") + + b".m", + out_file_prefix=nem_dir_path.as_posix().encode("ascii") + + b"/nem_file_" + + str(kval).encode("ascii"), + seed=seed, + ) logging.getLogger("PPanGGOLiN").debug("After running NEM...") @@ -113,7 +156,9 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di # logging.getLogger("PPanGGOLiN").warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") no_nem = True else: - logging.getLogger("PPanGGOLiN").debug(f"No NEM output file found: {nem_out_path.absolute().as_posix()}") + logging.getLogger("PPanGGOLiN").debug( + f"No NEM output file found: {nem_out_path.absolute().as_posix()}" + ) no_nem = True index_fam = [] @@ -126,8 +171,11 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di log_likelihood = None entropy = None try: - with open(nem_dir_path / f"nem_file_{str(kval)}.uf") as partitions_nem_file, \ - open(nem_dir_path / f"nem_file_{str(kval)}.mf") as parameters_nem_file: + with open( + nem_dir_path / f"nem_file_{str(kval)}.uf" + ) as partitions_nem_file, open( + nem_dir_path / f"nem_file_{str(kval)}.mf" + ) as parameters_nem_file: parameters = parameters_nem_file.readlines() log_likelihood = float(parameters[2].split()[3]) @@ -137,7 +185,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di for k, line in enumerate(parameters[-kval:]): vector = line.split() mu_k = [bool(float(mu_kj)) for mu_kj in vector[0:nb_org]] - epsilon_k = [float(epsilon_kj) for epsilon_kj in vector[nb_org + 1:]] + epsilon_k = [float(epsilon_kj) for epsilon_kj in vector[nb_org + 1 :]] proportion = float(vector[nb_org]) sum_mu_k.append(sum(mu_k)) sum_epsilon_k.append(sum(epsilon_k)) @@ -157,17 +205,28 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di for i, line in enumerate(partitions_nem_file): elements = [float(el) for el in line.split()] if just_log_likelihood: - entropy += sum([math.log(float(el)) * float(el) if float(el) > 0 else 0 for el in elements]) + entropy += sum( + [ + math.log(float(el)) * float(el) if float(el) > 0 else 0 + for el in elements + ] + ) else: max_prob = max([float(el) for el in elements]) - positions_max_prob = [pos for pos, prob in enumerate(elements) if prob == max_prob] + positions_max_prob = [ + pos for pos, prob in enumerate(elements) if prob == max_prob + ] if len(positions_max_prob) > 1 or max_prob < 0.5: - partitions_list[i] = "S_" # SHELL in case of doubt gene families is attributed to shell + partitions_list[i] = ( + "S_" # SHELL in case of doubt gene families is attributed to shell + ) else: partitions_list[i] = parti[positions_max_prob.pop()] except OSError: - logging.getLogger("PPanGGOLiN").warning("Partitioning did not work (the number of genomes used is probably too low), " - f"see logs here to obtain more details {nem_dir_path.as_posix()}") + logging.getLogger("PPanGGOLiN").warning( + "Partitioning did not work (the number of genomes used is probably too low), " + f"see logs here to obtain more details {nem_dir_path.as_posix()}" + ) return {}, None, None # return empty objects except ValueError: @@ -180,8 +239,9 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di return dict(zip(index_fam, partitions_list)), all_parameters, log_likelihood -def nem_single(args: List[Tuple[Path, int, float, bool, int, int, str, bool, int, bool]]) \ - -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: +def nem_single( + args: List[Tuple[Path, int, float, bool, int, int, str, bool, int, bool]] +) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ Allow to run partitioning in multiprocessing to evaluate partition number @@ -192,10 +252,17 @@ def nem_single(args: List[Tuple[Path, int, float, bool, int, int, str, bool, int return run_partitioning(*args) -def partition_nem(index: int, kval: int, beta: float = 2.5, sm_degree: int = 10, - free_dispersion: bool = False, seed: int = 42, init: str = "param_file", - tmpdir: Path = None, keep_tmp_files: bool = False) \ - -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: +def partition_nem( + index: int, + kval: int, + beta: float = 2.5, + sm_degree: int = 10, + free_dispersion: bool = False, + seed: int = 42, + init: str = "param_file", + tmpdir: Path = None, + keep_tmp_files: bool = False, +) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ :param index: Index of the sample group @@ -213,13 +280,25 @@ def partition_nem(index: int, kval: int, beta: float = 2.5, sm_degree: int = 10, currtmpdir = tmpdir / f"{str(index)}" # unique directory name samp = samples[index] # org_samples accessible because it is a global variable. - edges_weight, nb_fam = write_nem_input_files(tmpdir=currtmpdir, organisms=samp, sm_degree=sm_degree) - return run_partitioning(currtmpdir, len(samp), beta * (nb_fam / edges_weight), free_dispersion, kval=kval, - seed=seed, init=init, keep_files=keep_tmp_files) - - -def nem_samples(pack: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: - """ run partitioning + edges_weight, nb_fam = write_nem_input_files( + tmpdir=currtmpdir, organisms=samp, sm_degree=sm_degree + ) + return run_partitioning( + currtmpdir, + len(samp), + beta * (nb_fam / edges_weight), + free_dispersion, + kval=kval, + seed=seed, + init=init, + keep_files=keep_tmp_files, + ) + + +def nem_samples( + pack: tuple, +) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: + """run partitioning :param pack: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, kval: int, seed: int, init: str, keep_tmp_files: bool} :return: @@ -227,10 +306,12 @@ def nem_samples(pack: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, return partition_nem(*pack) -def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> Tuple[float, int]: +def write_nem_input_files( + tmpdir: Path, organisms: set, sm_degree: int = 10 +) -> Tuple[float, int]: """ Create and format input files for partitioning with NEM - + :param tmpdir: temporary directory path :param organisms: Set of organism from pangenome :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. @@ -243,11 +324,14 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> with open(tmpdir / "column_org_file", "w") as org_file: org_file.write(" ".join([f'"{org.name}"' for org in organisms]) + "\n") - logging.getLogger("PPanGGOLiN").debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") - with open(tmpdir / "nem_file.str", "w") as str_file, \ - open(tmpdir / "nem_file.index", "w") as index_file, \ - open(tmpdir / "nem_file.nei", "w") as nei_file, \ - open(tmpdir / "nem_file.dat", "w") as dat_file: + logging.getLogger("PPanGGOLiN").debug( + "Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files" + ) + with open(tmpdir / "nem_file.str", "w") as str_file, open( + tmpdir / "nem_file.index", "w" + ) as index_file, open(tmpdir / "nem_file.nei", "w") as nei_file, open( + tmpdir / "nem_file.dat", "w" + ) as dat_file: nei_file.write("1\n") index_fam = {} @@ -255,7 +339,7 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> index_org = {} default_dat = [] for index, org in enumerate(organisms): - default_dat.append('0') + default_dat.append("0") index_org[org] = index for fam in pan.gene_families: @@ -276,19 +360,39 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> neighbor_number = 0 sum_dist_score = 0 for edge in fam.edges: # iter on the family's edges. - coverage = sum([len(gene_list) for org, gene_list in edge.get_organisms_dict().items() if org in organisms]) + coverage = sum( + [ + len(gene_list) + for org, gene_list in edge.get_organisms_dict().items() + if org in organisms + ] + ) if coverage == 0: continue # nothing interesting to write, this edge does not exist with this subset of organisms. distance_score = coverage / len(organisms) sum_dist_score += distance_score - row_fam.append(str(index_fam[edge.target if fam == edge.source else edge.source])) + row_fam.append( + str(index_fam[edge.target if fam == edge.source else edge.source]) + ) row_dist_score.append(str(round(distance_score, 4))) neighbor_number += 1 if neighbor_number > 0 and float(neighbor_number) < sm_degree: total_edges_weight += sum_dist_score - nei_file.write('\t'.join( - [str(item) for sublist in [[index_fam[fam]], [neighbor_number], row_fam, row_dist_score] for item in - sublist]) + "\n") + nei_file.write( + "\t".join( + [ + str(item) + for sublist in [ + [index_fam[fam]], + [neighbor_number], + row_fam, + row_dist_score, + ] + for item in sublist + ] + ) + + "\n" + ) else: nei_file.write(str(index_fam[fam]) + "\t0\n") @@ -296,9 +400,20 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> return total_edges_weight / 2, len(index_fam) -def evaluate_nb_partitions(organisms: set, output: Path = None, sm_degree: int = 10, free_dispersion: bool = False, - chunk_size: int = 500, krange: list = None, icl_margin: float = 0.05, draw_icl: bool = False, - cpu: int = 1, seed: int = 42, tmpdir: Path = None, disable_bar: bool = False) -> int: +def evaluate_nb_partitions( + organisms: set, + output: Path = None, + sm_degree: int = 10, + free_dispersion: bool = False, + chunk_size: int = 500, + krange: list = None, + icl_margin: float = 0.05, + draw_icl: bool = False, + cpu: int = 1, + seed: int = 42, + tmpdir: Path = None, + disable_bar: bool = False, +) -> int: """ Evaluate the optimal number of partition for the pangenome @@ -329,13 +444,29 @@ def evaluate_nb_partitions(organisms: set, output: Path = None, sm_degree: int = max_icl_k = 0 args_partitionning = [] for k in range(krange[0] - 1, krange[1] + 1): - args_partitionning.append((newtmpdir, len(select_organisms), 0, free_dispersion, - k, seed, "param_file", True, 10, True)) # follow order run_partitionning args + args_partitionning.append( + ( + newtmpdir, + len(select_organisms), + 0, + free_dispersion, + k, + seed, + "param_file", + True, + 10, + True, + ) + ) # follow order run_partitionning args all_log_likelihood = [] if cpu > 1: - bar = tqdm(range(len(args_partitionning)), unit="Number of partitions", disable=disable_bar) - with get_context('fork').Pool(processes=cpu) as p: + bar = tqdm( + range(len(args_partitionning)), + unit="Number of partitions", + disable=disable_bar, + ) + with get_context("fork").Pool(processes=cpu) as p: for result in p.imap_unordered(nem_single, args_partitionning): all_log_likelihood.append(result) bar.update() @@ -351,8 +482,14 @@ def evaluate_nb_partitions(organisms: set, output: Path = None, sm_degree: int = all_lls = defaultdict(float) for k_candidate, log_likelihood, entropy in all_log_likelihood: if log_likelihood is not None: - nb_params = k_candidate * (len(select_organisms) + 1 + (len(select_organisms) if free_dispersion else 1)) - all_bics[k_candidate] = log_likelihood - 0.5 * (math.log(nb_params) * nb_fam) # Calculate BIC + nb_params = k_candidate * ( + len(select_organisms) + + 1 + + (len(select_organisms) if free_dispersion else 1) + ) + all_bics[k_candidate] = log_likelihood - 0.5 * ( + math.log(nb_params) * nb_fam + ) # Calculate BIC all_icls[k_candidate] = all_bics[k_candidate] - entropy all_lls[k_candidate] = log_likelihood @@ -362,45 +499,99 @@ def evaluate_nb_partitions(organisms: set, output: Path = None, sm_degree: int = if len(all_bics) > 3: max_icl_k = max(all_icls, key=all_icls.get) delta_icl = (all_icls[max_icl_k] - min(all_icls.values())) * icl_margin - best_k = min({k for k, icl in all_icls.items() if icl >= all_icls[max_icl_k] - delta_icl and k <= max_icl_k}) + best_k = min( + { + k + for k, icl in all_icls.items() + if icl >= all_icls[max_icl_k] - delta_icl and k <= max_icl_k + } + ) chosen_k = best_k if best_k >= 3 else chosen_k if len(all_bics) > 0 and draw_icl: - traces = [go.Scatter(x=sorted(all_bics.keys()), - y=[all_bics[key] for key in sorted(all_bics.keys())], - name="BIC", - mode="lines+markers"), go.Scatter(x=sorted(all_icls.keys()), - y=[all_icls[key] for key in sorted(all_icls.keys())], - name="ICL", - mode="lines+markers"), - go.Scatter(x=sorted(all_lls.keys()), - y=[all_lls[key] for key in sorted(all_lls.keys())], - name="log likelihood", - mode="lines+markers"), go.Scatter(x=sorted(all_bics.keys()), - y=[all_bics[key] for key in sorted(all_bics.keys())], - name="BIC", - mode="lines+markers"), - go.Scatter(x=sorted(all_icls.keys()), - y=[all_icls[key] for key in sorted(all_icls.keys())], - name="ICL", - mode="lines+markers"), go.Scatter(x=sorted(all_lls.keys()), - y=[all_lls[key] for key in sorted(all_lls.keys())], - name="log likelihood", - mode="lines+markers")] - layout = go.Layout(title='ICL curve (best K is ' + str(best_k) + ', ICL_th= is ' + str(icl_margin) + ")", - titlefont=dict(size=20), - xaxis=dict(title='number of partitions'), - yaxis=dict(title='ICL, BIC, log likelihood'), - plot_bgcolor='#ffffff', - shapes=[dict(type='line', x0=best_k, x1=best_k, y0=0, y1=all_icls[best_k], - line=dict(dict(width=1, dash='dashdot', color="black"))), - dict(type='line', x0=max_icl_k, x1=max_icl_k, y0=0, y1=all_icls[max_icl_k], - line=dict(dict(width=1, dash='dashdot', color="black"))), - dict(type='line', x0=best_k, x1=max_icl_k, y0=all_icls[max_icl_k], - y1=all_icls[max_icl_k], - line=dict(dict(width=1, dash='dashdot', color="black"))), - dict(type='line', x0=2, x1=krange[1], y0=all_icls[best_k], y1=all_icls[best_k], - line=dict(dict(width=1, dash='dashdot', color="black")))]) + traces = [ + go.Scatter( + x=sorted(all_bics.keys()), + y=[all_bics[key] for key in sorted(all_bics.keys())], + name="BIC", + mode="lines+markers", + ), + go.Scatter( + x=sorted(all_icls.keys()), + y=[all_icls[key] for key in sorted(all_icls.keys())], + name="ICL", + mode="lines+markers", + ), + go.Scatter( + x=sorted(all_lls.keys()), + y=[all_lls[key] for key in sorted(all_lls.keys())], + name="log likelihood", + mode="lines+markers", + ), + go.Scatter( + x=sorted(all_bics.keys()), + y=[all_bics[key] for key in sorted(all_bics.keys())], + name="BIC", + mode="lines+markers", + ), + go.Scatter( + x=sorted(all_icls.keys()), + y=[all_icls[key] for key in sorted(all_icls.keys())], + name="ICL", + mode="lines+markers", + ), + go.Scatter( + x=sorted(all_lls.keys()), + y=[all_lls[key] for key in sorted(all_lls.keys())], + name="log likelihood", + mode="lines+markers", + ), + ] + layout = go.Layout( + title="ICL curve (best K is " + + str(best_k) + + ", ICL_th= is " + + str(icl_margin) + + ")", + titlefont=dict(size=20), + xaxis=dict(title="number of partitions"), + yaxis=dict(title="ICL, BIC, log likelihood"), + plot_bgcolor="#ffffff", + shapes=[ + dict( + type="line", + x0=best_k, + x1=best_k, + y0=0, + y1=all_icls[best_k], + line=dict(dict(width=1, dash="dashdot", color="black")), + ), + dict( + type="line", + x0=max_icl_k, + x1=max_icl_k, + y0=0, + y1=all_icls[max_icl_k], + line=dict(dict(width=1, dash="dashdot", color="black")), + ), + dict( + type="line", + x0=best_k, + x1=max_icl_k, + y0=all_icls[max_icl_k], + y1=all_icls[max_icl_k], + line=dict(dict(width=1, dash="dashdot", color="black")), + ), + dict( + type="line", + x0=2, + x1=krange[1], + y0=all_icls[best_k], + y1=all_icls[best_k], + line=dict(dict(width=1, dash="dashdot", color="black")), + ), + ], + ) fig = go.Figure(data=traces, layout=layout) out_plot = output / f"ICL_curve_K{str(best_k)}.html" out_plotly.plot(fig, filename=out_plot.as_posix(), auto_open=False) @@ -414,17 +605,33 @@ def check_pangenome_former_partition(pangenome: Pangenome, force: bool = False): :param force: Allow to force write on Pangenome file """ if pangenome.status["partitioned"] == "inFile" and not force: - raise Exception("You are trying to partition a pangenome already partitioned." - " If you REALLY want to do that, " - "use --force (it will erase partitions and every feature computed from them.") + raise Exception( + "You are trying to partition a pangenome already partitioned." + " If you REALLY want to do that, " + "use --force (it will erase partitions and every feature computed from them." + ) elif pangenome.status["partitioned"] == "inFile" and force: erase_pangenome(pangenome, partition=True) -def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_degree: int = 10, - free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, - icl_margin: float = 0.05, draw_icl: bool = False, cpu: int = 1, seed: int = 42, - tmpdir: Path = None, keep_tmp_files: bool = False, force: bool = False, disable_bar: bool = False): +def partition( + pangenome: Pangenome, + output: Path = None, + beta: float = 2.5, + sm_degree: int = 10, + free_dispersion: bool = False, + chunk_size: int = 500, + kval: int = -1, + krange: list = None, + icl_margin: float = 0.05, + draw_icl: bool = False, + cpu: int = 1, + seed: int = 42, + tmpdir: Path = None, + keep_tmp_files: bool = False, + force: bool = False, + disable_bar: bool = False, +): """ Partitioning the pangenome @@ -452,10 +659,18 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d pan = pangenome if draw_icl and output is None: - raise Exception("Combination of option impossible: " - "You asked to draw the ICL curves but did not provide an output directory!") + raise Exception( + "Combination of option impossible: " + "You asked to draw the ICL curves but did not provide an output directory!" + ) check_pangenome_former_partition(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=True, + disable_bar=disable_bar, + ) organisms = set(pangenome.organisms) if keep_tmp_files: @@ -468,8 +683,10 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d tmp_path = Path(tmp_dir.name) if len(organisms) <= 10: - logging.getLogger("PPanGGOLiN").warning(f"The number of selected genomes is too low ({len(organisms)} " - f"genomes used) to robustly partition the graph") + logging.getLogger("PPanGGOLiN").warning( + f"The number of selected genomes is too low ({len(organisms)} " + f"genomes used) to robustly partition the graph" + ) pangenome.parameters["partition"] = {} pangenome.parameters["partition"]["beta"] = beta @@ -485,10 +702,26 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d pangenome.parameters["partition"]["nb_of_partitions"] = kval if kval < 2: pangenome.parameters["partition"]["# computed nb of partitions"] = True - logging.getLogger("PPanGGOLiN").info("Estimating the optimal number of partitions...") - kval = evaluate_nb_partitions(organisms, output, sm_degree, free_dispersion, chunk_size, kmm, - icl_margin, draw_icl, cpu, seed, tmp_path, disable_bar) - logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}") + logging.getLogger("PPanGGOLiN").info( + "Estimating the optimal number of partitions..." + ) + kval = evaluate_nb_partitions( + organisms, + output, + sm_degree, + free_dispersion, + chunk_size, + kmm, + icl_margin, + draw_icl, + cpu, + seed, + tmp_path, + disable_bar, + ) + logging.getLogger("PPanGGOLiN").info( + f"The number of partitions has been evaluated at {kval}" + ) pangenome.parameters["partition"]["# final nb of partitions"] = kval pangenome.parameters["partition"]["krange"] = kmm @@ -521,8 +754,10 @@ def validate_family(res): for node, nem_class in res[0].items(): cpt_partition[node][nem_class[0]] += 1 sum_partionning = sum(cpt_partition[node].values()) - if (sum_partionning > len(organisms) / chunk_size and max( - cpt_partition[node].values()) >= sum_partionning * 0.5) or (sum_partionning > len(organisms)): + if ( + sum_partionning > len(organisms) / chunk_size + and max(cpt_partition[node].values()) >= sum_partionning * 0.5 + ) or (sum_partionning > len(organisms)): if node not in validated: if max(cpt_partition[node].values()) < sum_partionning * 0.5: cpt_partition[node]["U"] = len(organisms) @@ -548,21 +783,37 @@ def validate_family(res): args = [] # tmpdir, beta, sm_degree, free_dispersion, K, seed for i, _ in enumerate(samples[prev:], start=prev): - args.append((i, kval, beta, sm_degree, free_dispersion, seed, init, - tmp_path, keep_tmp_files)) + args.append( + ( + i, + kval, + beta, + sm_degree, + free_dispersion, + seed, + init, + tmp_path, + keep_tmp_files, + ) + ) logging.getLogger("PPanGGOLiN").info("Launching NEM") - with get_context('fork').Pool(processes=cpu) as p: + with get_context("fork").Pool(processes=cpu) as p: # launch partitioning - bar = tqdm(range(len(args)), unit=" samples partitioned", disable=disable_bar) + bar = tqdm( + range(len(args)), unit=" samples partitioned", disable=disable_bar + ) for result in p.imap_unordered(nem_samples, args): validate_family(result) bar.update() bar.close() - condition += 1 # if len(validated) < pan_size, we will want to resample more. + condition += ( + 1 # if len(validated) < pan_size, we will want to resample more. + ) logging.getLogger("PPanGGOLiN").debug( - f"There are {len(validated)} validated families out of {pansize} families.") + f"There are {len(validated)} validated families out of {pansize} families." + ) p.close() p.join() for fam, data in cpt_partition.items(): @@ -571,20 +822,34 @@ def validate_family(res): # need to compute the median vectors of each partition ??? partitioning_results = [partitioning_results, []] # introduces a 'non feature'. - logging.getLogger("PPanGGOLiN").info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " - f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") + logging.getLogger("PPanGGOLiN").info( + f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " + f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds." + ) else: - edges_weight, nb_fam = write_nem_input_files(tmp_path / f"{str(cpt)}", organisms, - sm_degree=sm_degree) - partitioning_results = run_partitioning(tmp_path / f"{str(cpt)}", len(organisms), - beta * (nb_fam / edges_weight), free_dispersion, kval=kval, seed=seed, - init=init, keep_files=keep_tmp_files) + edges_weight, nb_fam = write_nem_input_files( + tmp_path / f"{str(cpt)}", organisms, sm_degree=sm_degree + ) + partitioning_results = run_partitioning( + tmp_path / f"{str(cpt)}", + len(organisms), + beta * (nb_fam / edges_weight), + free_dispersion, + kval=kval, + seed=seed, + init=init, + keep_files=keep_tmp_files, + ) if partitioning_results == [{}, None, None]: - raise Exception("Statistical partitioning does not work on your data. " - "This usually happens because you used very few (<15) genomes.") + raise Exception( + "Statistical partitioning does not work on your data. " + "This usually happens because you used very few (<15) genomes." + ) cpt += 1 - logging.getLogger("PPanGGOLiN").info(f"Partitioned {len(organisms)} genomes in " - f"{round(time.time() - start_partitioning, 2)} seconds.") + logging.getLogger("PPanGGOLiN").info( + f"Partitioned {len(organisms)} genomes in " + f"{round(time.time() - start_partitioning, 2)} seconds." + ) # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitioning_results[1], chunk_size) @@ -608,9 +873,24 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) global pan pan.add_file(args.pangenome) - partition(pan, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, args.chunk_size, - args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, args.tmpdir, - args.keep_tmp_files, args.force, disable_bar=args.disable_prog_bar) + partition( + pan, + args.output, + args.beta, + args.max_degree_smoothing, + args.free_dispersion, + args.chunk_size, + args.nb_of_partitions, + args.krange, + args.ICL_margin, + args.draw_ICL, + args.cpu, + args.seed, + args.tmpdir, + args.keep_tmp_files, + args.force, + disable_bar=args.disable_prog_bar, + ) logging.getLogger("PPanGGOLiN").debug("Write partition in pangenome") write_pangenome(pan, pan.file, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").debug("Partitioning is finished") @@ -624,7 +904,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("partition", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "partition", formatter_class=argparse.RawTextHelpFormatter + ) parser_partition(parser) return parser @@ -635,57 +917,139 @@ def parser_partition(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome.h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("-b", "--beta", required=False, default=2.5, type=float, - help="beta is the strength of the smoothing using the graph topology during partitioning. " - "0 will deactivate spatial smoothing.") - optional.add_argument("-ms", "--max_degree_smoothing", required=False, default=10, type=float, - help="max. degree of the nodes to be included in the smoothing process.") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path( - f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" - f"_PID{str(os.getpid())}"), - help="Output directory") - optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", - help="use if the dispersion around the centroid vector of each partition during must be free." - " It will be the same for all genomes by default.") - optional.add_argument("-ck", "--chunk_size", required=False, default=500, type=int, - help="Size of the chunks when performing partitioning using chunks of genomes. " - "Chunk partitioning will be used automatically " - "if the number of genomes is above this number.") - optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, - help="Number of partitions to use. Must be at least 2. " - "If under 2, it will be detected automatically.") - optional.add_argument("-Kmm", "--krange", nargs=2, required=False, type=int, default=[3, 20], - help="Range of K values to test when detecting K automatically.") - optional.add_argument("-im", "--ICL_margin", required=False, type=float, default=0.05, - help="K is detected automatically by maximizing ICL. However at some point the ICL " - "reaches a plateau. Therefore we are looking for the minimal value of K without " - "significant gain from the larger values of K measured by ICL. For that we take the " - "lowest K that is found within a given 'margin' of the maximal ICL value. Basically, " - "change this option only if you truly understand it, otherwise just leave it be.") - optional.add_argument("--draw_ICL", required=False, default=False, action="store_true", - help="Use if you want to draw the ICL curve for all the tested K values. " - "Will not be done if K is given.") - optional.add_argument("--keep_tmp_files", required=False, default=False, action="store_true", - help="Use if you want to keep the temporary NEM files") - optional.add_argument("-se", "--seed", type=int, default=42, help="seed used to generate random numbers") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - - -if __name__ == '__main__': + optional.add_argument( + "-b", + "--beta", + required=False, + default=2.5, + type=float, + help="beta is the strength of the smoothing using the graph topology during partitioning. " + "0 will deactivate spatial smoothing.", + ) + optional.add_argument( + "-ms", + "--max_degree_smoothing", + required=False, + default=10, + type=float, + help="max. degree of the nodes to be included in the smoothing process.", + ) + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}" + ), + help="Output directory", + ) + optional.add_argument( + "-fd", + "--free_dispersion", + required=False, + default=False, + action="store_true", + help="use if the dispersion around the centroid vector of each partition during must be free." + " It will be the same for all genomes by default.", + ) + optional.add_argument( + "-ck", + "--chunk_size", + required=False, + default=500, + type=int, + help="Size of the chunks when performing partitioning using chunks of genomes. " + "Chunk partitioning will be used automatically " + "if the number of genomes is above this number.", + ) + optional.add_argument( + "-K", + "--nb_of_partitions", + required=False, + default=-1, + type=int, + help="Number of partitions to use. Must be at least 2. " + "If under 2, it will be detected automatically.", + ) + optional.add_argument( + "-Kmm", + "--krange", + nargs=2, + required=False, + type=int, + default=[3, 20], + help="Range of K values to test when detecting K automatically.", + ) + optional.add_argument( + "-im", + "--ICL_margin", + required=False, + type=float, + default=0.05, + help="K is detected automatically by maximizing ICL. However at some point the ICL " + "reaches a plateau. Therefore we are looking for the minimal value of K without " + "significant gain from the larger values of K measured by ICL. For that we take the " + "lowest K that is found within a given 'margin' of the maximal ICL value. Basically, " + "change this option only if you truly understand it, otherwise just leave it be.", + ) + optional.add_argument( + "--draw_ICL", + required=False, + default=False, + action="store_true", + help="Use if you want to draw the ICL curve for all the tested K values. " + "Will not be done if K is given.", + ) + optional.add_argument( + "--keep_tmp_files", + required=False, + default=False, + action="store_true", + help="Use if you want to keep the temporary NEM files", + ) + optional.add_argument( + "-se", + "--seed", + type=int, + default=42, + help="seed used to generate random numbers", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_partition(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index 04e2f638..d22c02df 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -33,9 +33,17 @@ samples = [] -def raref_nem(index: int, tmpdir: Path, beta: float = 2.5, sm_degree: int = 10, - free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, - krange: list = None, seed: int = 42) -> Tuple[Dict[str, int], int]: +def raref_nem( + index: int, + tmpdir: Path, + beta: float = 2.5, + sm_degree: int = 10, + free_dispersion: bool = False, + chunk_size: int = 500, + kval: int = -1, + krange: list = None, + seed: int = 42, +) -> Tuple[Dict[str, int], int]: """ :param index: Index of the sample group organisms @@ -56,22 +64,42 @@ def raref_nem(index: int, tmpdir: Path, beta: float = 2.5, sm_degree: int = 10, kmm = [3, 20] if krange is None else krange if kval < 3: - kval = ppp.evaluate_nb_partitions(organisms=samp, sm_degree=sm_degree, free_dispersion=free_dispersion, - chunk_size=chunk_size, krange=kmm, seed=seed, - tmpdir=tmpdir / f"{str(index)}_eval") + kval = ppp.evaluate_nb_partitions( + organisms=samp, + sm_degree=sm_degree, + free_dispersion=free_dispersion, + chunk_size=chunk_size, + krange=kmm, + seed=seed, + tmpdir=tmpdir / f"{str(index)}_eval", + ) if len(samp) <= chunk_size: # all good, just write stuff. - edges_weight, nb_fam = ppp.write_nem_input_files(tmpdir=currtmpdir, organisms=set(samp), - sm_degree=sm_degree) - cpt_partition = ppp.run_partitioning(currtmpdir, len(samp), beta * (nb_fam / edges_weight), free_dispersion, - kval=kval, seed=seed, init="param_file")[0] + edges_weight, nb_fam = ppp.write_nem_input_files( + tmpdir=currtmpdir, organisms=set(samp), sm_degree=sm_degree + ) + cpt_partition = ppp.run_partitioning( + currtmpdir, + len(samp), + beta * (nb_fam / edges_weight), + free_dispersion, + kval=kval, + seed=seed, + init="param_file", + )[0] else: # going to need multiple partitioning for this sample... families = set() cpt_partition = {} validated = set() cpt = 0 - def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]): + def validate_family( + result: Union[ + Tuple[dict, None, None], + Tuple[int, float, float], + Tuple[dict, dict, float], + ] + ): """ Validate partition assignation to families @@ -80,15 +108,19 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo for node, nem_class in result[0].items(): cpt_partition[node][nem_class[0]] += 1 sum_partitioning = sum(cpt_partition[node].values()) - if (sum_partitioning > len(samp) / chunk_size and max( - cpt_partition[node].values()) >= sum_partitioning * 0.5) or (sum_partitioning > len(samp)): + if ( + sum_partitioning > len(samp) / chunk_size + and max(cpt_partition[node].values()) >= sum_partitioning * 0.5 + ) or (sum_partitioning > len(samp)): if node not in validated: if max(cpt_partition[node].values()) < sum_partitioning * 0.5: cpt_partition[node]["U"] = len(samp) validated.add(node) for fam in ppp.pan.gene_families: - if not samp.isdisjoint(set(fam.organisms)): # otherwise, useless to keep track of + if not samp.isdisjoint( + set(fam.organisms) + ): # otherwise, useless to keep track of families.add(fam) cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0} @@ -113,14 +145,29 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo for samp in org_samples: if not currtmpdir.exists(): mk_outdir(currtmpdir) - edges_weight, nb_fam = ppp.write_nem_input_files(currtmpdir / f"{str(cpt)}", samp, - sm_degree=sm_degree) - validate_family(ppp.run_partitioning(currtmpdir / f"{str(cpt)}", len(samp), - beta * (nb_fam / edges_weight), free_dispersion, kval=kval, - seed=seed, init="param_file")) + edges_weight, nb_fam = ppp.write_nem_input_files( + currtmpdir / f"{str(cpt)}", samp, sm_degree=sm_degree + ) + validate_family( + ppp.run_partitioning( + currtmpdir / f"{str(cpt)}", + len(samp), + beta * (nb_fam / edges_weight), + free_dispersion, + kval=kval, + seed=seed, + init="param_file", + ) + ) cpt += 1 if len(cpt_partition) == 0: - counts = {"persistent": "NA", "shell": "NA", "cloud": "NA", "undefined": "NA", "K": kval} + counts = { + "persistent": "NA", + "shell": "NA", + "cloud": "NA", + "undefined": "NA", + "K": kval, + } else: counts = {"persistent": 0, "shell": 0, "cloud": 0, "undefined": 0, "K": kval} @@ -140,7 +187,9 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo return counts, index -def launch_raref_nem(args: Tuple[int, Path, float, int, bool, int, int, list, int]) -> Tuple[Tuple[Dict[str, int], int]]: +def launch_raref_nem( + args: Tuple[int, Path, float, int, bool, int, int, list, int] +) -> Tuple[Tuple[Dict[str, int], int]]: """ Launch raref_nem in multiprocessing @@ -160,132 +209,275 @@ def draw_curve(output: Path, data: list, max_sampling: int = 10): :param data: """ logging.getLogger("PPanGGOLiN").info("Drawing the rarefaction curve ...") - raref_name = output/"rarefaction.csv" + raref_name = output / "rarefaction.csv" raref = open(raref_name, "w") - raref.write(",".join(["genomes_count", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", - "soft_core", "soft_accessory", "pangenome", "K"]) + "\n") + raref.write( + ",".join( + [ + "genomes_count", + "persistent", + "shell", + "cloud", + "undefined", + "exact_core", + "exact_accessory", + "soft_core", + "soft_accessory", + "pangenome", + "K", + ] + ) + + "\n" + ) for part in data: - raref.write(",".join(map(str, [part["nborgs"], part["persistent"], part["shell"], part["cloud"], - part["undefined"], part["exact_core"], part["exact_accessory"], - part["soft_core"], part["soft_accessory"], part["exact_core"] + - part["exact_accessory"], part["K"]])) + "\n") + raref.write( + ",".join( + map( + str, + [ + part["nborgs"], + part["persistent"], + part["shell"], + part["cloud"], + part["undefined"], + part["exact_core"], + part["exact_accessory"], + part["soft_core"], + part["soft_accessory"], + part["exact_core"] + part["exact_accessory"], + part["K"], + ], + ) + ) + + "\n" + ) raref.close() def heap_law(n, p_kappa, p_gamma) -> float: - return p_kappa * n ** p_gamma + return p_kappa * n**p_gamma def poly_area(p_x: list, p_y: list) -> float: - return 0.5 * numpy.abs(numpy.dot(p_x, numpy.roll(p_y, 1)) - numpy.dot(p_y, numpy.roll(p_x, 1))) + return 0.5 * numpy.abs( + numpy.dot(p_x, numpy.roll(p_y, 1)) - numpy.dot(p_y, numpy.roll(p_x, 1)) + ) annotations = [] traces = [] data_raref = read_csv(raref_name, index_col=False) - params_file = open(output/"rarefaction_parameters.csv", "w") - params_file.write("partition,kappa,gamma,kappa_std_error,gamma_std_error,IQR_area\n") - for partition in ["persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", - "soft_accessory", "pangenome"]: - percentiles_75 = Series({i: numpy.nanpercentile(data_raref[data_raref["genomes_count"] == i][partition], 75) for i in - range(1, max_sampling + 1)}).dropna() - percentiles_25 = Series({i: numpy.nanpercentile(data_raref[data_raref["genomes_count"] == i][partition], 25) for i in - range(1, max_sampling + 1)}).dropna() - mins = Series({i: numpy.min(data_raref[data_raref["genomes_count"] == i][partition]) for i in - range(1, max_sampling + 1)}).dropna() - maxs = Series({i: numpy.max(data_raref[data_raref["genomes_count"] == i][partition]) for i in - range(1, max_sampling + 1)}).dropna() - medians = Series({i: numpy.median(data_raref[data_raref["genomes_count"] == i][partition]) for i in - range(1, max_sampling + 1)}).dropna() - means = Series({i: numpy.mean(data_raref[data_raref["genomes_count"] == i][partition]) for i in - range(1, max_sampling + 1)}).dropna() + params_file = open(output / "rarefaction_parameters.csv", "w") + params_file.write( + "partition,kappa,gamma,kappa_std_error,gamma_std_error,IQR_area\n" + ) + for partition in [ + "persistent", + "shell", + "cloud", + "undefined", + "exact_core", + "exact_accessory", + "soft_core", + "soft_accessory", + "pangenome", + ]: + percentiles_75 = Series( + { + i: numpy.nanpercentile( + data_raref[data_raref["genomes_count"] == i][partition], 75 + ) + for i in range(1, max_sampling + 1) + } + ).dropna() + percentiles_25 = Series( + { + i: numpy.nanpercentile( + data_raref[data_raref["genomes_count"] == i][partition], 25 + ) + for i in range(1, max_sampling + 1) + } + ).dropna() + mins = Series( + { + i: numpy.min(data_raref[data_raref["genomes_count"] == i][partition]) + for i in range(1, max_sampling + 1) + } + ).dropna() + maxs = Series( + { + i: numpy.max(data_raref[data_raref["genomes_count"] == i][partition]) + for i in range(1, max_sampling + 1) + } + ).dropna() + medians = Series( + { + i: numpy.median(data_raref[data_raref["genomes_count"] == i][partition]) + for i in range(1, max_sampling + 1) + } + ).dropna() + means = Series( + { + i: numpy.mean(data_raref[data_raref["genomes_count"] == i][partition]) + for i in range(1, max_sampling + 1) + } + ).dropna() initial_kappa_gamma = numpy.array([0.0, 0.0]) x = percentiles_25.index.tolist() x += list(reversed(percentiles_25.index.tolist())) area_iqr = poly_area(x, percentiles_25.tolist() + percentiles_75.tolist()) nb_org_min_fitting = 15 - colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", - "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", - "undefined": "#828282"} + colors = { + "pangenome": "black", + "exact_accessory": "#EB37ED", + "exact_core": "#FF2828", + "soft_core": "#c7c938", + "soft_accessory": "#996633", + "shell": "#00D860", + "persistent": "#F7A507", + "cloud": "#79DEFF", + "undefined": "#828282", + } try: - all_values = data_raref[data_raref["genomes_count"] > nb_org_min_fitting][partition].dropna() - res = optimization.curve_fit(heap_law, data_raref.loc[all_values.index]["genomes_count"], all_values, - initial_kappa_gamma) + all_values = data_raref[data_raref["genomes_count"] > nb_org_min_fitting][ + partition + ].dropna() + res = optimization.curve_fit( + heap_law, + data_raref.loc[all_values.index]["genomes_count"], + all_values, + initial_kappa_gamma, + ) kappa, gamma = res[0] - error_k, error_g = numpy.sqrt(numpy.diag(res[1])) # to calculate the fitting error. + error_k, error_g = numpy.sqrt( + numpy.diag(res[1]) + ) # to calculate the fitting error. # The variance of parameters are the diagonal elements of the variance-co variance matrix, # and the standard error is the square root of it. source : # https://stackoverflow.com/questions/25234996/getting-standard-error-associated-with-parameter-estimates-from-scipy-optimize-c if numpy.isinf(error_k) and numpy.isinf(error_g): - params_file.write(",".join([partition, "NA", "NA", "NA", "NA", str(area_iqr)]) + "\n") + params_file.write( + ",".join([partition, "NA", "NA", "NA", "NA", str(area_iqr)]) + "\n" + ) else: params_file.write( - ",".join([partition, str(kappa), str(gamma), str(error_k), str(error_g), str(area_iqr)]) + "\n") - regression = numpy.apply_along_axis(heap_law, 0, range(nb_org_min_fitting + 1, max_sampling + 1), kappa, - gamma) - regression_sd_top = numpy.apply_along_axis(heap_law, 0, range(nb_org_min_fitting + 1, max_sampling + 1), - kappa - error_k, gamma + error_g) - regression_sd_bottom = numpy.apply_along_axis(heap_law, 0, - range(nb_org_min_fitting + 1, max_sampling + 1), - kappa + error_k, gamma - error_g) - traces.append(go.Scatter(x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), - y=regression, - name=partition + ": Heaps' law", - line=dict(color=colors[partition], - width=4, - dash='dash'), - visible="legendonly" if partition == "undefined" else True)) - traces.append(go.Scatter(x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), - y=regression_sd_top, - name=partition + ": Heaps' law error +", - line=dict(color=colors[partition], - width=1, - dash='dash'), - visible="legendonly" if partition == "undefined" else True)) - traces.append(go.Scatter(x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), - y=regression_sd_bottom, - name=partition + ": Heaps' law error -", - line=dict(color=colors[partition], - width=1, - dash='dash'), - visible="legendonly" if partition == "undefined" else True)) - annotations.append(dict(x=max_sampling, - y=heap_law(max_sampling, kappa, gamma), - ay=0, - ax=50, - text="F=" + str(round(kappa, 0)) + "N" + "" + str( - round(gamma, 5)) + "
IQRarea=" + str(round(area_iqr, 2)), - showarrow=True, - arrowhead=7, - font=dict(size=10, color='white'), - align='center', - arrowcolor=colors[partition], - bordercolor='#c7c7c7', - borderwidth=2, - borderpad=4, - bgcolor=colors[partition], - opacity=0.8)) + ",".join( + [ + partition, + str(kappa), + str(gamma), + str(error_k), + str(error_g), + str(area_iqr), + ] + ) + + "\n" + ) + regression = numpy.apply_along_axis( + heap_law, + 0, + range(nb_org_min_fitting + 1, max_sampling + 1), + kappa, + gamma, + ) + regression_sd_top = numpy.apply_along_axis( + heap_law, + 0, + range(nb_org_min_fitting + 1, max_sampling + 1), + kappa - error_k, + gamma + error_g, + ) + regression_sd_bottom = numpy.apply_along_axis( + heap_law, + 0, + range(nb_org_min_fitting + 1, max_sampling + 1), + kappa + error_k, + gamma - error_g, + ) + traces.append( + go.Scatter( + x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), + y=regression, + name=partition + ": Heaps' law", + line=dict(color=colors[partition], width=4, dash="dash"), + visible="legendonly" if partition == "undefined" else True, + ) + ) + traces.append( + go.Scatter( + x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), + y=regression_sd_top, + name=partition + ": Heaps' law error +", + line=dict(color=colors[partition], width=1, dash="dash"), + visible="legendonly" if partition == "undefined" else True, + ) + ) + traces.append( + go.Scatter( + x=list(range(nb_org_min_fitting + 1, max_sampling + 1)), + y=regression_sd_bottom, + name=partition + ": Heaps' law error -", + line=dict(color=colors[partition], width=1, dash="dash"), + visible="legendonly" if partition == "undefined" else True, + ) + ) + annotations.append( + dict( + x=max_sampling, + y=heap_law(max_sampling, kappa, gamma), + ay=0, + ax=50, + text="F=" + + str(round(kappa, 0)) + + "N" + + "" + + str(round(gamma, 5)) + + "
IQRarea=" + + str(round(area_iqr, 2)), + showarrow=True, + arrowhead=7, + font=dict(size=10, color="white"), + align="center", + arrowcolor=colors[partition], + bordercolor="#c7c7c7", + borderwidth=2, + borderpad=4, + bgcolor=colors[partition], + opacity=0.8, + ) + ) except (TypeError, RuntimeError, ValueError): # if fitting doesn't work - params_file.write(",".join([partition, "NA", "NA", "NA", "NA", str(area_iqr)]) + "\n") - - traces.append(go.Scatter(x=medians.index, - y=medians, - name=partition + " : medians", - mode="lines+markers", - error_y=dict(type='data', - symmetric=False, - array=maxs.subtract(medians), - arrayminus=medians.subtract(mins), - visible=True, - color=colors[partition], - thickness=0.5), - line=dict(color=colors[partition], - width=1), - marker=dict(color=colors[partition], symbol=3, size=8, opacity=0.5), - visible="legendonly" if partition == "undefined" else True)) - traces.append(go.Scatter(x=means.index, - y=means, - name=partition + " : means", - mode="markers", - marker=dict(color=colors[partition], symbol=4, size=8, opacity=0.5), - visible="legendonly" if partition == "undefined" else True)) + params_file.write( + ",".join([partition, "NA", "NA", "NA", "NA", str(area_iqr)]) + "\n" + ) + + traces.append( + go.Scatter( + x=medians.index, + y=medians, + name=partition + " : medians", + mode="lines+markers", + error_y=dict( + type="data", + symmetric=False, + array=maxs.subtract(medians), + arrayminus=medians.subtract(mins), + visible=True, + color=colors[partition], + thickness=0.5, + ), + line=dict(color=colors[partition], width=1), + marker=dict(color=colors[partition], symbol=3, size=8, opacity=0.5), + visible="legendonly" if partition == "undefined" else True, + ) + ) + traces.append( + go.Scatter( + x=means.index, + y=means, + name=partition + " : means", + mode="markers", + marker=dict(color=colors[partition], symbol=4, size=8, opacity=0.5), + visible="legendonly" if partition == "undefined" else True, + ) + ) # up = percentiles_75 # down = percentiles_25 # IQR_area = up.append(down[::-1]) @@ -299,41 +491,67 @@ def poly_area(p_x: list, p_y: list) -> float: # line=dict(color=COLORS[partition]), # marker=dict(color = COLORS[partition]), # visible = "legendonly" if partition == "undefined" else True)) - traces.append(go.Scatter(x=percentiles_75.index, - y=percentiles_75, - name=partition + " : 3rd quartile", - mode="lines", - hoveron="points", - # hovertext=[str(round(e)) for e in half_stds.multiply(2)], - line=dict(color=colors[partition]), - marker=dict(color=colors[partition]), - visible="legendonly" if partition == "undefined" else True)) - traces.append(go.Scatter(x=percentiles_25.index, - y=percentiles_25, - name=partition + " : 1st quartile", - fill='tonexty', - mode="lines", - hoveron="points", - # hovertext=[str(round(e)) for e in half_stds.multiply(2)], - line=dict(color=colors[partition]), - marker=dict(color=colors[partition]), - visible="legendonly" if partition == "undefined" else True)) - layout = go.Layout(title="Rarefaction curve ", - titlefont=dict(size=20), - xaxis=dict(title='size of genome subsets (N)'), - yaxis=dict(title='# of gene families (F)'), - annotations=annotations, - plot_bgcolor='#ffffff') + traces.append( + go.Scatter( + x=percentiles_75.index, + y=percentiles_75, + name=partition + " : 3rd quartile", + mode="lines", + hoveron="points", + # hovertext=[str(round(e)) for e in half_stds.multiply(2)], + line=dict(color=colors[partition]), + marker=dict(color=colors[partition]), + visible="legendonly" if partition == "undefined" else True, + ) + ) + traces.append( + go.Scatter( + x=percentiles_25.index, + y=percentiles_25, + name=partition + " : 1st quartile", + fill="tonexty", + mode="lines", + hoveron="points", + # hovertext=[str(round(e)) for e in half_stds.multiply(2)], + line=dict(color=colors[partition]), + marker=dict(color=colors[partition]), + visible="legendonly" if partition == "undefined" else True, + ) + ) + layout = go.Layout( + title="Rarefaction curve ", + titlefont=dict(size=20), + xaxis=dict(title="size of genome subsets (N)"), + yaxis=dict(title="# of gene families (F)"), + annotations=annotations, + plot_bgcolor="#ffffff", + ) fig = go.Figure(data=traces, layout=layout) - out_plotly.plot(fig, filename=output.as_posix() + "/rarefaction_curve.html", auto_open=False) + out_plotly.plot( + fig, filename=output.as_posix() + "/rarefaction_curve.html", auto_open=False + ) params_file.close() -def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = None, beta: float = 2.5, depth: int = 30, - min_sampling: int = 1, max_sampling: int = 100, sm_degree: int = 10, - free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, - cpu: int = 1, seed: int = 42, kestimate: bool = False, soft_core: float = 0.95, - disable_bar: bool = False): +def make_rarefaction_curve( + pangenome: Pangenome, + output: Path, + tmpdir: Path = None, + beta: float = 2.5, + depth: int = 30, + min_sampling: int = 1, + max_sampling: int = 100, + sm_degree: int = 10, + free_dispersion: bool = False, + chunk_size: int = 500, + kval: int = -1, + krange: list = None, + cpu: int = 1, + seed: int = 42, + kestimate: bool = False, + soft_core: float = 0.95, + disable_bar: bool = False, +): """ Main function to make the rarefaction curve @@ -361,11 +579,25 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No ppp.pan = pangenome # use the global from partition to store the pangenome, so that it is usable try: - krange[0] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[0] < 0 else krange[0] - krange[1] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[1] < 0 else krange[1] + krange[0] = ( + ppp.pan.parameters["partition"]["# final nb of partitions"] + if krange[0] < 0 + else krange[0] + ) + krange[1] = ( + ppp.pan.parameters["partition"]["# final nb of partitions"] + if krange[1] < 0 + else krange[1] + ) except KeyError: krange = [3, 20] - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + need_graph=True, + disable_bar=disable_bar, + ) tmpdir_obj = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(tmpdir_obj.name) @@ -378,27 +610,48 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No if kval < 3 and kestimate is False: # estimate K once and for all. try: kval = ppp.pan.parameters["partition"]["# final nb of partitions"] - logging.getLogger("PPanGGOLiN").info(f"Reuse the number of partitions {kval}") + logging.getLogger("PPanGGOLiN").info( + f"Reuse the number of partitions {kval}" + ) except KeyError: - logging.getLogger("PPanGGOLiN").info("Estimating the number of partitions...") - kval = ppp.evaluate_nb_partitions(organisms=set(pangenome.organisms), sm_degree=sm_degree, - free_dispersion=free_dispersion, chunk_size=chunk_size, krange=krange, - cpu=cpu, seed=seed, tmpdir=tmp_path) - logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}") + logging.getLogger("PPanGGOLiN").info( + "Estimating the number of partitions..." + ) + kval = ppp.evaluate_nb_partitions( + organisms=set(pangenome.organisms), + sm_degree=sm_degree, + free_dispersion=free_dispersion, + chunk_size=chunk_size, + krange=krange, + cpu=cpu, + seed=seed, + tmpdir=tmp_path, + ) + logging.getLogger("PPanGGOLiN").info( + f"The number of partitions has been evaluated at {kval}" + ) logging.getLogger("PPanGGOLiN").info("Extracting samples ...") all_samples = [] for i in range(min_sampling, max_sampling): # each point for _ in range(depth): # number of samples per points all_samples.append(set(random.sample(list(pangenome.organisms), i + 1))) - logging.getLogger("PPanGGOLiN").info(f"Done sampling genomes in the pan, there are {len(all_samples)} samples") + logging.getLogger("PPanGGOLiN").info( + f"Done sampling genomes in the pan, there are {len(all_samples)} samples" + ) samp_nb_per_part = [] logging.getLogger("PPanGGOLiN").info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() - logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats " - f"for {len(all_samples)} samples...") - bar = tqdm(range(len(all_samples) * pangenome.number_of_gene_families), unit="gene family", disable=disable_bar) + logging.getLogger("PPanGGOLiN").info( + "Done computing bitarrays. Comparing them to get exact and soft core stats " + f"for {len(all_samples)} samples..." + ) + bar = tqdm( + range(len(all_samples) * pangenome.number_of_gene_families), + unit="gene family", + disable=disable_bar, + ) for samp in all_samples: # make the sample's organism bitarray. samp_bitarray = gmpy2.xmpz() # pylint: disable=no-member @@ -411,7 +664,9 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No part["exact_accessory"] = 0 part["soft_accessory"] = 0 for fam in pangenome.gene_families: - nb_common_org = gmpy2.popcount(fam.bitarray & samp_bitarray) # pylint: disable=no-member + nb_common_org = gmpy2.popcount( + fam.bitarray & samp_bitarray + ) # pylint: disable=no-member part["nborgs"] = len(samp) if nb_common_org != 0: # in that case the node 'does not exist' if nb_common_org == len(samp): @@ -433,13 +688,27 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No args = [] for index, samp in enumerate(samples): - args.append((index, tmp_path, beta, sm_degree, free_dispersion, chunk_size, kval, krange, seed)) - - with get_context('fork').Pool(processes=cpu) as p: + args.append( + ( + index, + tmp_path, + beta, + sm_degree, + free_dispersion, + chunk_size, + kval, + krange, + seed, + ) + ) + + with get_context("fork").Pool(processes=cpu) as p: # launch partitioning logging.getLogger("PPanGGOLiN").info(" Partitioning all samples...") bar = tqdm(range(len(args)), unit="samples partitioned", disable=disable_bar) - random.shuffle(args) # shuffling the processing so that the progress bar is closer to reality. + random.shuffle( + args + ) # shuffling the processing so that the progress bar is closer to reality. for result in p.imap_unordered(launch_raref_nem, args): samp_nb_per_part[result[1]] = {**result[0], **samp_nb_per_part[result[1]]} bar.update() @@ -462,12 +731,25 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - make_rarefaction_curve(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, beta=args.beta, - depth=args.depth, min_sampling=args.min, max_sampling=args.max, - sm_degree=args.max_degree_smoothing, free_dispersion=args.free_dispersion, - chunk_size=args.chunk_size, kval=args.nb_of_partitions, krange=args.krange, cpu=args.cpu, - seed=args.seed, kestimate=args.reestimate_K, soft_core=args.soft_core, - disable_bar=args.disable_prog_bar) + make_rarefaction_curve( + pangenome=pangenome, + output=args.output, + tmpdir=args.tmpdir, + beta=args.beta, + depth=args.depth, + min_sampling=args.min, + max_sampling=args.max, + sm_degree=args.max_degree_smoothing, + free_dispersion=args.free_dispersion, + chunk_size=args.chunk_size, + kval=args.nb_of_partitions, + krange=args.krange, + cpu=args.cpu, + seed=args.seed, + kestimate=args.reestimate_K, + soft_core=args.soft_core, + disable_bar=args.disable_prog_bar, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -478,8 +760,11 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - parser = sub_parser.add_parser("rarefaction", description='Compute the rarefaction curve of the pangenome', - formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "rarefaction", + description="Compute the rarefaction curve of the pangenome", + formatter_class=argparse.RawTextHelpFormatter, + ) parser_rarefaction(parser) return parser @@ -490,59 +775,151 @@ def parser_rarefaction(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + required = parser.add_argument_group( + title="Required arguments", + description="One of the following arguments is required :", + ) + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome .h5 file" + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("-b", "--beta", required=False, default=2.5, type=float, - help="beta is the strength of the smoothing using the graph topology during partitioning. " - "0 will deactivate spatial smoothing.") - optional.add_argument("--depth", required=False, default=30, type=int, - help="Number of samplings at each sampling point") - optional.add_argument("--min", required=False, default=1, type=int, help="Minimum number of organisms in a sample") - optional.add_argument("--max", required=False, type=float, default=100, - help="Maximum number of organisms in a sample (if above the number of provided organisms, " - "the provided organisms will be the maximum)") - - optional.add_argument("-ms", "--max_degree_smoothing", required=False, default=10, type=float, - help="max. degree of the nodes to be included in the smoothing process.") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path( - f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" - f"_PID{str(os.getpid())}"), - help="Output directory") - optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", - help="use if the dispersion around the centroid vector of each partition during must be free." - " It will be the same for all organisms by default.") - optional.add_argument("-ck", "--chunk_size", required=False, default=500, type=int, - help="Size of the chunks when performing partitioning using chunks of organisms. " - "Chunk partitioning will be used automatically " - "if the number of genomes is above this number.") - optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, - help="Number of partitions to use. Must be at least 2. " - "By default reuse K if it exists else compute it.") - optional.add_argument("--reestimate_K", required=False, action="store_true", - help=" Will recompute the number of partitions for each sample " - "(between the values provided by --krange) (VERY intensive. Can take a long time.)") - optional.add_argument("-Kmm", "--krange", nargs=2, required=False, type=int, default=[3, -1], - help="Range of K values to test when detecting K automatically. " - "Default between 3 and the K previously computed " - "if there is one, or 20 if there are none.") - optional.add_argument("--soft_core", required=False, type=float, default=0.95, help="Soft core threshold") - optional.add_argument("-se", "--seed", type=int, default=42, help="seed used to generate random numbers") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - - -if __name__ == '__main__': + optional.add_argument( + "-b", + "--beta", + required=False, + default=2.5, + type=float, + help="beta is the strength of the smoothing using the graph topology during partitioning. " + "0 will deactivate spatial smoothing.", + ) + optional.add_argument( + "--depth", + required=False, + default=30, + type=int, + help="Number of samplings at each sampling point", + ) + optional.add_argument( + "--min", + required=False, + default=1, + type=int, + help="Minimum number of organisms in a sample", + ) + optional.add_argument( + "--max", + required=False, + type=float, + default=100, + help="Maximum number of organisms in a sample (if above the number of provided organisms, " + "the provided organisms will be the maximum)", + ) + + optional.add_argument( + "-ms", + "--max_degree_smoothing", + required=False, + default=10, + type=float, + help="max. degree of the nodes to be included in the smoothing process.", + ) + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}" + ), + help="Output directory", + ) + optional.add_argument( + "-fd", + "--free_dispersion", + required=False, + default=False, + action="store_true", + help="use if the dispersion around the centroid vector of each partition during must be free." + " It will be the same for all organisms by default.", + ) + optional.add_argument( + "-ck", + "--chunk_size", + required=False, + default=500, + type=int, + help="Size of the chunks when performing partitioning using chunks of organisms. " + "Chunk partitioning will be used automatically " + "if the number of genomes is above this number.", + ) + optional.add_argument( + "-K", + "--nb_of_partitions", + required=False, + default=-1, + type=int, + help="Number of partitions to use. Must be at least 2. " + "By default reuse K if it exists else compute it.", + ) + optional.add_argument( + "--reestimate_K", + required=False, + action="store_true", + help=" Will recompute the number of partitions for each sample " + "(between the values provided by --krange) (VERY intensive. Can take a long time.)", + ) + optional.add_argument( + "-Kmm", + "--krange", + nargs=2, + required=False, + type=int, + default=[3, -1], + help="Range of K values to test when detecting K automatically. " + "Default between 3 and the K previously computed " + "if there is one, or 20 if there are none.", + ) + optional.add_argument( + "--soft_core", + required=False, + type=float, + default=0.95, + help="Soft core threshold", + ) + optional.add_argument( + "-se", + "--seed", + type=int, + default=42, + help="seed used to generate random numbers", + ) + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_rarefaction(main_parser) add_common_arguments(main_parser) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index a9b5d1de..c33ac94a 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -25,8 +25,7 @@ class Pangenome: """ def __init__(self): - """Constructor method. - """ + """Constructor method.""" self.file = None # basic parameters @@ -40,34 +39,38 @@ def __init__(self): self._spot_getter = {} self._module_getter = {} self.status = { - 'genomesAnnotated': "No", - 'geneSequences': "No", - 'genesClustered': "No", - 'defragmented': "No", - 'geneFamilySequences': "No", - 'neighborsGraph': "No", - 'partitioned': "No", - 'predictedRGP': "No", - 'spots': "No", - 'modules': 'No', - "metadata": {"families": 'No', - "genes": 'No', - "contigs": 'No', - "genomes": 'No', - "RGPs": 'No', - "spots": 'No', - "modules": 'No'}, - "metasources": {"families": [], - "genes": [], - "contigs": [], - "genomes": [], - "RGPs": [], - "spots": [], - "modules": []} + "genomesAnnotated": "No", + "geneSequences": "No", + "genesClustered": "No", + "defragmented": "No", + "geneFamilySequences": "No", + "neighborsGraph": "No", + "partitioned": "No", + "predictedRGP": "No", + "spots": "No", + "modules": "No", + "metadata": { + "families": "No", + "genes": "No", + "contigs": "No", + "genomes": "No", + "RGPs": "No", + "spots": "No", + "modules": "No", + }, + "metasources": { + "families": [], + "genes": [], + "contigs": [], + "genomes": [], + "RGPs": [], + "spots": [], + "modules": [], + }, } self.parameters = {} - def add_file(self, pangenome_file: Path, check_version:bool=True): + def add_file(self, pangenome_file: Path, check_version: bool = True): """ Links an HDF5 file to the pangenome. If needed elements will be loaded from this file, and anything that is computed will be saved to this file when @@ -77,9 +80,12 @@ def add_file(self, pangenome_file: Path, check_version:bool=True): :param check_version: Check ppanggolin version of the pangenome file to be compatible with the current version of ppaggolin being used. :raises AssertionError: If the `pangenome_file` is not an instance of the Path class """ - assert isinstance(pangenome_file, Path), "pangenome file should be a Path object type" + assert isinstance( + pangenome_file, Path + ), "pangenome file should be a Path object type" from ppanggolin.formats.readBinaries import get_status from ppanggolin.utils import check_version_compatibility + # importing on call instead of importing on top to avoid cross-reference problems. if not tables.is_hdf5_file(pangenome_file): raise TypeError("Pangenome file should be an HDF5 file type") @@ -90,13 +96,16 @@ def add_file(self, pangenome_file: Path, check_version:bool=True): self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" + @property def genes(self) -> Generator[Gene, None, None]: """Generator of genes in the pangenome. - + :return: gene generator """ - if self.number_of_organisms > 0: # if we have organisms, they're supposed to have genes + if ( + self.number_of_organisms > 0 + ): # if we have organisms, they're supposed to have genes for org in self.organisms: for contig in org.contigs: for gene in contig.genes: @@ -107,7 +116,9 @@ def genes(self) -> Generator[Gene, None, None]: for gene in gene_fam.genes: yield gene else: - logging.getLogger("PPanGGOLiN").warning("There is no gene in your pangenome") + logging.getLogger("PPanGGOLiN").warning( + "There is no gene in your pangenome" + ) def _mk_gene_getter(self): """ @@ -132,14 +143,18 @@ def get_gene(self, gene_id: str) -> Gene: :raises AssertionError: If the `gene_id` is not a string :raises KeyError: If the `gene_id` is not in the pangenome """ - assert isinstance(gene_id, str), f"The provided gene id ({gene_id}) should be a string and not a {type(gene_id)}" + assert isinstance( + gene_id, str + ), f"The provided gene id ({gene_id}) should be a string and not a {type(gene_id)}" try: gene = self._gene_getter[gene_id] except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it - return self.get_gene(gene_id) # Return what was expected. If geneID does not exist it will raise an error. + return self.get_gene( + gene_id + ) # Return what was expected. If geneID does not exist it will raise an error. except KeyError: raise KeyError(f"{gene_id} does not exist in the pangenome.") else: @@ -159,8 +174,8 @@ def number_of_genes(self) -> int: else: return nb_genes - """RNAs methods""" + @property def RNAs(self) -> Generator[Gene, None, None]: """Generator of genes in the pangenome. @@ -180,10 +195,10 @@ def number_of_rnas(self) -> int: return sum(ctg.number_of_rnas for ctg in self.contigs) """Gene families methods""" + @property def max_fam_id(self): - """Get the last family identifier - """ + """Get the last family identifier""" return self._max_fam_id @max_fam_id.setter @@ -197,7 +212,7 @@ def max_fam_id(self, value): @property def gene_families(self) -> Generator[GeneFamily, None, None]: """Returns all the gene families in the pangenome - + :return: Generator of gene families """ yield from self._fam_getter.values() @@ -233,7 +248,7 @@ def get_gene_family(self, name: str) -> GeneFamily: def add_gene_family(self, family: GeneFamily): """ Adds the given gene family to the pangenome. If a family with the same name already exists, raises a KeyError. - + :param family: The gene family to add to the pangenome :raises KeyError: If a family with the same name already exists :raises Exception: For any unexpected exceptions @@ -245,16 +260,20 @@ def add_gene_family(self, family: GeneFamily): self._fam_getter[family.name] = family self.max_fam_id += 1 except Exception as error: - raise Exception(f"An unexpected error occurred when adding family {family} to pangenome: {str(error)}") from error + raise Exception( + f"An unexpected error occurred when adding family {family} to pangenome: {str(error)}" + ) from error else: - raise KeyError(f"Cannot add family {family.name}: A family with the same name already exists.") - + raise KeyError( + f"Cannot add family {family.name}: A family with the same name already exists." + ) """Graph methods""" + @property def edges(self) -> Generator[Edge, None, None]: """Returns all the edges in the pangenome graph - + :return: Generator of edge """ yield from self._edge_getter.values() @@ -271,13 +290,17 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: :raises AssertionError: Genes object are expected :raises AttributeError: Genes are not associated to any families """ - assert isinstance(gene1, Gene) and isinstance(gene2, Gene), "Gene object are expected" + assert isinstance(gene1, Gene) and isinstance( + gene2, Gene + ), "Gene object are expected" try: family_1, family_2 = gene1.family, gene2.family except AttributeError: - raise AttributeError("Genes are not linked to families. Check that you compute the gene families and post an" - " issue on our GitHub") - key = frozenset([family_1, family_2 ]) + raise AttributeError( + "Genes are not linked to families. Check that you compute the gene families and post an" + " issue on our GitHub" + ) + key = frozenset([family_1, family_2]) edge = self._edge_getter.get(key) if edge is None: edge = Edge(gene1, gene2) @@ -295,10 +318,11 @@ def number_of_edges(self) -> int: return len(self._edge_getter) """Organism methods""" + @property def organisms(self) -> Generator[Organism, None, None]: """Returns all the organisms in the pangenome - + :return: Generator :class:`ppanggolin.genome.Organism` """ yield from self._org_getter.values() @@ -306,7 +330,7 @@ def organisms(self) -> Generator[Organism, None, None]: @property def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome - + :return: The number of organism """ return len(self._org_getter) @@ -334,16 +358,20 @@ def _mk_contig_getter(self, check_name: bool = False, name: str = ""): The assumption behind this is that the pangenome has been filled and no more contig will be added. """ if (check_name and name == "") or (not check_name and name != ""): - raise AssertionError('if you search the identifier corresponding to the name, ' - 'check_name must be True and name different than empty string.') + raise AssertionError( + "if you search the identifier corresponding to the name, " + "check_name must be True and name different than empty string." + ) names = set() identifier = None self._contig_getter = {} for contig in self.contigs: if check_name: if contig.name in names: - raise KeyError("Two contigs with the same name. " - "You should use the contig ID or give the genome name") + raise KeyError( + "Two contigs with the same name. " + "You should use the contig ID or give the genome name" + ) names.add(contig.name) if contig.name == name: identifier = contig.ID @@ -352,7 +380,9 @@ def _mk_contig_getter(self, check_name: bool = False, name: str = ""): def _get_contig_by_identifier(self, identifier: int = None) -> Contig: if identifier is None: - raise Exception("Unexpected error happened. Please report an issue to our GitHub.") + raise Exception( + "Unexpected error happened. Please report an issue to our GitHub." + ) else: if not isinstance(identifier, int): raise AssertionError("Contig ID should be an integer") @@ -361,13 +391,19 @@ def _get_contig_by_identifier(self, identifier: int = None) -> Contig: except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_contig_getter() # make it - return self.get_contig(identifier=identifier) # Return what was expected. If geneID does not exist it will raise an error. + return self.get_contig( + identifier=identifier + ) # Return what was expected. If geneID does not exist it will raise an error. except KeyError: - raise KeyError(f"Contig: {identifier}, does not exist in the pangenome.") + raise KeyError( + f"Contig: {identifier}, does not exist in the pangenome." + ) else: return contig - def get_contig(self, identifier: int = None, name: str = None, organism_name: str = None) -> Contig: + def get_contig( + self, identifier: int = None, name: str = None, organism_name: str = None + ) -> Contig: """Returns the contig by his identifier or by his name. If name is given the organism name is needed :param identifier: ID of the contig to look for @@ -379,8 +415,9 @@ def get_contig(self, identifier: int = None, name: str = None, organism_name: st :raises AssertionError: If the `contig_id` is not an integer :raises KeyError: If the `contig` is not in the pangenome """ - assert not all(x is None for x in [identifier, name, organism_name]), ("You must provide either contig_id or " - "name or genome_name") + assert not all(x is None for x in [identifier, name, organism_name]), ( + "You must provide either contig_id or " "name or genome_name" + ) if name: if not isinstance(name, str): raise AssertionError("Contig name should be a string") @@ -427,16 +464,22 @@ def add_organism(self, organism: Organism): :raise AssertionError: If the organism name is not a string :raises KeyError: if the provided organism is already in pangenome """ - assert isinstance(organism, Organism), "An organism object is expected to be add to pangenome" + assert isinstance( + organism, Organism + ), "An organism object is expected to be add to pangenome" try: self.get_organism(organism.name) except KeyError: self._org_getter[organism.name] = organism else: - raise KeyError(f"Redondant genome name was found ({organism.name})." - f"All of your genomes must have unique names.") - - def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if it exists already + raise KeyError( + f"Redondant genome name was found ({organism.name})." + f"All of your genomes must have unique names." + ) + + def get_org_index( + self, + ) -> Dict[Organism, int]: # will not make a new index if it exists already """Creates an index for Organisms (each organism is assigned an Integer). :return: The index of organisms in pangenome @@ -447,7 +490,7 @@ def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if self._org_index[org] = index return self._org_index - def compute_family_bitarrays(self, part: str = 'all') -> Dict[Organism, int]: + def compute_family_bitarrays(self, part: str = "all") -> Dict[Organism, int]: """ Based on the index generated by get_org_index, generate a bitarray for each gene family. If the family j is present in the organism with the index i, the bit at position i will be 1. If it is not, @@ -466,7 +509,9 @@ def compute_family_bitarrays(self, part: str = 'all') -> Dict[Organism, int]: # case where there is an index but the bitarrays have not been computed??? return self._org_index - def get_fam_index(self) -> Dict[GeneFamily, int]: # will not make a new index if it exists already + def get_fam_index( + self, + ) -> Dict[GeneFamily, int]: # will not make a new index if it exists already """Creates an index for gene families (each family is assigned an Integer). :return: The index of families in pangenome @@ -477,7 +522,7 @@ def get_fam_index(self) -> Dict[GeneFamily, int]: # will not make a new index i self._fam_index[fam] = index return self._fam_index - def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: + def compute_org_bitarrays(self, part="all") -> Dict[GeneFamily, int]: """ Based on the index generated by get_fam_index, generate a bitarray for each gene family. If the family j is present in the organism with the index i, the bit at position i will be 1. If it is not, @@ -497,6 +542,7 @@ def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: return self._fam_index """RGP methods""" + @property def regions(self) -> Generator[Region, None, None]: """returns all the regions (RGP) in the pangenome @@ -524,7 +570,9 @@ def get_region(self, name: str) -> Region: else: return rgp - def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[GeneFamily]: + def get_multigenics( + self, dup_margin: float, persistent: bool = True + ) -> Set[GeneFamily]: """ Returns the multigenic persistent families of the pangenome graph. A family will be considered multigenic if it is duplicated in more than `dup_margin` of the genomes where it is present. @@ -540,14 +588,22 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen multigenics = set() for fam in self.gene_families: if fam.named_partition == "persistent" or not persistent: - dup = len([genes for org, genes in fam.get_org_dict().items() if - len([gene for gene in genes if not gene.is_fragment]) > 1]) - if (dup / fam.number_of_organisms) >= dup_margin: # tot / nborgs >= 1.05 + dup = len( + [ + genes + for org, genes in fam.get_org_dict().items() + if len([gene for gene in genes if not gene.is_fragment]) > 1 + ] + ) + if ( + dup / fam.number_of_organisms + ) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) return multigenics - - def get_single_copy_persistent_families(self, dup_margin: float, exclude_fragments: bool) -> Set[GeneFamily]: + def get_single_copy_persistent_families( + self, dup_margin: float, exclude_fragments: bool + ) -> Set[GeneFamily]: """ Retrieves gene families that are both persistent and single copy based on the provided criteria. @@ -561,12 +617,13 @@ def get_single_copy_persistent_families(self, dup_margin: float, exclude_fragmen # Iterate through gene families and check for persistence and single copy status for fam in self.gene_families: - if fam.named_partition == "persistent" and fam.is_single_copy(dup_margin, exclude_fragments): + if fam.named_partition == "persistent" and fam.is_single_copy( + dup_margin, exclude_fragments + ): single_copy_fams.add(fam) return single_copy_fams - def add_region(self, region: Region): """Add a region to the pangenome @@ -582,7 +639,9 @@ def add_region(self, region: Region): except KeyError: self._region_getter[region.name] = region else: - raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") + raise KeyError( + f"A RGP with this name ({region.name} already exist in pangenome" + ) @property def number_of_rgp(self) -> int: @@ -593,6 +652,7 @@ def number_of_rgp(self) -> int: return len(self._region_getter) """Spot methods""" + @property def spots(self) -> Generator[Spot, None, None]: """Generate spots in the pangenome @@ -619,8 +679,10 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: if result: spot_id = int(result.group(1)) else: - raise ValueError(f"The provided spot ID '{spot_id}' does not have the expected format." - "It should be an integer or in the format 'spot_'.") + raise ValueError( + f"The provided spot ID '{spot_id}' does not have the expected format." + "It should be an integer or in the format 'spot_'." + ) try: spot = self._spot_getter[spot_id] except KeyError: @@ -655,10 +717,10 @@ def number_of_spots(self) -> int: return len(self._spot_getter) """Modules methods""" + @property def modules(self) -> Generator[Module, None, None]: - """Generate modules in the pangenome - """ + """Generate modules in the pangenome""" yield from self._module_getter.values() def get_module(self, module_id: Union[int, str]) -> Module: @@ -681,8 +743,10 @@ def get_module(self, module_id: Union[int, str]) -> Module: if result: module_id = int(result.group(1)) else: - raise ValueError(f"The provided module ID '{module_id}' does not have the expected format." - "It should be an integer or in the format 'module_'.") + raise ValueError( + f"The provided module ID '{module_id}' does not have the expected format." + "It should be an integer or in the format 'module_'." + ) try: module = self._module_getter[module_id] @@ -709,7 +773,7 @@ def add_module(self, module: Module): else: raise KeyError("Module already exist") - def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: + def compute_mod_bitarrays(self, part: str = "all") -> Dict[GeneFamily, int]: """Based on the index generated by get_fam_index, generated a bitarray for each gene family present in modules. If the family j is present in the module with the index i, the bit at position i will be 1. If it is not, @@ -740,7 +804,7 @@ def soft_core_families(self, soft_core_threshold: float) -> Set[GeneFamily]: """ Retrieves gene families considered part of the soft core based on the provided threshold. - :param soft_core_threshold: The threshold to determine the minimum fraction of organisms + :param soft_core_threshold: The threshold to determine the minimum fraction of organisms required for a gene family to be considered part of the soft core. :return: A set containing gene families identified as part of the soft core. """ @@ -768,12 +832,12 @@ def exact_core_families(self) -> Set[GeneFamily]: return exact_core_families """Metadata""" + def has_metadata(self) -> bool: """ Whether or not the pangenome has metadata associated with any of its elements. """ - return any( status != "No" for status in self.status['metadata'].values()) - + return any(status != "No" for status in self.status["metadata"].values()) def select_elem(self, metatype: str): """Get all the element for the given metatype @@ -829,8 +893,9 @@ def metadata(self, metatype: str) -> Generator[Metadata, None, None]: for elem in self.select_elem(metatype): yield elem.metadata - def get_elem_by_metadata(self, metatype: str, **kwargs - ) -> Generator[Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: + def get_elem_by_metadata( + self, metatype: str, **kwargs + ) -> Generator[Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: """Get element in pangenome with metadata attribute expected :param metatype: Select to which pangenome element metadata @@ -842,16 +907,27 @@ def get_elem_by_metadata(self, metatype: str, **kwargs if len(list(elem.get_metadata_by_attribute(**kwargs))) > 0: yield elem - def get_elem_by_source(self, source: str, metatype: str - ) -> Generator[Union[GeneFamily, Gene, Contig, Organism, Region, Spot, Module], None, None]: - """ Get gene families with a specific source in pangenome + def get_elem_by_source( + self, source: str, metatype: str + ) -> Generator[ + Union[GeneFamily, Gene, Contig, Organism, Region, Spot, Module], None, None + ]: + """Get gene families with a specific source in pangenome :param source: Name of the source :param metatype: select to which pangenome element metadata should be written :return: Gene families with the source """ - assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] + assert metatype in [ + "families", + "genomes", + "contigs", + "genes", + "RGPs", + "spots", + "modules", + ] for elem in self.select_elem(metatype): if elem.has_source(source): yield elem diff --git a/ppanggolin/projection/__init__.py b/ppanggolin/projection/__init__.py index 56bb37d6..f477784e 100644 --- a/ppanggolin/projection/__init__.py +++ b/ppanggolin/projection/__init__.py @@ -1 +1 @@ -from .projection import subparser, launch \ No newline at end of file +from .projection import subparser, launch diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 91e4db75..b0a36c5f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -20,38 +20,69 @@ import yaml # # local libraries -from ppanggolin.annotate.synta import read_fasta, get_dna_sequence -from ppanggolin.annotate.annotate import init_contig_counter, read_anno_file, annotate_organism, \ - local_identifiers_are_unique +from ppanggolin.annotate.synta import get_contigs_from_fasta_file, get_dna_sequence +from ppanggolin.annotate.annotate import ( + init_contig_counter, + read_anno_file, + annotate_organism, + local_identifiers_are_unique, +) from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, \ - restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, \ - check_input_files, parse_input_paths_file -from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep, \ - get_input_seq_to_family_with_all, project_and_write_partition +from ppanggolin.utils import ( + detect_filetype, + create_tmpdir, + read_compressed_or_not, + write_compressed_or_not, + restricted_float, + mk_outdir, + get_config_args, + parse_config_file, + get_default_args, + check_input_files, + parse_input_paths_file, +) +from ppanggolin.align.alignOnPang import ( + write_gene_to_gene_family, + get_input_seq_to_family_with_rep, + get_input_seq_to_family_with_all, + project_and_write_partition, +) from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp -from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph +from ppanggolin.RGP.spot import ( + make_spot_graph, + check_sim, + add_new_node_in_spot_graph, + write_spot_graph, +) from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, \ - write_tsv_genome_file -from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, \ - write_rgp_table +from ppanggolin.formats.writeFlatGenomes import ( + write_proksee_organism, + manage_module_colors, + write_gff_file, + write_tsv_genome_file, +) +from ppanggolin.formats.writeFlatPangenome import ( + summarize_spots, + summarize_genome, + write_summaries_in_tsv, + write_rgp_table, +) from ppanggolin.formats.writeSequences import read_genome_file class NewSpot(Spot): """ - This class represent a hotspot specifically + This class represent a hotspot specifically created for the projected genome. """ def __str__(self): - return f'new_spot_{str(self.ID)}' + return f"new_spot_{str(self.ID)}" def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool): @@ -78,42 +109,69 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool): project_spots = True if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: - raise NameError("The provided pangenome has not been partitioned. " - "Annotation of an external genome is therefore not possible. " - "See the 'partition' subcommands.") + raise NameError( + "The provided pangenome has not been partitioned. " + "Annotation of an external genome is therefore not possible. " + "See the 'partition' subcommands." + ) if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger('PPanGGOLiN').info("RGPs have not been predicted in the provided pangenome. " - "Projection of RGPs and spots into the provided " - "genome will not be performed.") + logging.getLogger("PPanGGOLiN").info( + "RGPs have not been predicted in the provided pangenome. " + "Projection of RGPs and spots into the provided " + "genome will not be performed." + ) predict_rgp = False project_spots = False elif pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger('PPanGGOLiN').info("Spots have not been predicted in the provided pangenome. " - "Projection of spots into the provided genome will not be performed.") + logging.getLogger("PPanGGOLiN").info( + "Spots have not been predicted in the provided pangenome. " + "Projection of spots into the provided genome will not be performed." + ) project_spots = False if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger('PPanGGOLiN').info("Modules have not been predicted in the provided pangenome. " - "Projection of modules into the provided genome will not be performed.") + logging.getLogger("PPanGGOLiN").info( + "Modules have not been predicted in the provided pangenome. " + "Projection of modules into the provided genome will not be performed." + ) project_modules = False - if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln: - raise Exception("The provided pangenome has no gene sequences. " - "Projection is still possible with the --fast option to use representative " - "sequences rather than all genes to annotate input genes.") + if ( + pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] + and not fast_aln + ): + raise Exception( + "The provided pangenome has no gene sequences. " + "Projection is still possible with the --fast option to use representative " + "sequences rather than all genes to annotate input genes." + ) if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]: - raise Exception("The provided pangenome has no gene families sequences. " - "This is not possible to annotate an input genome to this pangenome.") + raise Exception( + "The provided pangenome has no gene families sequences. " + "This is not possible to annotate an input genome to this pangenome." + ) return predict_rgp, project_spots, project_modules + def manage_input_genomes_annotation( - pangenome, input_mode: str, anno: str, fasta: str, organism_name: str, circular_contigs: list, - pangenome_params, cpu: int, use_pseudo: bool, disable_bar: bool, tmpdir: str, config: dict): + pangenome, + input_mode: str, + anno: str, + fasta: str, + organism_name: str, + circular_contigs: list, + pangenome_params, + cpu: int, + use_pseudo: bool, + disable_bar: bool, + tmpdir: str, + config: dict, +): """ Manage the input genomes annotation based on the provided mode and parameters. @@ -148,30 +206,28 @@ def manage_input_genomes_annotation( if anno: input_type = "annotation" genome_name_to_path = { - organism_name: { - "path": anno, - "circular_contigs": circular_contigs - } + organism_name: {"path": anno, "circular_contigs": circular_contigs} } elif fasta: input_type = "fasta" genome_name_to_path = { - organism_name: { - "path": fasta, - "circular_contigs": circular_contigs - } + organism_name: {"path": fasta, "circular_contigs": circular_contigs} } else: - raise ValueError(f"Input mode '{input_mode}' is not valid. Expected 'multiple' or 'single'.") + raise ValueError( + f"Input mode '{input_mode}' is not valid. Expected 'multiple' or 'single'." + ) # Process annotation input type if input_type == "annotation": check_input_names(pangenome, genome_name_to_path) organisms, org_2_has_fasta = read_annotation_files( - genome_name_to_path, cpu=cpu, pseudo=use_pseudo, + genome_name_to_path, + cpu=cpu, + pseudo=use_pseudo, translation_table=int(pangenome_params.cluster.translation_table), - disable_bar=disable_bar + disable_bar=disable_bar, ) # Check for genomes without associated sequence data @@ -186,10 +242,12 @@ def manage_input_genomes_annotation( genome_name_to_fasta_path = { organism_name: { "path": fasta, - "circular_contigs": circular_contigs + "circular_contigs": circular_contigs, } } - get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path) + get_gene_sequences_from_fasta_files( + organisms_with_no_fasta, genome_name_to_fasta_path + ) else: raise ValueError( f"GFF files provided for {len(organisms_with_no_fasta)} (out of {len(organisms)}) genomes without " @@ -200,34 +258,58 @@ def manage_input_genomes_annotation( # Process fasta input type elif input_type == "fasta": - annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] - annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config) + annotate_param_names = [ + "norna", + "kingdom", + "allow_overlap", + "prodigal_procedure", + ] + annotate_params = manage_annotate_param( + annotate_param_names, pangenome_params.annotate, config + ) check_input_names(pangenome, genome_name_to_path) organisms = annotate_fasta_files( - genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, + genome_name_to_fasta_path=genome_name_to_path, + tmpdir=tmpdir, + cpu=cpu, translation_table=int(pangenome_params.cluster.translation_table), - norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, - disable_bar=disable_bar + norna=annotate_params.norna, + kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, + procedure=annotate_params.prodigal_procedure, + disable_bar=disable_bar, ) else: - raise ValueError(f"Input type '{input_type}' is not valid. Expected 'fasta' or 'annotation'.") + raise ValueError( + f"Input type '{input_type}' is not valid. Expected 'fasta' or 'annotation'." + ) return organisms, genome_name_to_path, input_type -def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], - input_org_2_rgps: Dict[Organism, Set[Region]], - input_org_to_spots: Dict[Organism, Set[Spot]], - input_orgs_to_modules: Dict[Organism, Set[Module]], - input_org_to_lonely_genes_count: Dict[Organism, int], - write_proksee: bool, write_gff: bool, write_table: bool, - add_sequences: bool, - genome_name_to_path: Dict[str, dict], input_type: str, - output_dir: Path, dup_margin: float, soft_core: float, - metadata_sep: str, compress: bool, - need_regions: bool, need_spots: bool, need_modules: bool): +def write_projection_results( + pangenome: Pangenome, + organisms: Set[Organism], + input_org_2_rgps: Dict[Organism, Set[Region]], + input_org_to_spots: Dict[Organism, Set[Spot]], + input_orgs_to_modules: Dict[Organism, Set[Module]], + input_org_to_lonely_genes_count: Dict[Organism, int], + write_proksee: bool, + write_gff: bool, + write_table: bool, + add_sequences: bool, + genome_name_to_path: Dict[str, dict], + input_type: str, + output_dir: Path, + dup_margin: float, + soft_core: float, + metadata_sep: str, + compress: bool, + need_regions: bool, + need_spots: bool, + need_modules: bool, +): """ Write the results of the projection of pangneome onto input genomes. @@ -246,7 +328,7 @@ def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], :param output_dir: The directory where the output files will be written. :param dup_margin: The duplication margin used to compute completeness. :param soft_core: Soft core threshold - + Note: - If `write_proksee` is True and input organisms have modules, module colors for ProkSee are obtained. @@ -264,9 +346,14 @@ def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], # dup margin value here is specified in argument and is used to compute completeness. # That means it can be different than dup margin used in spot and RGPS. - pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin=dup_margin, - exclude_fragments=True) - pangenome_persistent_count = len([fam for fam in pangenome.gene_families if fam.named_partition == "persistent"]) + pangenome_persistent_single_copy_families = ( + pangenome.get_single_copy_persistent_families( + dup_margin=dup_margin, exclude_fragments=True + ) + ) + pangenome_persistent_count = len( + [fam for fam in pangenome.gene_families if fam.named_partition == "persistent"] + ) soft_core_families = pangenome.soft_core_families(soft_core) exact_core_families = pangenome.exact_core_families() @@ -278,70 +365,102 @@ def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], # summarize projection for all input organisms singleton_gene_count = input_org_to_lonely_genes_count[organism] - org_summary = summarize_projected_genome(organism, - pangenome_persistent_count, - pangenome_persistent_single_copy_families, - soft_core_families=soft_core_families, - exact_core_families=exact_core_families, - input_org_rgps=input_org_2_rgps.get(organism, None), - input_org_spots=input_org_to_spots.get(organism, None), - input_org_modules=input_orgs_to_modules.get(organism, None), - pangenome_file=pangenome.file, - singleton_gene_count=singleton_gene_count) + org_summary = summarize_projected_genome( + organism, + pangenome_persistent_count, + pangenome_persistent_single_copy_families, + soft_core_families=soft_core_families, + exact_core_families=exact_core_families, + input_org_rgps=input_org_2_rgps.get(organism, None), + input_org_spots=input_org_to_spots.get(organism, None), + input_org_modules=input_orgs_to_modules.get(organism, None), + pangenome_file=pangenome.file, + singleton_gene_count=singleton_gene_count, + ) summaries.append(org_summary) yaml_outputfile = output_dir / organism.name / "projection_summary.yaml" write_summary_in_yaml(org_summary, yaml_outputfile) if (write_proksee or write_gff) and add_sequences: - genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism) - genome_name_to_path[organism.name]['path'] + genome_sequences = read_genome_file( + genome_name_to_path[organism.name]["path"], organism + ) + genome_name_to_path[organism.name]["path"] else: genome_sequences = None if write_proksee: - org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in - input_orgs_to_modules.get(organism, [])} + org_module_to_color = { + org_mod: module_to_colors[org_mod] + for org_mod in input_orgs_to_modules.get(organism, []) + } output_file = output_dir / organism.name / f"{organism.name}_proksee.json" - write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, - genome_sequences=genome_sequences, compress=compress) + write_proksee_organism( + organism, + output_file, + features="all", + module_to_colors=org_module_to_color, + genome_sequences=genome_sequences, + compress=compress, + ) if write_gff: - if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN - annotation_sources = {"rRNA": "external", - "tRNA": "external", - "CDS": "external"} + if ( + input_type == "annotation" + ): # if the genome has not been annotated by PPanGGOLiN + annotation_sources = { + "rRNA": "external", + "tRNA": "external", + "CDS": "external", + } else: annotation_sources = {} - write_gff_file(organism, output_dir / organism.name, - annotation_sources=annotation_sources, - genome_sequences=genome_sequences, metadata_sep=metadata_sep, - compress=compress) + write_gff_file( + organism, + output_dir / organism.name, + annotation_sources=annotation_sources, + genome_sequences=genome_sequences, + metadata_sep=metadata_sep, + compress=compress, + ) if write_table: - write_tsv_genome_file(organism, output_dir / organism.name, compress=compress, metadata_sep=metadata_sep, - need_regions=need_regions, need_spots=need_spots, need_modules=need_modules) + write_tsv_genome_file( + organism, + output_dir / organism.name, + compress=compress, + metadata_sep=metadata_sep, + need_regions=need_regions, + need_spots=need_spots, + need_modules=need_modules, + ) output_file = output_dir / "summary_projection.tsv" - write_summaries_in_tsv(summaries, - output_file=output_file, - dup_margin=dup_margin, - soft_core=soft_core, compress=compress) - - -def summarize_projected_genome(organism: Organism, - pangenome_persistent_count: int, - pangenome_persistent_single_copy_families: Set[GeneFamily], - soft_core_families: Set[GeneFamily], - exact_core_families: Set[GeneFamily], - input_org_rgps: List[Region], - input_org_spots: List[Spot], - input_org_modules: List[Module], - pangenome_file: str, - singleton_gene_count: int) -> Dict[str, any]: + write_summaries_in_tsv( + summaries, + output_file=output_file, + dup_margin=dup_margin, + soft_core=soft_core, + compress=compress, + ) + + +def summarize_projected_genome( + organism: Organism, + pangenome_persistent_count: int, + pangenome_persistent_single_copy_families: Set[GeneFamily], + soft_core_families: Set[GeneFamily], + exact_core_families: Set[GeneFamily], + input_org_rgps: List[Region], + input_org_spots: List[Spot], + input_org_modules: List[Module], + pangenome_file: str, + singleton_gene_count: int, +) -> Dict[str, any]: """ Summarizes the projected genome and generates an organism summary. @@ -371,35 +490,46 @@ def summarize_projected_genome(organism: Organism, exact_core_families=exact_core_families, rgp_count=rgp_count, spot_count=spot_count, - module_count=module_count + module_count=module_count, ) # Add specific values for the projected genome organism_summary["Pangenome_file"] = pangenome_file - cloud_without_specific_fams = organism_summary["Cloud"]["families"] - singleton_gene_count + cloud_without_specific_fams = ( + organism_summary["Cloud"]["families"] - singleton_gene_count + ) organism_summary["Cloud"]["families"] = cloud_without_specific_fams organism_summary["Cloud"]["specific families"] = singleton_gene_count input_org_spots = input_org_spots - new_spot_count = "Not computed" if input_org_spots is None else sum( - 1 for spot in input_org_spots if isinstance(spot, NewSpot)) + new_spot_count = ( + "Not computed" + if input_org_spots is None + else sum(1 for spot in input_org_spots if isinstance(spot, NewSpot)) + ) organism_summary["New_spots"] = new_spot_count return organism_summary -def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, - translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, - procedure: str = None, - disable_bar: bool = False): +def annotate_fasta_files( + genome_name_to_fasta_path: Dict[str, dict], + tmpdir: str, + cpu: int = 1, + translation_table: int = 11, + kingdom: str = "bacteria", + norna: bool = False, + allow_overlap: bool = False, + procedure: str = None, + disable_bar: bool = False, +): """ Main function to annotate a pangenome :param genome_name_to_fasta_path: :param fasta_list: List of fasta file containing sequences that will be base of pangenome :param tmpdir: Path to temporary directory - :param cpu: number of CPU cores to use + :param cpu: number of CPU cores to use :param translation_table: Translation table (genetic code) to use. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. @@ -411,13 +541,30 @@ def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str organisms = [] arguments = [] # Argument given to annotate organism in same order than prototype for org_name, org_info in genome_name_to_fasta_path.items(): - arguments.append((org_name, org_info['path'], org_info['circular_contigs'], tmpdir, translation_table, - norna, kingdom, allow_overlap, procedure)) + arguments.append( + ( + org_name, + org_info["path"], + org_info["circular_contigs"], + tmpdir, + translation_table, + norna, + kingdom, + allow_overlap, + procedure, + ) + ) - logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") - contig_counter = Value('i', 0) - with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, - initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + logging.getLogger("PPanGGOLiN").info( + f"Annotating {len(arguments)} genomes using {cpu} cpus..." + ) + contig_counter = Value("i", 0) + with ProcessPoolExecutor( + mp_context=get_context("fork"), + max_workers=cpu, + initializer=init_contig_counter, + initargs=(contig_counter,), + ) as executor: with tqdm(total=len(arguments), unit="file", disable=disable_bar) as progress: futures = [] @@ -431,8 +578,13 @@ def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str return organisms -def read_annotation_files(genome_name_to_annot_path: Dict[str, dict], cpu: int = 1, pseudo: bool = False, translation_table: int = 11, - disable_bar: bool = False) -> Tuple[List[Organism], Dict[Organism, bool]]: +def read_annotation_files( + genome_name_to_annot_path: Dict[str, dict], + cpu: int = 1, + pseudo: bool = False, + translation_table: int = 11, + disable_bar: bool = False, +) -> Tuple[List[Organism], Dict[Organism, bool]]: """ Read the annotation from GBFF file @@ -451,12 +603,24 @@ def read_annotation_files(genome_name_to_annot_path: Dict[str, dict], cpu: int = # unless a gff file without fasta is met (which is the only case where sequences can be absent) org_to_has_fasta_flag = {} - args = [(org_name, org_info['path'], org_info['circular_contigs'], pseudo, translation_table) - for org_name, org_info in genome_name_to_annot_path.items()] - - contig_counter = Value('i', 0) - with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, - initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + args = [ + ( + org_name, + org_info["path"], + org_info["circular_contigs"], + pseudo, + translation_table, + ) + for org_name, org_info in genome_name_to_annot_path.items() + ] + + contig_counter = Value("i", 0) + with ProcessPoolExecutor( + mp_context=get_context("fork"), + max_workers=cpu, + initializer=init_contig_counter, + initargs=(contig_counter,), + ) as executor: with tqdm(total=len(args), unit="file", disable=disable_bar) as progress: futures = [] @@ -474,14 +638,22 @@ def read_annotation_files(genome_name_to_annot_path: Dict[str, dict], cpu: int = if local_identifiers_are_unique(genes): for gene in genes: - gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers - gene.local_identifier = "" # this is now useless, setting it to default value - - logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were unique, " - "PPanGGOLiN will use them.") + gene.ID = ( + gene.local_identifier + ) # Erase ppanggolin generated gene ids and replace with local identifiers + gene.local_identifier = ( + "" # this is now useless, setting it to default value + ) + + logging.getLogger("PPanGGOLiN").info( + "Gene identifiers used in the provided annotation files were unique, " + "PPanGGOLiN will use them." + ) else: - logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were not unique, " - "PPanGGOLiN will use self-generated identifiers.") + logging.getLogger("PPanGGOLiN").info( + "Gene identifiers used in the provided annotation files were not unique, " + "PPanGGOLiN will use self-generated identifiers." + ) return organisms, org_to_has_fasta_flag @@ -497,24 +669,30 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): if org_names & set(genome_name_to_annot_path) != org_names: missing = len(org_names - set(genome_name_to_annot_path)) - raise ValueError(f"You did not provided fasta for all the genomes found in annotation file. " - f"{missing} are missing (out of {len(organisms)}). Missing genomes: {','.join(missing)}") + raise ValueError( + f"You did not provided fasta for all the genomes found in annotation file. " + f"{missing} are missing (out of {len(organisms)}). Missing genomes: {','.join(missing)}" + ) for org in organisms: - org_fasta_file = genome_name_to_annot_path[org.name]['path'] + org_fasta_file = genome_name_to_annot_path[org.name]["path"] with read_compressed_or_not(org_fasta_file) as currFastaFile: - org_contig_to_seq = read_fasta(org, currFastaFile) + org_contig_to_seq = get_contigs_from_fasta_file(org, currFastaFile) for contig in org.contigs: try: contig_seq = org_contig_to_seq[contig.name] except KeyError: - msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \ - f"that was read from the annotation file. " - msg += f"The provided contigs in the fasta were : " \ - f"{', '.join(org_contig_to_seq)}." + msg = ( + f"Fasta file for genome {org.name} did not have the contig {contig.name} " + f"that was read from the annotation file. " + ) + msg += ( + f"The provided contigs in the fasta were : " + f"{', '.join(org_contig_to_seq)}." + ) raise KeyError(msg) for gene in contig.genes: @@ -535,7 +713,8 @@ def check_input_names(pangenome, input_names): duplicated_names = set(input_names) & {org.name for org in pangenome.organisms} if len(duplicated_names) != 0: raise NameError( - f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}") + f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}" + ) def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): @@ -549,19 +728,29 @@ def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): :param output_file: The file where the summary will be written. """ - yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4) + yaml_string = yaml.dump( + summary_info, default_flow_style=False, sort_keys=False, indent=4 + ) - with open(output_file, 'w') as flout: - flout.write('Projection_summary:') + with open(output_file, "w") as flout: + flout.write("Projection_summary:") flout.write(yaml_string) -def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iterable[Organism], - output: Path, - cpu: int, use_representatives: bool, no_defrag: bool, - identity: float, coverage: float, tmpdir: Path, - translation_table: int, keep_tmp: bool = False, - disable_bar: bool = False): +def annotate_input_genes_with_pangenome_families( + pangenome: Pangenome, + input_organisms: Iterable[Organism], + output: Path, + cpu: int, + use_representatives: bool, + no_defrag: bool, + identity: float, + coverage: float, + tmpdir: Path, + translation_table: int, + keep_tmp: bool = False, + disable_bar: bool = False, +): """ Annotate input genes with pangenome gene families by associating them to a cluster. @@ -580,38 +769,59 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') + logging.getLogger("PPanGGOLiN").info("Writing gene sequences of input genomes.") input_genes = [gene for org in input_organisms for gene in org.genes] - seq_fasta_file = output / 'input_genes.fasta' + seq_fasta_file = output / "input_genes.fasta" - write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_') + write_gene_sequences_from_annotations( + input_genes, seq_fasta_file, disable_bar=True, add="ppanggolin_" + ) - with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir: + with create_tmpdir( + main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp + ) as new_tmpdir: if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, - sequence_files=seq_fasta_file, output=new_tmpdir, - tmpdir=new_tmpdir, input_type="nucleotide", - is_input_slf=True, cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table) + _, seqid_to_gene_family = get_input_seq_to_family_with_rep( + pangenome=pangenome, + sequence_files=seq_fasta_file, + output=new_tmpdir, + tmpdir=new_tmpdir, + input_type="nucleotide", + is_input_slf=True, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) else: - _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, - sequence_files=seq_fasta_file, - output=new_tmpdir, tmpdir=new_tmpdir, - input_type="nucleotide", is_input_slf=True, - cpu=cpu, no_defrag=no_defrag, identity=identity, - coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) + _, seqid_to_gene_family = get_input_seq_to_family_with_all( + pangenome=pangenome, + sequence_files=seq_fasta_file, + output=new_tmpdir, + tmpdir=new_tmpdir, + input_type="nucleotide", + is_input_slf=True, + cpu=cpu, + no_defrag=no_defrag, + identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar, + ) input_org_to_lonely_genes_count = {} for input_organism in input_organisms: org_outdir = output / input_organism.name mk_outdir(org_outdir, force=True) - seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} + seq_set = { + gene.ID if gene.local_identifier == "" else gene.local_identifier + for gene in input_organism.genes + } project_and_write_partition(seqid_to_gene_family, seq_set, org_outdir) @@ -635,10 +845,11 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org else: # gene id already exists. new_name = f"{input_organism.name}_{gene_id}" - logging.getLogger('PPanGGOLiN').warning( - 'The input genome as a specific gene that does not align to any ' - f'pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. ' - f'The genome name is added to the family name: {new_name}') + logging.getLogger("PPanGGOLiN").warning( + "The input genome as a specific gene that does not align to any " + f"pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. " + f"The genome name is added to the family name: {new_name}" + ) gene_family = GeneFamily(pangenome.max_fam_id, new_name) pangenome.add_gene_family(gene_family) @@ -648,10 +859,11 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org if gene_family.contains_gene_id(gene_id): new_name = f"{input_organism.name}_{gene_id}" - logging.getLogger('PPanGGOLiN').warning( + logging.getLogger("PPanGGOLiN").warning( "The input genome contains a gene that aligns to a pangenome family " f"which already contains a gene with the same ID ({gene_id}). " - f"The genome name has been appended to the family name: {new_name}") + f"The genome name has been appended to the family name: {new_name}" + ) gene.ID = new_name @@ -660,22 +872,37 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org pangenome._mk_gene_getter() # re-build the gene getter - logging.getLogger('PPanGGOLiN').info( + logging.getLogger("PPanGGOLiN").info( f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} " - "specific genes that do not align to any gene of the pangenome.") + "specific genes that do not align to any gene of the pangenome." + ) # Write specific gene ids in a file with open(org_outdir / "specific_genes.tsv", "w") as fl: - fl.write('\n'.join( - gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in lonely_genes) + '\n') + fl.write( + "\n".join( + gene.ID if gene.local_identifier == "" else gene.local_identifier + for gene in lonely_genes + ) + + "\n" + ) input_org_to_lonely_genes_count[input_organism] = len(lonely_genes) return input_org_to_lonely_genes_count -def predict_RGP(pangenome: Pangenome, input_organisms: List[Organism], persistent_penalty: int, variable_gain: int, - min_length: int, min_score: int, multigenics: Set[GeneFamily], - output_dir: Path, disable_bar: bool, compress: bool) -> Dict[Organism, Set[Region]]: +def predict_RGP( + pangenome: Pangenome, + input_organisms: List[Organism], + persistent_penalty: int, + variable_gain: int, + min_length: int, + min_score: int, + multigenics: Set[GeneFamily], + output_dir: Path, + disable_bar: bool, + compress: bool, +) -> Dict[Organism, Set[Region]]: """ Compute Regions of Genomic Plasticity (RGP) for the given input organisms. @@ -693,20 +920,30 @@ def predict_RGP(pangenome: Pangenome, input_organisms: List[Organism], persisten :return: Dictionary mapping organism with the set of predicted regions """ - logging.getLogger('PPanGGOLiN').info("Computing Regions of Genomic Plasticity...") + logging.getLogger("PPanGGOLiN").info("Computing Regions of Genomic Plasticity...") name_scheme = naming_scheme(chain(pangenome.organisms, input_organisms)) organism_to_rgps = {} for input_organism in input_organisms: - rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, - min_score, naming=name_scheme, disable_bar=disable_bar) + rgps = compute_org_rgp( + input_organism, + multigenics, + persistent_penalty, + variable_gain, + min_length, + min_score, + naming=name_scheme, + disable_bar=disable_bar, + ) # turn on projected attribute in rgp objects # useful when associating spot to prevent failure when multiple spot are associated to a projected RGP for rgp in rgps: rgp.projected = True - logging.getLogger('PPanGGOLiN').info(f"{len(rgps)} RGPs have been predicted in the input genomes.") + logging.getLogger("PPanGGOLiN").info( + f"{len(rgps)} RGPs have been predicted in the input genomes." + ) org_outdir = output_dir / input_organism.name @@ -716,7 +953,12 @@ def predict_RGP(pangenome: Pangenome, input_organisms: List[Organism], persisten return organism_to_rgps -def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, filename: str, compress: bool = False): +def write_rgp_to_spot_table( + rgp_to_spots: Dict[Region, Set[str]], + output: Path, + filename: str, + compress: bool = False, +): """ Write a table mapping RGPs to corresponding spot IDs. @@ -726,21 +968,21 @@ def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, :param compress: Whether to compress the file. """ fname = output / filename - logging.getLogger('PPanGGOLiN').debug( - f'Writing RGPs to spot table in {fname}') + logging.getLogger("PPanGGOLiN").debug(f"Writing RGPs to spot table in {fname}") with write_compressed_or_not(fname, compress) as tab: fieldnames = ["region", "spot_id"] - writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') + writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter="\t") writer.writeheader() - regions = sorted(rgp_to_spots.keys(), key=lambda x: ( - x.organism.name, x.contig.name, x.ID)) + regions = sorted( + rgp_to_spots.keys(), key=lambda x: (x.organism.name, x.contig.name, x.ID) + ) for region in regions: row = { "region": region.name, - "spot_id": ';'.join(map(str, rgp_to_spots[region])) + "spot_id": ";".join(map(str, rgp_to_spots[region])), } writer.writerow(row) @@ -755,26 +997,32 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ with read_compressed_or_not(fasta_file) as currFastaFile: - contig_id2seq = read_fasta(input_organism, currFastaFile) + contig_id2seq = get_contigs_from_fasta_file(input_organism, currFastaFile) for contig in input_organism.contigs: try: for gene in contig.genes: - gene.add_dna(get_dna_sequence( - contig_id2seq[contig.name], gene)) + gene.add_dna(get_dna_sequence(contig_id2seq[contig.name], gene)) for rna in contig.RNAs: rna.add_dna(get_dna_sequence(contig_id2seq[contig.name], rna)) except KeyError: - msg = f"Fasta file for input genome {input_organism.name} did not have the contig {contig.name} " \ - f"that was read from the annotation file. " - msg += f"The provided contigs in the fasta were : " \ - f"{', '.join(contig_id2seq.keys())}." + msg = ( + f"Fasta file for input genome {input_organism.name} did not have the contig {contig.name} " + f"that was read from the annotation file. " + ) + msg += ( + f"The provided contigs in the fasta were : " + f"{', '.join(contig_id2seq.keys())}." + ) raise KeyError(msg) -def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argparse.Namespace, - config_file: Optional[str]) -> argparse.Namespace: +def manage_annotate_param( + annotate_param_names: List[str], + pangenome_args: argparse.Namespace, + config_file: Optional[str], +) -> argparse.Namespace: """ Manage annotate parameters by collecting them from different sources and merging them. @@ -785,14 +1033,20 @@ def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argpa :return: An argparse.Namespace containing the merged annotate parameters with their values. """ - default_annotate_args = get_default_args('annotate', annotate_subparser) + default_annotate_args = get_default_args("annotate", annotate_subparser) if config_file is None: config_annotate_args = argparse.Namespace() else: config = defaultdict(dict, parse_config_file(config_file)) config_annotate_args = get_config_args( - 'annotate', annotate_subparser, config, "annotate", annotate_param_names, strict_config_check=False) + "annotate", + annotate_subparser, + config, + "annotate", + annotate_param_names, + strict_config_check=False, + ) annotate_param_from_pangenome = {} annotate_param_from_config = {} @@ -822,25 +1076,31 @@ def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argpa # Log the sources of the annotate parameters if len(annotate_param_from_pangenome) > 0: - param_val_string = ' '.join( - [f'--{k} {v}' for k, v in annotate_param_from_pangenome.items()]) + param_val_string = " ".join( + [f"--{k} {v}" for k, v in annotate_param_from_pangenome.items()] + ) logging.getLogger("PPanGGOLiN").debug( f"{len(annotate_param_from_pangenome)}/{len(annotate_param_names)} annotate parameters extracted from pangenome parameters " - f"(the parameters used to build the input pangenome): {param_val_string}") + f"(the parameters used to build the input pangenome): {param_val_string}" + ) if len(annotate_param_from_config) > 0: - param_val_string = ';'.join( - [f' {k} : {v}' for k, v in annotate_param_from_config.items()]) + param_val_string = ";".join( + [f" {k} : {v}" for k, v in annotate_param_from_config.items()] + ) logging.getLogger("PPanGGOLiN").debug( f"{len(annotate_param_from_config)}/{len(annotate_param_names)} annotate parameters were not found in pangenome internal parameters." - f" They have been parsed from the annotate section in the config file: {param_val_string}") + f" They have been parsed from the annotate section in the config file: {param_val_string}" + ) if len(annotate_param_from_default) > 0: - param_val_string = ';'.join( - [f' {k} : {v}' for k, v in annotate_param_from_default.items()]) + param_val_string = ";".join( + [f" {k} : {v}" for k, v in annotate_param_from_default.items()] + ) logging.getLogger("PPanGGOLiN").debug( f"{len(annotate_param_from_default)}/{len(annotate_param_names)} annotate parameters were not found in the pangenome parameters " - f"nor in the config file. Default values have been used: {param_val_string}") + f"nor in the config file. Default values have been used: {param_val_string}" + ) return annotate_params @@ -864,8 +1124,9 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: # check that region in cc are the regions of a spot spot_in_cc = {rgp_to_spot[rgp] for rgp in regions_in_cc} - assert len( - spot_in_cc) == 1, "More than one spot in a connected_components. Something went wrong when recomputing spots." + assert ( + len(spot_in_cc) == 1 + ), "More than one spot in a connected_components. Something went wrong when recomputing spots." current_spot = spot_in_cc.pop() # Add spot id to the graph for node in cc: @@ -874,17 +1135,18 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: def predict_spots_in_input_organisms( - initial_spots: List[Spot], - initial_regions: List[Region], - input_org_2_rgps: Dict[Organism, Set[Region]], - multigenics: Set[GeneFamily], - output: Path, - write_graph_flag: bool = False, - graph_formats: List[str] = ['gexf'], - overlapping_match: int = 2, - set_size: int = 3, - exact_match: int = 1, - compress: bool = False) -> Dict[Organism, Set[Spot]]: + initial_spots: List[Spot], + initial_regions: List[Region], + input_org_2_rgps: Dict[Organism, Set[Region]], + multigenics: Set[GeneFamily], + output: Path, + write_graph_flag: bool = False, + graph_formats: List[str] = ["gexf"], + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, + compress: bool = False, +) -> Dict[Organism, Set[Spot]]: """ Create a spot graph from pangenome RGP and predict spots for input organism RGPs. @@ -904,22 +1166,31 @@ def predict_spots_in_input_organisms( """ logging.getLogger("PPanGGOLiN").debug("Rebuilding original spot graph.") - graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, - overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) + graph_spot = make_spot_graph( + rgps=initial_regions, + multigenics=multigenics, + overlapping_match=overlapping_match, + set_size=set_size, + exact_match=exact_match, + ) original_nodes = set(graph_spot.nodes) # Check congruency with already computed spot and add spot id in node attributes check_spots_congruency(graph_spot, initial_spots) - new_spot_id_counter = max(s.ID for s in initial_spots) + 1 if len(initial_spots) != 0 else 1 + new_spot_id_counter = ( + max(s.ID for s in initial_spots) + 1 if len(initial_spots) != 0 else 1 + ) input_org_to_spots = {} for input_organism, rgps in input_org_2_rgps.items(): if len(rgps) == 0: - logging.getLogger('PPanGGOLiN').debug(f"{input_organism.name}: No RGPs have been found. " - "As a result, spot prediction and RGP output will be skipped.") + logging.getLogger("PPanGGOLiN").debug( + f"{input_organism.name}: No RGPs have been found. " + "As a result, spot prediction and RGP output will be skipped." + ) input_org_to_spots[input_organism] = set() continue @@ -928,14 +1199,21 @@ def predict_spots_in_input_organisms( # Copy the graph spot, as each input organism are processed independently graph_spot_cp = graph_spot.copy() - input_org_spots = predict_spot_in_one_organism(graph_spot_cp, input_org_rgps=rgps, - original_nodes=original_nodes, - new_spot_id_counter=new_spot_id_counter, multigenics=multigenics, - organism_name=input_organism.name, - output=outdir_org, write_graph_flag=write_graph_flag, - graph_formats=graph_formats, - overlapping_match=overlapping_match, set_size=set_size, - exact_match=exact_match, compress=compress) + input_org_spots = predict_spot_in_one_organism( + graph_spot_cp, + input_org_rgps=rgps, + original_nodes=original_nodes, + new_spot_id_counter=new_spot_id_counter, + multigenics=multigenics, + organism_name=input_organism.name, + output=outdir_org, + write_graph_flag=write_graph_flag, + graph_formats=graph_formats, + overlapping_match=overlapping_match, + set_size=set_size, + exact_match=exact_match, + compress=compress, + ) if len(input_org_spots) > 0: new_spot_id_counter = max(s.ID for s in input_org_spots) + 1 @@ -946,19 +1224,20 @@ def predict_spots_in_input_organisms( def predict_spot_in_one_organism( - graph_spot: nx.Graph, - input_org_rgps: List[Region], - original_nodes: Set[int], - new_spot_id_counter: int, - multigenics: Set[GeneFamily], - organism_name: str, - output: Path, - write_graph_flag: bool = False, - graph_formats: List[str] = ['gexf'], - overlapping_match: int = 2, - set_size: int = 3, - exact_match: int = 1, - compress: bool = False) -> Set[Spot]: + graph_spot: nx.Graph, + input_org_rgps: List[Region], + original_nodes: Set[int], + new_spot_id_counter: int, + multigenics: Set[GeneFamily], + organism_name: str, + output: Path, + write_graph_flag: bool = False, + graph_formats: List[str] = ["gexf"], + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1, + compress: bool = False, +) -> Set[Spot]: """ Predict spots for input organism RGPs. @@ -999,7 +1278,8 @@ def predict_spot_in_one_organism( f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). " - "Projection of spots stops here") + "Projection of spots stops here" + ) return set() # remove node that were already in the graph @@ -1007,10 +1287,12 @@ def predict_spot_in_one_organism( logging.getLogger("PPanGGOLiN").debug( f"{organism_name}: {lost} RGPs were not used as they are on a contig border (or have" - f"less than {set_size} persistent gene families until the contig border)") + f"less than {set_size} persistent gene families until the contig border)" + ) logging.getLogger("PPanGGOLiN").debug( - f"{organism_name}: {used} RGPs of the input genome will be associated to a spot of insertion") + f"{organism_name}: {used} RGPs of the input genome will be associated to a spot of insertion" + ) # add potential edges from new nodes to the rest of the nodes all_nodes = list(graph_spot.nodes) @@ -1020,9 +1302,13 @@ def predict_spot_in_one_organism( continue node_obj_i = graph_spot.nodes[nodei] node_obj_j = graph_spot.nodes[nodej] - if check_sim([node_obj_i["border0"], node_obj_i["border1"]], - [node_obj_j["border0"], node_obj_j["border1"]], - overlapping_match, set_size, exact_match): + if check_sim( + [node_obj_i["border0"], node_obj_i["border1"]], + [node_obj_j["border0"], node_obj_j["border1"]], + overlapping_match, + set_size, + exact_match, + ): graph_spot.add_edge(nodei, nodej) input_rgp_to_spots = {} @@ -1049,8 +1335,10 @@ def predict_spot_in_one_organism( elif len(spots_of_the_cc) > 1: # more than one spot in the cc - logging.getLogger("PPanGGOLiN").debug(f'{organism_name}: Some RGPs of the input genome ' - f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") + logging.getLogger("PPanGGOLiN").debug( + f"{organism_name}: Some RGPs of the input genome " + f"are connected to {len(spots_of_the_cc)} original spots of the pangenome." + ) input_rgps_of_the_cc = set() for node in comp: @@ -1060,8 +1348,9 @@ def predict_spot_in_one_organism( if write_graph_flag: graph_spot.nodes[node]["spots"] = spots_of_the_cc - graph_spot.nodes[node]["spot_id"] = ';'.join( - str(spot) for spot in spots_of_the_cc) + graph_spot.nodes[node]["spot_id"] = ";".join( + str(spot) for spot in spots_of_the_cc + ) graph_spot.nodes[node]["includes_RGPs_from_the_input_genome"] = True for spot in spots_of_the_cc: @@ -1069,35 +1358,46 @@ def predict_spot_in_one_organism( spot.add(region) input_rgp_to_spots.update( - {rgp: spots_of_the_cc for rgp in input_rgps_of_the_cc}) + {rgp: spots_of_the_cc for rgp in input_rgps_of_the_cc} + ) if write_graph_flag: # remove node that would not be writable in graph file for node in graph_spot.nodes: del graph_spot.nodes[node]["spots"] - write_spot_graph(graph_spot, output, graph_formats, - file_basename='projected_spotGraph') + write_spot_graph( + graph_spot, output, graph_formats, file_basename="projected_spotGraph" + ) - write_rgp_to_spot_table(input_rgp_to_spots, output=output, - filename='input_genome_rgp_to_spot.tsv', compress=compress) + write_rgp_to_spot_table( + input_rgp_to_spots, + output=output, + filename="input_genome_rgp_to_spot.tsv", + compress=compress, + ) - input_org_spots = {spot for spots in input_rgp_to_spots.values() - for spot in spots} + input_org_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots} new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)} - logging.getLogger('PPanGGOLiN').debug( - f'{organism_name}: {len(new_spots)} new spots have been created for the input genome.') + logging.getLogger("PPanGGOLiN").debug( + f"{organism_name}: {len(new_spots)} new spots have been created for the input genome." + ) if new_spots: - summarize_spots(new_spots, output, compress=compress, - file_name="new_spots_summary.tsv") + summarize_spots( + new_spots, output, compress=compress, file_name="new_spots_summary.tsv" + ) return input_org_spots -def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Organism], - output: Path, compress: bool = False): +def project_and_write_modules( + pangenome: Pangenome, + input_organisms: Iterable[Organism], + output: Path, + compress: bool = False, +): """ Write a tsv file providing association between modules and the input organism @@ -1118,29 +1418,38 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or for mod in pangenome.modules: module_in_input_organism = any( - fam in input_organism_families for fam in mod.families) + fam in input_organism_families for fam in mod.families + ) if module_in_input_organism: counter += 1 modules_in_input_org.append(mod) completion = round( - len(set(input_organism.families) & set(mod.families)) / len(set(mod.families)), 2) + len(set(input_organism.families) & set(mod.families)) + / len(set(mod.families)), + 2, + ) fout.write( - f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") + f"module_{mod.ID}\t{input_organism.name}\t{completion}\n" + ) - logging.getLogger('PPanGGOLiN').debug( - f"{input_organism.name}: {counter} modules have been projected to the input genomes.") + logging.getLogger("PPanGGOLiN").debug( + f"{input_organism.name}: {counter} modules have been projected to the input genomes." + ) - logging.getLogger('PPanGGOLiN').debug( - f"{input_organism.name}: Projected modules have been written in: '{output_file}'") + logging.getLogger("PPanGGOLiN").debug( + f"{input_organism.name}: Projected modules have been written in: '{output_file}'" + ) input_orgs_to_modules[input_organism] = modules_in_input_org return input_orgs_to_modules -def infer_input_mode(input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser) -> str: +def infer_input_mode( + input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser +) -> str: """ Determine the input mode based on the provided input file and expected file types. @@ -1156,25 +1465,33 @@ def infer_input_mode(input_file: Path, expected_types: List[str], parser: argpar filetype = detect_filetype(input_file) except Exception: parser.error( - "Based on its content, the provided file is not recognized as a valid input file. Please ensure it is in one of the supported formats (FASTA, GFF/GBFF, or TSV).") + "Based on its content, the provided file is not recognized as a valid input file. Please ensure it is in one of the supported formats (FASTA, GFF/GBFF, or TSV)." + ) if filetype == "tsv": - logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a TSV file.") + logging.getLogger("PPanGGOLiN").debug( + f"The provided file ({input_file}) is detected as a TSV file." + ) mode = "multiple" elif filetype in expected_types: - logging.getLogger('PPanGGOLiN').debug( - f"The provided file ({input_file}) is detected as a single {'/'.join(expected_types)} file.") + logging.getLogger("PPanGGOLiN").debug( + f"The provided file ({input_file}) is detected as a single {'/'.join(expected_types)} file." + ) mode = "single" else: - logging.getLogger('PPanGGOLiN').error( - f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and {'/'.join(expected_types)} files of genomes to annotate.") + logging.getLogger("PPanGGOLiN").error( + f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and {'/'.join(expected_types)} files of genomes to annotate." + ) parser.error( - f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and files of genomes to annotate.") + f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and files of genomes to annotate." + ) return mode -def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str: +def check_projection_arguments( + args: argparse.Namespace, parser: argparse.ArgumentParser +) -> str: """ Check the arguments provided for genome projection and raise errors if they are incompatible or missing. @@ -1188,32 +1505,38 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume parser.error( "Please provide either a FASTA file or a tab-separated file listing sequence files using the '--fasta' option, " "or an annotation file or a tab-separated file listing annotation files using the '--anno' option. " - "You can specify these either through the command line or the configuration file.") + "You can specify these either through the command line or the configuration file." + ) mode_from_fasta, mode_from_anno = None, None if args.fasta: - mode_from_fasta = infer_input_mode(args.fasta, ['fasta'], parser) + mode_from_fasta = infer_input_mode(args.fasta, ["fasta"], parser) input_mode = mode_from_fasta if args.anno: - mode_from_anno = infer_input_mode(args.anno, ['gff', "gbff"], parser) + mode_from_anno = infer_input_mode(args.anno, ["gff", "gbff"], parser) input_mode = mode_from_anno - logging.getLogger('PPanGGOLiN').debug("") + logging.getLogger("PPanGGOLiN").debug("") if mode_from_fasta and mode_from_anno and mode_from_fasta != mode_from_anno: - single_input, multiple_input = ("fasta", "anno") if mode_from_fasta == "single" else ("anno", "fasta") + single_input, multiple_input = ( + ("fasta", "anno") if mode_from_fasta == "single" else ("anno", "fasta") + ) parser.error( f"You've provided both a single annotation/fasta file using the '--{single_input}' option and a list of files using " - f"the '--{multiple_input}' option. Please choose either a single file or a tab-separated file listing genome files, but not both.") + f"the '--{multiple_input}' option. Please choose either a single file or a tab-separated file listing genome files, but not both." + ) if input_mode == "multiple": # We are in paths file mode if args.circular_contigs: - parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " - "Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") + parser.error( + "You provided a TSV file listing the files of genomes you wish to annotate. " + "Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file." + ) if args.fasta: check_input_files(args.fasta, True) @@ -1239,86 +1562,134 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast) + predict_rgp, project_spots, project_modules = check_pangenome_for_projection( + pangenome, args.fast + ) need_graph = True if args.table else False - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, - need_spots=project_spots, need_graph=need_graph, need_metadata=True) + check_pangenome_info( + pangenome, + need_annotations=True, + need_families=True, + disable_bar=args.disable_prog_bar, + need_rgp=predict_rgp, + need_modules=project_modules, + need_gene_sequences=False, + need_spots=project_spots, + need_graph=need_graph, + need_metadata=True, + ) - logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') + logging.getLogger("PPanGGOLiN").info( + "Retrieving parameters from the provided pangenome file." + ) pangenome_params = argparse.Namespace( - **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + **{ + step: argparse.Namespace(**k_v) + for step, k_v in pangenome.parameters.items() + } + ) if predict_rgp: # computing multigenics for rgp prediction first to have original family.number_of_genomes # and the same multigenics list as when rgp and spot were predicted multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, - input_mode=args.input_mode, - anno=args.anno, fasta=args.fasta, - organism_name=args.genome_name, - circular_contigs=args.circular_contigs, - pangenome_params=pangenome_params, - cpu=args.cpu, - use_pseudo=args.use_pseudo, - disable_bar=args.disable_prog_bar, - tmpdir=args.tmpdir, config=args.config) - - input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, - use_representatives=args.fast, - no_defrag=args.no_defrag, - identity=args.identity, - coverage=args.coverage, - tmpdir=args.tmpdir, - translation_table=int( - pangenome_params.cluster.translation_table), - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) + organisms, genome_name_to_path, input_type = manage_input_genomes_annotation( + pangenome=pangenome, + input_mode=args.input_mode, + anno=args.anno, + fasta=args.fasta, + organism_name=args.genome_name, + circular_contigs=args.circular_contigs, + pangenome_params=pangenome_params, + cpu=args.cpu, + use_pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar, + tmpdir=args.tmpdir, + config=args.config, + ) + + input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families( + pangenome, + input_organisms=organisms, + output=output_dir, + cpu=args.cpu, + use_representatives=args.fast, + no_defrag=args.no_defrag, + identity=args.identity, + coverage=args.coverage, + tmpdir=args.tmpdir, + translation_table=int(pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar, + ) input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} if predict_rgp: - logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - - input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, - variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, - min_score=pangenome_params.rgp.min_score, multigenics=multigenics, - output_dir=output_dir, - disable_bar=args.disable_prog_bar, compress=args.compress) + logging.getLogger("PPanGGOLiN").info("Detecting RGPs in input genomes.") + + input_org_2_rgps = predict_RGP( + pangenome, + organisms, + persistent_penalty=pangenome_params.rgp.persistent_penalty, + variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, + min_score=pangenome_params.rgp.min_score, + multigenics=multigenics, + output_dir=output_dir, + disable_bar=args.disable_prog_bar, + compress=args.compress, + ) if project_spots: - logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') - input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_2_rgps=input_org_2_rgps, - multigenics=multigenics, - output=output_dir, - write_graph_flag=args.spot_graph, - graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size, - compress=args.compress) + logging.getLogger("PPanGGOLiN").info( + "Predicting spot of insertion in input genomes." + ) + input_org_to_spots = predict_spots_in_input_organisms( + initial_spots=list(pangenome.spots), + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size, + compress=args.compress, + ) if project_modules: - input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir, compress=args.compress) + input_orgs_to_modules = project_and_write_modules( + pangenome, organisms, output_dir, compress=args.compress + ) - write_projection_results(pangenome, organisms, input_org_2_rgps, - input_org_to_spots, - input_orgs_to_modules, - input_org_to_lonely_genes_count, - write_proksee=args.proksee, write_gff=args.gff, write_table=args.table, - add_sequences=args.add_sequences, - genome_name_to_path=genome_name_to_path, input_type=input_type, - output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core, - metadata_sep=args.metadata_sep, compress=args.compress, - need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp) + write_projection_results( + pangenome, + organisms, + input_org_2_rgps, + input_org_to_spots, + input_orgs_to_modules, + input_org_to_lonely_genes_count, + write_proksee=args.proksee, + write_gff=args.gff, + write_table=args.table, + add_sequences=args.add_sequences, + genome_name_to_path=genome_name_to_path, + input_type=input_type, + output_dir=output_dir, + dup_margin=args.dup_margin, + soft_core=args.soft_core, + metadata_sep=args.metadata_sep, + compress=args.compress, + need_modules=project_modules, + need_spots=project_spots, + need_regions=predict_rgp, + ) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -1330,7 +1701,8 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for projection command """ parser = sub_parser.add_parser( - "projection", formatter_class=argparse.RawTextHelpFormatter) + "projection", formatter_class=argparse.RawTextHelpFormatter + ) parser_projection(parser) return parser @@ -1343,108 +1715,222 @@ def parser_projection(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments") - required.add_argument('-p', '--pangenome', required=False, - type=Path, help="The pangenome.h5 file") + required.add_argument( + "-p", "--pangenome", required=False, type=Path, help="The pangenome.h5 file" + ) - required.add_argument('--fasta', required=False, type=Path, - help="Specify a FASTA file containing the genomic sequences of the genome(s) you wish to annotate, " - "or provide a tab-separated file listing genome names alongside their respective FASTA filepaths, with one line per genome.") + required.add_argument( + "--fasta", + required=False, + type=Path, + help="Specify a FASTA file containing the genomic sequences of the genome(s) you wish to annotate, " + "or provide a tab-separated file listing genome names alongside their respective FASTA filepaths, with one line per genome.", + ) - required.add_argument('--anno', required=False, type=Path, - help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. " - "Alternatively, you can provide a tab-separated file listing genome names alongside their respective annotation filepaths, " - "with one line per genome. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.") + required.add_argument( + "--anno", + required=False, + type=Path, + help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. " + "Alternatively, you can provide a tab-separated file listing genome names alongside their respective annotation filepaths, " + "with one line per genome. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.", + ) - required_single = parser.add_argument_group(title="Single Genome Arguments", - description="Use these options when providing a single FASTA or annotation file:") + required_single = parser.add_argument_group( + title="Single Genome Arguments", + description="Use these options when providing a single FASTA or annotation file:", + ) - required_single.add_argument("-n", '--genome_name', required=False, type=str, default="input_genome", - help="Specify the name of the genome whose genome you want to annotate when providing a single FASTA or annotation file.") + required_single.add_argument( + "-n", + "--genome_name", + required=False, + type=str, + default="input_genome", + help="Specify the name of the genome whose genome you want to annotate when providing a single FASTA or annotation file.", + ) - required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, - help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.") + required_single.add_argument( + "--circular_contigs", + nargs="+", + required=False, + type=tuple, + help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=Path, - default="ppanggolin_projection" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), - help="Output directory") + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default="ppanggolin_projection" + + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + + "_PID" + + str(os.getpid()), + help="Output directory", + ) - optional.add_argument('--no_defrag', required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with " - "their non-fragmented gene family. (default: False)") + optional.add_argument( + "--no_defrag", + required=False, + action="store_true", + help="DO NOT Realign gene families to link fragments with " + "their non-fragmented gene family. (default: False)", + ) - optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is faster but may be less sensitive. By default, all pangenome genes are used.") + optional.add_argument( + "--fast", + required=False, + action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. By default, all pangenome genes are used.", + ) - optional.add_argument('--identity', required=False, type=restricted_float, default=0.8, - help="min identity percentage threshold") + optional.add_argument( + "--identity", + required=False, + type=restricted_float, + default=0.8, + help="min identity percentage threshold", + ) - optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, - help="min coverage percentage threshold") + optional.add_argument( + "--coverage", + required=False, + type=restricted_float, + default=0.8, + help="min coverage percentage threshold", + ) - optional.add_argument("--use_pseudo", required=False, action="store_true", - help="In the context of provided annotation, use this option to read pseudogenes. " - "(Default behavior is to ignore them)") + optional.add_argument( + "--use_pseudo", + required=False, + action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. " + "(Default behavior is to ignore them)", + ) - optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of genomes in which the family must have multiple genes " - "for it to be considered 'duplicated'. " - "This metric is used to compute completeness and duplication of the input genomes") + optional.add_argument( + "--dup_margin", + required=False, + type=restricted_float, + default=0.05, + help="minimum ratio of genomes in which the family must have multiple genes " + "for it to be considered 'duplicated'. " + "This metric is used to compute completeness and duplication of the input genomes", + ) - optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold used when generating general statistics on the projected genome. " - "This threshold does not influence PPanGGOLiN's partitioning. " - "The value determines the minimum fraction of genomes that must possess a gene family " - "for it to be considered part of the soft core.") + optional.add_argument( + "--soft_core", + required=False, + type=restricted_float, + default=0.95, + help="Soft core threshold used when generating general statistics on the projected genome. " + "This threshold does not influence PPanGGOLiN's partitioning. " + "The value determines the minimum fraction of genomes that must possess a gene family " + "for it to be considered part of the soft core.", + ) - optional.add_argument("--spot_graph", required=False, action="store_true", - help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " - "as nodes. This graph can be used to visualize nodes that have RGPs from the input genome.") + optional.add_argument( + "--spot_graph", + required=False, + action="store_true", + help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " + "as nodes. This graph can be used to visualize nodes that have RGPs from the input genome.", + ) - optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", - default=['gexf'], help="Format of the output graph.") + optional.add_argument( + "--graph_formats", + required=False, + type=str, + choices=["gexf", "graphml"], + nargs="+", + default=["gexf"], + help="Format of the output graph.", + ) - optional.add_argument("--gff", required=False, action="store_true", - help="Generate GFF files with projected pangenome annotations for each input genome.") + optional.add_argument( + "--gff", + required=False, + action="store_true", + help="Generate GFF files with projected pangenome annotations for each input genome.", + ) - optional.add_argument("--proksee", required=False, action="store_true", - help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.") + optional.add_argument( + "--proksee", + required=False, + action="store_true", + help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.", + ) - optional.add_argument("--table", required=False, action="store_true", - help="Generate a tsv file for each input genome with pangenome annotations.") + optional.add_argument( + "--table", + required=False, + action="store_true", + help="Generate a tsv file for each input genome with pangenome annotations.", + ) - optional.add_argument("--compress", required=False, action="store_true", - help="Compress the files in .gz") + optional.add_argument( + "--compress", + required=False, + action="store_true", + help="Compress the files in .gz", + ) - optional.add_argument("--add_sequences", required=False, action="store_true", - help="Include input genome DNA sequences in GFF and Proksee output.") + optional.add_argument( + "--add_sequences", + required=False, + action="store_true", + help="Include input genome DNA sequences in GFF and Proksee output.", + ) - optional.add_argument("-c", "--cpu", required=False, - default=1, type=int, help="Number of available cpus") + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) - optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") + optional.add_argument( + "--tmpdir", + required=False, + type=Path, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) - optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + optional.add_argument( + "--keep_tmp", + required=False, + default=False, + action="store_true", + help="Keeping temporary files (useful for debugging).", + ) - optional.add_argument("--add_metadata", - required=False, - action="store_true", - help="Include metadata information in the output files " - "if any have been added to pangenome elements (see ppanggolin metadata command).") + optional.add_argument( + "--add_metadata", + required=False, + action="store_true", + help="Include metadata information in the output files " + "if any have been added to pangenome elements (see ppanggolin metadata command).", + ) - optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") + optional.add_argument( + "--metadata_sources", + default=None, + nargs="+", + help="Which source of metadata should be written. " + "By default all metadata sources are included.", + ) - optional.add_argument("--metadata_sep", - required=False, - default='|', - help="The separator used to join multiple metadata values for elements with multiple metadata" - " values from the same source. This character should not appear in metadata values.") + optional.add_argument( + "--metadata_sep", + required=False, + default="|", + help="The separator used to join multiple metadata values for elements with multiple metadata" + " values from the same source. This character should not appear in metadata values.", + ) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 504cb842..e2589e13 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -13,7 +13,11 @@ from ppanggolin.genome import Gene, Organism, Contig from ppanggolin.geneFamily import GeneFamily from ppanggolin.metadata import MetaFeatures -from ppanggolin.utils import find_region_border_position, get_consecutive_region_positions +from ppanggolin.utils import ( + find_region_border_position, + get_consecutive_region_positions, +) + class Region(MetaFeatures): """ @@ -36,6 +40,7 @@ class Region(MetaFeatures): - 'Starter': the first gene in the region. - 'stopper': the last gene in the region. """ + id_counter = 0 def __init__(self, name: str): @@ -55,7 +60,7 @@ def __init__(self, name: str): self._organism = None self.ID = Region.id_counter self._spot = None - self.projected = False # If the rgp is from a projected genome. If true can have multiple spots + self.projected = False # If the rgp is from a projected genome. If true can have multiple spots Region.id_counter += 1 def __str__(self): @@ -68,8 +73,7 @@ def __repr__(self) -> str: return f"RGP name:{self.name}" def __hash__(self) -> int: - """Create a hash value for the region - """ + """Create a hash value for the region""" return id(self) def __lt__(self, obj): @@ -89,16 +93,21 @@ def __eq__(self, other: Region) -> bool: :raises TypeError: Try to compare a region with another type object """ if not isinstance(other, Region): - raise TypeError(f"'Region' type object was expected, but '{type(other)}' type object was provided.") - if [gene.family for gene in self.genes] == [gene.family for gene in other.genes]: + raise TypeError( + f"'Region' type object was expected, but '{type(other)}' type object was provided." + ) + if [gene.family for gene in self.genes] == [ + gene.family for gene in other.genes + ]: return True - if [gene.family for gene in self.genes] == [gene.family for gene in list(other.genes)[::-1]]: + if [gene.family for gene in self.genes] == [ + gene.family for gene in list(other.genes)[::-1] + ]: return True return False def __len__(self) -> int: - """Get the number of genes in the region - """ + """Get the number of genes in the region""" return len(self._genes_getter) def __setitem__(self, position: int, gene: Gene): @@ -114,7 +123,9 @@ def __setitem__(self, position: int, gene: Gene): """ if position != gene.position: - raise ValueError(f"The given gene position ({position}) to set the gene in the region and the position of the gene ({gene.position}) are different. ") + raise ValueError( + f"The given gene position ({position}) to set the gene in the region and the position of the gene ({gene.position}) are different. " + ) if len(self) == 0: # first gene to be added to the region @@ -123,11 +134,15 @@ def __setitem__(self, position: int, gene: Gene): if len(self) > 0: if gene.organism != self.organism: - raise ValueError(f"Gene {gene.name} is from a different genome than the first defined in RGP. " - "That's not possible") + raise ValueError( + f"Gene {gene.name} is from a different genome than the first defined in RGP. " + "That's not possible" + ) if gene.contig != self.contig: - raise ValueError(f"Gene {gene.name} is from a different contig than the first defined in RGP. " - "That's not possible") + raise ValueError( + f"Gene {gene.name} is from a different contig than the first defined in RGP. " + "That's not possible" + ) if position in self._genes_getter and self[position] != gene: raise KeyError("Another gene already exist at this position") self._genes_getter[position] = gene @@ -142,18 +157,21 @@ def __setitem__(self, position: int, gene: Gene): def identify_rgp_last_and_first_genes(self): """ - Identify first and last genes of the rgp by taking into account the circularity of contigs. + Identify first and last genes of the rgp by taking into account the circularity of contigs. Set the attributes _starter: first gene of the region and _stopper: last gene of the region and _coordinates """ - rgp_genes_positions = list(self._genes_getter.keys() ) + rgp_genes_positions = list(self._genes_getter.keys()) if len(rgp_genes_positions) == 0: - raise ValueError(f'RGP ({self.name}) has no gene associated.') + raise ValueError(f"RGP ({self.name}) has no gene associated.") - gene = self._genes_getter[rgp_genes_positions[0]] # get a gene of the region - first_gene_position, last_gene_position = find_region_border_position(region_positions=rgp_genes_positions, contig_gene_count=gene.contig.number_of_genes) + gene = self._genes_getter[rgp_genes_positions[0]] # get a gene of the region + first_gene_position, last_gene_position = find_region_border_position( + region_positions=rgp_genes_positions, + contig_gene_count=gene.contig.number_of_genes, + ) self._starter = self._genes_getter[first_gene_position] self._stopper = self._genes_getter[last_gene_position] @@ -161,11 +179,16 @@ def identify_rgp_last_and_first_genes(self): if self._starter.start > self._stopper.stop: # this means region is overlapping the contig edge if not gene.contig.is_circular: - raise ValueError(f'Region seems to be overlapping the contig (first gene {self._starter.position}:{self._starter.coordinates} ' - f'and last gene {self._stopper.position}:{self._stopper.coordinates} ) ' - f'but the contig is not circular. This is unexpected. {rgp_genes_positions}') - - self._coordinates = [(self._starter.start, self._starter.contig.length), (1, self._stopper.stop)] + raise ValueError( + f"Region seems to be overlapping the contig (first gene {self._starter.position}:{self._starter.coordinates} " + f"and last gene {self._stopper.position}:{self._stopper.coordinates} ) " + f"but the contig is not circular. This is unexpected. {rgp_genes_positions}" + ) + + self._coordinates = [ + (self._starter.start, self._starter.contig.length), + (1, self._stopper.stop), + ] self._overlaps_contig_edge = True else: self._coordinates = [(self._starter.start, self._stopper.stop)] @@ -178,17 +201,23 @@ def get_ordered_genes(self) -> List[Gene]: :return: A list of genes ordered by their positions in the region. """ - rgp_genes_positions = list(self._genes_getter.keys() ) + rgp_genes_positions = list(self._genes_getter.keys()) - gene = self._genes_getter[rgp_genes_positions[0]] # get a gene of the region + gene = self._genes_getter[rgp_genes_positions[0]] # get a gene of the region - consecutive_region_positions = get_consecutive_region_positions(region_positions=rgp_genes_positions, contig_gene_count=gene.contig.number_of_genes) + consecutive_region_positions = get_consecutive_region_positions( + region_positions=rgp_genes_positions, + contig_gene_count=gene.contig.number_of_genes, + ) - ordered_genes = [self._genes_getter[position] for ordered_positions in consecutive_region_positions for position in ordered_positions] + ordered_genes = [ + self._genes_getter[position] + for ordered_positions in consecutive_region_positions + for position in ordered_positions + ] return ordered_genes - def __getitem__(self, position: int) -> Gene: """Get the gene at the given position @@ -201,7 +230,9 @@ def __getitem__(self, position: int) -> Gene: try: gene = self._genes_getter[position] except KeyError: - raise KeyError(f"There is no gene at position {position} in RGP {self.name}") + raise KeyError( + f"There is no gene at position {position} in RGP {self.name}" + ) else: return gene @@ -240,7 +271,7 @@ def string_coordinates(self) -> str: """ Return a string representation of the coordinates """ - return ','.join([f'{start}..{stop}' for start, stop in self.coordinates]) + return ",".join([f"{start}..{stop}" for start, stop in self.coordinates]) @property def overlaps_contig_edge(self) -> bool: @@ -255,7 +286,7 @@ def spot(self) -> Union[Spot, None]: @spot.setter def spot(self, spot: Spot): """Sets the spot of the RGP - + :param spot: spot to which the RGP is added :raise TypeError: if the given spot is not a Spot. @@ -263,7 +294,9 @@ def spot(self, spot: Spot): if isinstance(spot, Spot): self._spot = spot # only 1 spot possible else: - raise TypeError(f"Unexpected class / type for {type(spot)} when adding spot to a RGP") + raise TypeError( + f"Unexpected class / type for {type(spot)} when adding spot to a RGP" + ) def __delitem__(self, position): """Remove the gene at the given position @@ -274,7 +307,9 @@ def __delitem__(self, position): try: del self._genes_getter[position] except KeyError: - raise KeyError(f"There is no gene at position {position} in RGP {self.name}") + raise KeyError( + f"There is no gene at position {position} in RGP {self.name}" + ) def add(self, gene: Gene): """Add a gene to the region @@ -282,10 +317,12 @@ def add(self, gene: Gene): :param gene: Gene to add """ if not isinstance(gene, Gene): - raise TypeError(f"Unexpected class / type for {type(gene)} " - f"when adding it to a region of genomic plasticity") + raise TypeError( + f"Unexpected class / type for {type(gene)} " + f"when adding it to a region of genomic plasticity" + ) if gene.position is None: - raise AttributeError(f'Gene {gene.name} is not fill with position') + raise AttributeError(f"Gene {gene.name} is not fill with position") self[gene.position] = gene def get(self, position: int) -> Gene: @@ -298,7 +335,9 @@ def get(self, position: int) -> Gene: :raises TypeError: Position is not an integer """ if not isinstance(position, int): - raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + raise TypeError( + f"Position to get gene must be an integer. The provided type was {type(position)}" + ) return self[position] def remove(self, position): @@ -309,7 +348,9 @@ def remove(self, position): :raises TypeError: Position is not an integer """ if not isinstance(position, int): - raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + raise TypeError( + f"Position to get gene must be an integer. The provided type was {type(position)}" + ) del self[position] @property @@ -335,7 +376,9 @@ def modules(self) -> Set[Module]: :return: Modules found in families of the RGP """ - modules = {family.module for family in self.families if family.module is not None} + modules = { + family.module for family in self.families if family.module is not None + } return modules @property @@ -352,11 +395,11 @@ def length(self): :return: Size of the region """ - return sum([(stop - start +1) for start, stop in self.coordinates]) + return sum([(stop - start + 1) for start, stop in self.coordinates]) @property def organism(self) -> Organism: - """ Get the Organism link to RGP + """Get the Organism link to RGP :return: Organism corresponding to the region """ @@ -364,7 +407,7 @@ def organism(self) -> Organism: @property def contig(self) -> Contig: - """ Get the starter contig link to RGP + """Get the starter contig link to RGP :return: Contig corresponding to the region """ @@ -372,7 +415,7 @@ def contig(self) -> Contig: @property def start(self) -> int: - """ + """ Get the starter start link to RGP :return: start position in the contig of the first gene of the RGP @@ -381,7 +424,7 @@ def start(self) -> int: @property def stop(self) -> int: - """ + """ Get the stopper stop link to RGP :return: start position in the contig of the last gene of the RGP @@ -394,7 +437,10 @@ def is_whole_contig(self) -> bool: :return: True if whole contig else False """ - if self.starter.position == 0 and self.stopper.position == self.contig.number_of_genes - 1: + if ( + self.starter.position == 0 + and self.stopper.position == self.contig.number_of_genes - 1 + ): return True return False @@ -416,16 +462,18 @@ def is_contig_border(self) -> bool: return True return False - def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_persistents:bool = True) -> List[List[Gene], List[Gene]]: - """ + def get_bordering_genes( + self, n: int, multigenics: Set[GeneFamily], return_only_persistents: bool = True + ) -> List[List[Gene], List[Gene]]: + """ Get the bordered genes in the region. Find the n persistent and single copy gene bordering the region. If return_only_persistents is False, the method return all genes included between the n single copy and persistent genes. :param n: Number of genes to get :param multigenics: pangenome graph multigenic persistent families - :param return_only_persistents: return only non multgenic persistent genes identify as the region. - If False return all genes included between - the borders made of n persistent and single copy genes around the region. + :param return_only_persistents: return only non multgenic persistent genes identify as the region. + If False return all genes included between + the borders made of n persistent and single copy genes around the region. :return: A list of bordering genes in start and stop position """ @@ -435,7 +483,9 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ pos = self.starter.position init = pos single_copy_persistent_count = 0 - while single_copy_persistent_count < n and (pos != 0 or self.contig.is_circular): + while single_copy_persistent_count < n and ( + pos != 0 or self.contig.is_circular + ): curr_gene = None if pos == 0: if self.contig.is_circular: @@ -443,11 +493,19 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ else: curr_gene = self.contig[pos - 1] - if curr_gene is not None and curr_gene.family not in multigenics and \ - curr_gene.family.named_partition == "persistent" and curr_gene not in genes_in_region: + if ( + curr_gene is not None + and curr_gene.family not in multigenics + and curr_gene.family.named_partition == "persistent" + and curr_gene not in genes_in_region + ): left_border.append(curr_gene) - single_copy_persistent_count +=1 - elif curr_gene is not None and curr_gene not in genes_in_region and not return_only_persistents: + single_copy_persistent_count += 1 + elif ( + curr_gene is not None + and curr_gene not in genes_in_region + and not return_only_persistents + ): left_border.append(curr_gene) pos -= 1 @@ -456,23 +514,33 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ if pos == init: break # looped around the contig - # Identifying right border + # Identifying right border right_border = [] pos = self.stopper.position init = pos single_copy_persistent_count = 0 - while single_copy_persistent_count < n and (pos != self.contig.number_of_genes - 1 or self.contig.is_circular): + while single_copy_persistent_count < n and ( + pos != self.contig.number_of_genes - 1 or self.contig.is_circular + ): curr_gene = None if pos == self.contig.number_of_genes - 1: if self.contig.is_circular: curr_gene = self.contig[0] else: curr_gene = self.contig[pos + 1] - if curr_gene is not None and curr_gene.family not in multigenics and \ - curr_gene.family.named_partition == "persistent" and curr_gene not in genes_in_region: + if ( + curr_gene is not None + and curr_gene.family not in multigenics + and curr_gene.family.named_partition == "persistent" + and curr_gene not in genes_in_region + ): right_border.append(curr_gene) - single_copy_persistent_count +=1 - elif curr_gene is not None and curr_gene not in genes_in_region and not return_only_persistents: + single_copy_persistent_count += 1 + elif ( + curr_gene is not None + and curr_gene not in genes_in_region + and not return_only_persistents + ): right_border.append(curr_gene) pos += 1 if pos == self.contig.number_of_genes and self.contig.is_circular: @@ -487,7 +555,7 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ class Spot(MetaFeatures): """ The 'Spot' class represents a region of genomic plasticity. - + Methods: - 'regions': the property that generates the regions in the spot. - 'families': the property that generates the gene families in the spot. @@ -509,7 +577,9 @@ def __init__(self, spot_id: int): :param spot_id: Identifier of the spot """ if not isinstance(spot_id, int): - raise TypeError(f"Spot identifier must be an integer. Given type is {type(spot_id)}") + raise TypeError( + f"Spot identifier must be an integer. Given type is {type(spot_id)}" + ) super().__init__() self.ID = spot_id self._region_getter = {} @@ -517,13 +587,11 @@ def __init__(self, spot_id: int): self._uniqContent = {} def __repr__(self) -> str: - """Spot representation - """ + """Spot representation""" return f"Spot {self.ID} - #RGP: {len(self)}" def __str__(self): - """String representation of the spot - """ + """String representation of the spot""" return f"spot_{self.ID}" def __setitem__(self, name: str, region: Region): @@ -542,8 +610,10 @@ def __setitem__(self, name: str, region: Region): # where a projected RGP might link two spots in the spot graph. # To handle this scenario without triggering failure, we check the 'projected' attribute of the given region. - raise ValueError(f"The region '{region.name}' is already associated with spot '{region.spot.ID}' while being associated with spot '{self.ID}'. " - "A region should only belong to one spot.") + raise ValueError( + f"The region '{region.name}' is already associated with spot '{region.spot.ID}' while being associated with spot '{self.ID}'. " + "A region should only belong to one spot." + ) self._region_getter[name] = region region.spot = self @@ -559,7 +629,9 @@ def __getitem__(self, name) -> Region: :raises TypeError: Name is not a string """ if not isinstance(name, str): - raise TypeError(f"Name of the region must be a string. The provided type was {type(name)}") + raise TypeError( + f"Name of the region must be a string. The provided type was {type(name)}" + ) try: region = self._region_getter[name] except KeyError: @@ -576,15 +648,16 @@ def __delitem__(self, name): :raises TypeError: Name is not a string """ if not isinstance(name, str): - raise TypeError(f"Name of the region must be a string. The provided type was {type(name)}") + raise TypeError( + f"Name of the region must be a string. The provided type was {type(name)}" + ) try: del self._region_getter[name] except KeyError: raise KeyError(f"Region with {name} does not exist in spot") def __len__(self) -> int: - """Get the number of regions in the spot - """ + """Get the number of regions in the spot""" return len(self._region_getter) def add(self, region: Region): @@ -596,7 +669,9 @@ def add(self, region: Region): :raises TypeError: Region is not an instance Region """ if not isinstance(region, Region): - raise TypeError(f"A Region object is expected to be added to the spot. find type is {type(region)}") + raise TypeError( + f"A Region object is expected to be added to the spot. find type is {type(region)}" + ) self[region.name] = region def get(self, name: str) -> Region: @@ -647,26 +722,31 @@ def number_of_families(self) -> int: return len({family for region in self.regions for family in region.families}) def spot_2_families(self): - """Add to Gene Families a link to spot - """ + """Add to Gene Families a link to spot""" for family in self.families: family.add_spot(self) - def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily], List[GeneFamily]]]: - """ Extracts all the borders of all RGPs belonging to the spot + def borders( + self, set_size: int, multigenics + ) -> List[List[int, List[GeneFamily], List[GeneFamily]]]: + """Extracts all the borders of all RGPs belonging to the spot :param set_size: Number of genes to get :param multigenics: pangenome graph multigenic persistent families :return: Families that bordering spot """ - all_borders = [rgp.get_bordering_genes(set_size, multigenics) - for rgp in self.regions] + all_borders = [ + rgp.get_bordering_genes(set_size, multigenics) for rgp in self.regions + ] family_borders = [] for borders in all_borders: new = True - curr_set = [[gene.family for gene in borders[0]], [gene.family for gene in borders[1]]] + curr_set = [ + [gene.family for gene in borders[0]], + [gene.family for gene in borders[1]], + ] for i, (c, former_borders) in enumerate(family_borders): if former_borders == curr_set or former_borders == curr_set[::-1]: family_borders[i][0] += 1 @@ -678,8 +758,7 @@ def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily] return family_borders def _mk_uniq_ordered_set_obj(self): - """cluster RGP into groups that have an identical synteny - """ + """cluster RGP into groups that have an identical synteny""" for rgp in self.regions: z = True for seen_rgp in self._uniqOrderedSet: @@ -690,7 +769,7 @@ def _mk_uniq_ordered_set_obj(self): self._uniqOrderedSet[rgp] = {rgp} def _get_ordered_set(self) -> Dict[Region, Set[Region]]: - """ Creates the _uniqSyn object if it was never computed. Return it in any case + """Creates the _uniqSyn object if it was never computed. Return it in any case :return: RGP groups that have an identical synteny """ @@ -699,7 +778,7 @@ def _get_ordered_set(self) -> Dict[Region, Set[Region]]: return self._uniqOrderedSet def get_uniq_to_rgp(self) -> Dict[Region, Set[Region]]: - """ Get dictionary with a representing RGP as the key, and all identical RGPs as value + """Get dictionary with a representing RGP as the key, and all identical RGPs as value :return: Dictionary with a representing RGP as the key, and set of identical RGPs as value """ @@ -713,8 +792,7 @@ def get_uniq_ordered_set(self) -> Set[Region]: return set(self._get_ordered_set().keys()) def _mk_uniq_content(self): - """cluster RGP into groups that have identical gene content - """ + """cluster RGP into groups that have identical gene content""" for rgp in self.regions: z = True for seen_rgp in self._uniqContent: @@ -734,7 +812,7 @@ def _get_content(self) -> Dict[Region, Set[Region]]: return self._uniqContent def get_uniq_content(self) -> Set[Region]: - """ Get an Iterable of all the unique rgp (in terms of gene family content) in the spot + """Get an Iterable of all the unique rgp (in terms of gene family content) in the spot :return: Iterable of all the unique rgp (in terms of gene family content) in the spot """ @@ -776,7 +854,9 @@ def __init__(self, module_id: int, families: set = None): :param families: Set of families which define the module """ if not isinstance(module_id, int): - raise TypeError(f"Module identifier must be an integer. Given type is {type(module_id)}") + raise TypeError( + f"Module identifier must be an integer. Given type is {type(module_id)}" + ) super().__init__() self.ID = module_id self._families_getter = {} @@ -786,23 +866,19 @@ def __init__(self, module_id: int, families: set = None): self.add(family) def __repr__(self) -> str: - """Module representation - """ + """Module representation""" return f"Module {self.ID} - #Families: {len(self)}" def __str__(self) -> str: - """String representation of the module - """ + """String representation of the module""" return f"module_{self.ID}" def __hash__(self) -> int: - """Create a hash value for the module - """ + """Create a hash value for the module""" return id(self) def __len__(self) -> int: - """Get the number of families in the module - """ + """Get the number of families in the module""" return len(self._families_getter) def __eq__(self, other: Module) -> bool: @@ -816,7 +892,9 @@ def __eq__(self, other: Module) -> bool: :raises TypeError: Try to compare a module with another type object """ if not isinstance(other, Module): - raise TypeError(f"Another module is expected to be compared to the first one. You give a {type(other)}") + raise TypeError( + f"Another module is expected to be compared to the first one. You give a {type(other)}" + ) return set(self.families) == set(other.families) def __setitem__(self, name: str, family: GeneFamily): @@ -829,7 +907,9 @@ def __setitem__(self, name: str, family: GeneFamily): :raises KeyError: Another family with the same name already exists in the module """ if name in self._families_getter and self[name] != family: - raise KeyError("A different gene family with the same name already exist in the module") + raise KeyError( + "A different gene family with the same name already exist in the module" + ) self._families_getter[name] = family family.set_module(self) @@ -845,7 +925,9 @@ def __getitem__(self, name) -> GeneFamily: try: family = self._families_getter[name] except KeyError: - raise KeyError(f"There isn't gene family with the name {name} in the module") + raise KeyError( + f"There isn't gene family with the name {name} in the module" + ) else: return family @@ -859,7 +941,9 @@ def __delitem__(self, name): try: fam = self._families_getter[name] except KeyError: - raise KeyError(f"There isn't gene family with the name {name} in the module") + raise KeyError( + f"There isn't gene family with the name {name} in the module" + ) else: del self._families_getter[name] fam._module = None # TODO define method to remove a module from family @@ -873,7 +957,9 @@ def add(self, family: GeneFamily): :raises TypeError: Region is not an instance Region """ if not isinstance(family, GeneFamily): - raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") + raise TypeError( + f"A gene family is expected to be added to module. Given type was {type(family)}" + ) self[family.name] = family def get(self, name: str) -> GeneFamily: @@ -913,7 +999,7 @@ def organisms(self) -> Generator[Organism, None, None]: organisms |= set(fam.organisms) yield from organisms - def mk_bitarray(self, index: Dict[GeneFamily, int], partition: str = 'all'): + def mk_bitarray(self, index: Dict[GeneFamily, int], partition: str = "all"): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. @@ -921,27 +1007,29 @@ def mk_bitarray(self, index: Dict[GeneFamily, int], partition: str = 'all'): :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member - if partition == 'all': + if partition == "all": logging.getLogger("PPanGGOLiN").debug("all") for fam in self.families: self.bitarray[index[fam]] = 1 - elif partition == 'persistent': + elif partition == "persistent": logging.getLogger("PPanGGOLiN").debug("persistent") for fam in self.families: - if fam.named_partition in ['persistent']: + if fam.named_partition in ["persistent"]: self.bitarray[index[fam]] = 1 - elif partition in ['shell', 'cloud']: + elif partition in ["shell", "cloud"]: logging.getLogger("PPanGGOLiN").debug("shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 - elif partition == 'accessory': + elif partition == "accessory": logging.getLogger("PPanGGOLiN").debug("accessory") for fam in self.families: - if fam.named_partition in ['shell', 'cloud']: + if fam.named_partition in ["shell", "cloud"]: self.bitarray[index[fam]] = 1 else: - raise Exception("There is not any partition corresponding please report a github issue") + raise Exception( + "There is not any partition corresponding please report a github issue" + ) class GeneContext: @@ -958,7 +1046,12 @@ class GeneContext: - graph: context graph corresponding to the gene context """ - def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None): + def __init__( + self, + gc_id: int, + families: Set[GeneFamily] = None, + families_of_interest: Set[GeneFamily] = None, + ): """Constructor method :param gc_id: Identifier of the gene context. @@ -967,7 +1060,9 @@ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_int """ if not isinstance(gc_id, int): - raise TypeError(f"Gene context identifier must be an integer. Given type is {type(gc_id)}") + raise TypeError( + f"Gene context identifier must be an integer. Given type is {type(gc_id)}" + ) self.ID = gc_id self._families_getter = {} @@ -975,28 +1070,26 @@ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_int self._graph = None if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily objects. " - "GeneContexts are only made of GeneFamily objects.") + raise Exception( + "You provided elements that were not GeneFamily objects. " + "GeneContexts are only made of GeneFamily objects." + ) self._families_getter = {family.name: family for family in families} def __repr__(self) -> str: - """Context representation - """ + """Context representation""" return f"Context {self.ID} - #Families: {len(self)}" def __str__(self) -> str: - """String representation of the gene context - """ - return f'GC_{str(self.ID)}' + """String representation of the gene context""" + return f"GC_{str(self.ID)}" def __hash__(self) -> int: - """Create a hash value for the region - """ + """Create a hash value for the region""" return id(self) def __len__(self) -> int: - """Get the number of families in the context - """ + """Get the number of families in the context""" return len(self._families_getter) def __eq__(self, other: GeneContext) -> bool: @@ -1010,7 +1103,9 @@ def __eq__(self, other: GeneContext) -> bool: :raises TypeError: Try to compare a gene context with another type object """ if not isinstance(other, GeneContext): - raise TypeError(f"Another context is expected to be compared to the first one. You give a {type(other)}") + raise TypeError( + f"Another context is expected to be compared to the first one. You give a {type(other)}" + ) return set(self.families) == set(other.families) def __setitem__(self, name, family): @@ -1023,9 +1118,13 @@ def __setitem__(self, name, family): :raises KeyError: Another family with the same name already exists in the context """ if not isinstance(family, GeneFamily): - raise TypeError(f"A gene family is expected to be added to gene context. Given type was {type(family)}") + raise TypeError( + f"A gene family is expected to be added to gene context. Given type was {type(family)}" + ) if name in self._families_getter and self[name] != family: - raise KeyError("A different gene family with the same name already exist in the gene context") + raise KeyError( + "A different gene family with the same name already exist in the gene context" + ) self._families_getter[name] = family def __getitem__(self, name) -> GeneFamily: @@ -1040,7 +1139,9 @@ def __getitem__(self, name) -> GeneFamily: try: family = self._families_getter[name] except KeyError: - raise KeyError(f"There isn't gene family with the name {name} in the gene context") + raise KeyError( + f"There isn't gene family with the name {name} in the gene context" + ) else: return family @@ -1054,7 +1155,9 @@ def __delitem__(self, name): try: del self._families_getter[name] except KeyError: - raise KeyError(f"There isn't gene family with the name {name} in the gene context") + raise KeyError( + f"There isn't gene family with the name {name} in the gene context" + ) @property def graph(self): @@ -1089,6 +1192,8 @@ def add_family(self, family: GeneFamily): :param family: The gene family to add. """ if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GeneFamily object. " - "GeneContexts are only made of GeneFamily objects.") + raise Exception( + "You did not provide a GeneFamily object. " + "GeneContexts are only made of GeneFamily objects." + ) self[family.name] = family diff --git a/ppanggolin/utility/__init__.py b/ppanggolin/utility/__init__.py index 162e4744..b35b161f 100644 --- a/ppanggolin/utility/__init__.py +++ b/ppanggolin/utility/__init__.py @@ -1 +1 @@ -from .utils import subparser, launch \ No newline at end of file +from .utils import subparser, launch diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index cd85be7a..612eafce 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -6,9 +6,19 @@ import os from pathlib import Path from typing import List + # local libraries -from ppanggolin.utils import get_subcommand_parser, check_log, ALL_INPUT_PARAMS, ALL_GENERAL_PARAMS, \ - WORKFLOW_SUBCOMMANDS, ALL_WORKFLOW_DEPENDENCIES, WRITE_PAN_FLAG_DEFAULT_IN_WF, WRITE_GENOME_FLAG_DEFAULT_IN_WF, DRAW_FLAG_DEFAULT_IN_WF +from ppanggolin.utils import ( + get_subcommand_parser, + check_log, + ALL_INPUT_PARAMS, + ALL_GENERAL_PARAMS, + WORKFLOW_SUBCOMMANDS, + ALL_WORKFLOW_DEPENDENCIES, + WRITE_PAN_FLAG_DEFAULT_IN_WF, + WRITE_GENOME_FLAG_DEFAULT_IN_WF, + DRAW_FLAG_DEFAULT_IN_WF, +) from ppanggolin import SUBCOMMAND_TO_SUBPARSER """ Utility scripts to help formatting input files of PPanggolin.""" @@ -16,27 +26,35 @@ def split(list_object: list, chunk_count: int) -> List[List[int]]: """ - Split list into n chunk. + Split list into n chunk. :params list_object: list to split :params chunk_count: number of final chunk - :return : list of chunk of the initial list. + :return : list of chunk of the initial list. """ quotient, remainder = divmod(len(list_object), chunk_count) - return [list_object[index * quotient + min(index, remainder):(index + 1) * quotient + min(index + 1, remainder)] for - index in range(chunk_count)] + return [ + list_object[ + index * quotient + + min(index, remainder) : (index + 1) * quotient + + min(index + 1, remainder) + ] + for index in range(chunk_count) + ] -def split_comment_string(comment_string: str, max_word_count: int = 20, prefix: str = "\n # ") -> str: +def split_comment_string( + comment_string: str, max_word_count: int = 20, prefix: str = "\n # " +) -> str: """ Split a line of comment into multiple line. :params comment_string: comment string to split :params max_word_count: maximum number of word per line :params prefix: prefix used to start a new comment line - + :return : the split comment line. """ @@ -44,21 +62,23 @@ def split_comment_string(comment_string: str, max_word_count: int = 20, prefix: word_count = len(splitted_comment) line_count = round(word_count / max_word_count) + 1 - comment_lines = [' '.join(words) for words in split(splitted_comment, line_count)] + comment_lines = [" ".join(words) for words in split(splitted_comment, line_count)] return prefix.join(comment_lines) -def get_input_argument_lines(argument_actions: List[argparse._SubParsersAction]) -> List[str]: +def get_input_argument_lines( + argument_actions: List[argparse._SubParsersAction], +) -> List[str]: """ Manage input argument from a specific list of parser actions and format them for the yaml output. Input arguments are commented in the config file: as no default is valid. - Help and possible values of the argument is added as comment line. + Help and possible values of the argument is added as comment line. - :param argument_actions: list of parser action for input arguments. + :param argument_actions: list of parser action for input arguments. - :return: default arguments for the given command + :return: default arguments for the given command """ arg_default_lines = [] @@ -71,15 +91,17 @@ def get_input_argument_lines(argument_actions: List[argparse._SubParsersAction]) return arg_default_lines -def get_default_argument_lines(argument_actions: List[argparse._SubParsersAction]) -> List[str]: +def get_default_argument_lines( + argument_actions: List[argparse._SubParsersAction], +) -> List[str]: """ Get default arguments for a specific list of parser actions and format them for the yaml output. - Help and possible values of the argument is added as comment line. + Help and possible values of the argument is added as comment line. - :param argument_actions: list of parser action arguments. + :param argument_actions: list of parser action arguments. - :return: default arguments for the given command + :return: default arguments for the given command """ arg_default_lines = [] @@ -92,7 +114,9 @@ def get_default_argument_lines(argument_actions: List[argparse._SubParsersAction arg_default_lines.append(f" # {action.help}") if action.choices: - arg_default_lines.append(f" # Choices: {', '.join([str(choice) for choice in action.choices])}") + arg_default_lines.append( + f" # Choices: {', '.join([str(choice) for choice in action.choices])}" + ) # When default is None, it is replaced by False to omit the arg and get the None value as expected. default = action.default if action.default is not None else False @@ -101,7 +125,9 @@ def get_default_argument_lines(argument_actions: List[argparse._SubParsersAction return arg_default_lines -def deduplicate_actions(actions: List[argparse._SubParsersAction]) -> List[argparse._SubParsersAction]: +def deduplicate_actions( + actions: List[argparse._SubParsersAction], +) -> List[argparse._SubParsersAction]: """ Deduplicate duplicate actions based on their dest. @@ -132,25 +158,33 @@ def launch_default_config(args: argparse.Namespace): initial_command = args.default_config if args.output.exists() and not args.force: - raise FileExistsError(f"{args.output} already exists. Use -f if you want to overwrite it.") + raise FileExistsError( + f"{args.output} already exists. Use -f if you want to overwrite it." + ) - ignored_params = ['config', 'help'] + ignored_params = ["config", "help"] - workflow_dependencies = {sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if - sub_cmd not in ["rgp", "spot", "module"]} + workflow_dependencies = { + sub_cmd + for sub_cmd in ALL_WORKFLOW_DEPENDENCIES + if sub_cmd not in ["rgp", "spot", "module"] + } - if initial_command in ['panrgp', 'all']: + if initial_command in ["panrgp", "all"]: workflow_dependencies |= {"rgp", "spot"} - if initial_command in ['panmodule', 'all']: - workflow_dependencies.add('module') + if initial_command in ["panmodule", "all"]: + workflow_dependencies.add("module") if initial_command in WORKFLOW_SUBCOMMANDS: # it is clearer if the order of the subcommand is conserved in wf config file - commands = [initial_command] + [sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if - sub_cmd in workflow_dependencies] + commands = [initial_command] + [ + sub_cmd + for sub_cmd in ALL_WORKFLOW_DEPENDENCIES + if sub_cmd in workflow_dependencies + ] elif initial_command == "projection": - commands = [initial_command] + ['annotate'] + commands = [initial_command] + ["annotate"] else: commands = [initial_command] @@ -168,12 +202,18 @@ def launch_default_config(args: argparse.Namespace): specific_actions = [] # overwrite some default value for write cmd in a workflow context - if initial_command in WORKFLOW_SUBCOMMANDS and sub_command in ['write_pangenome', "write_genomes"]: + if initial_command in WORKFLOW_SUBCOMMANDS and sub_command in [ + "write_pangenome", + "write_genomes", + ]: for sub_action in sub._actions: - if sub_action.dest in WRITE_PAN_FLAG_DEFAULT_IN_WF + WRITE_GENOME_FLAG_DEFAULT_IN_WF : + if ( + sub_action.dest + in WRITE_PAN_FLAG_DEFAULT_IN_WF + WRITE_GENOME_FLAG_DEFAULT_IN_WF + ): sub_action.default = True # overwrite some default value for draw cmd in a workflow context - if initial_command in WORKFLOW_SUBCOMMANDS and sub_command == 'draw': + if initial_command in WORKFLOW_SUBCOMMANDS and sub_command == "draw": for sub_action in sub._actions: if sub_action.dest in DRAW_FLAG_DEFAULT_IN_WF: sub_action.default = True @@ -199,10 +239,10 @@ def launch_default_config(args: argparse.Namespace): inputs_actions = deduplicate_actions(inputs_actions) general_actions = deduplicate_actions(general_actions) - arg_lines = ['input_parameters:'] + arg_lines = ["input_parameters:"] arg_lines += get_input_argument_lines(inputs_actions) - arg_lines.append('\ngeneral_parameters:') + arg_lines.append("\ngeneral_parameters:") arg_lines += get_default_argument_lines(general_actions) for sub_command, specific_actions in sub_cmd_to_actions.items(): @@ -213,9 +253,9 @@ def launch_default_config(args: argparse.Namespace): arg_lines.append(f"\n{sub_command}:") arg_lines += get_default_argument_lines(specific_actions) - logging.getLogger("PPanGGOLiN").info(f'Writting default config in {args.output}') - with open(args.output, 'w') as fl: - fl.write('\n'.join(arg_lines) + '\n') + logging.getLogger("PPanGGOLiN").info(f"Writting default config in {args.output}") + with open(args.output, "w") as fl: + fl.write("\n".join(arg_lines) + "\n") def launch(args: argparse.Namespace): @@ -240,46 +280,77 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for info command """ - parser = sub_parser.add_parser("utils", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "utils", formatter_class=argparse.RawTextHelpFormatter + ) parser_default_config(parser) return parser def parser_default_config(parser: argparse.ArgumentParser): """ - Parser for specific argument of utils command + Parser for specific argument of utils command :param parser: parser for utils argument """ subcommands = list(SUBCOMMAND_TO_SUBPARSER.keys()) - required = parser.add_argument_group(title="Required arguments", - description="All of the following arguments are required :") + required = parser.add_argument_group( + title="Required arguments", + description="All of the following arguments are required :", + ) - required.add_argument('--default_config', required=False, type=str, default=None, # nargs="*",, - help="Generate a config file with default values for the given subcommand.", - choices=subcommands) + required.add_argument( + "--default_config", + required=False, + type=str, + default=None, # nargs="*",, + help="Generate a config file with default values for the given subcommand.", + choices=subcommands, + ) optional = parser.add_argument_group(title="Config arguments") - optional.add_argument('-o', '--output', type=Path, default='default_config.yaml', - help='name and path of the config file with default parameters written in yaml.') - - optional.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], - help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - - optional.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") - - optional.add_argument('-f', '--force', action="store_true", - help="Overwrite the given output file if it exists.") - - -if __name__ == '__main__': + optional.add_argument( + "-o", + "--output", + type=Path, + default="default_config.yaml", + help="name and path of the config file with default parameters written in yaml.", + ) + + optional.add_argument( + "--verbose", + required=False, + type=int, + default=1, + choices=[0, 1, 2], + help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)", + ) + + optional.add_argument( + "--log", + required=False, + type=check_log, + default="stdout", + help="log output file", + ) + + optional.add_argument( + "-f", + "--force", + action="store_true", + help="Overwrite the given output file if it exists.", + ) + + +if __name__ == "__main__": """To test local change and allow using debugger""" main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser_default_config(main_parser) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index d0ecc29d..b9661a68 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -29,24 +29,64 @@ from collections import defaultdict # all input params that exists in ppanggolin -ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', - "fasta_file", "annot_file", "genome_name"] # the last three params is for projection cmd +ALL_INPUT_PARAMS = [ + "fasta", + "anno", + "clusters", + "pangenome", + "fasta_file", + "annot_file", + "genome_name", +] # the last three params is for projection cmd # all params that should be in the general_parameters section of the config file -ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', - 'disable_prog_bar', 'force', "config"] - -WORKFLOW_SUBCOMMANDS = {'all', 'workflow', 'panrgp', 'panmodule'} +ALL_GENERAL_PARAMS = [ + "output", + "basename", + "rarefaction", + "no_flat_files", + "tmpdir", + "verbose", + "log", + "disable_prog_bar", + "force", + "config", +] + +WORKFLOW_SUBCOMMANDS = {"all", "workflow", "panrgp", "panmodule"} # command that can be launched inside a workflow subcommand -ALL_WORKFLOW_DEPENDENCIES = ["annotate", "cluster", "graph", "partition", "rarefaction", "rgp", "spot", "module", - "draw", "write_pangenome", "write_genomes"] +ALL_WORKFLOW_DEPENDENCIES = [ + "annotate", + "cluster", + "graph", + "partition", + "rarefaction", + "rgp", + "spot", + "module", + "draw", + "write_pangenome", + "write_genomes", +] # Inside a workflow command, write output default is overwrite to output some flat files -WRITE_PAN_FLAG_DEFAULT_IN_WF = ["csv", "Rtab", "gexf", "light_gexf", - 'stats', 'json', 'partitions', 'regions', - 'borders', 'modules', 'spot_modules', "spots", "families_tsv"] -WRITE_GENOME_FLAG_DEFAULT_IN_WF = ['table', 'proksee', "gff"] +WRITE_PAN_FLAG_DEFAULT_IN_WF = [ + "csv", + "Rtab", + "gexf", + "light_gexf", + "stats", + "json", + "partitions", + "regions", + "borders", + "modules", + "spot_modules", + "spots", + "families_tsv", +] +WRITE_GENOME_FLAG_DEFAULT_IN_WF = ["table", "proksee", "gff"] DRAW_FLAG_DEFAULT_IN_WF = ["tile_plot", "ucurve", "draw_spots"] @@ -71,23 +111,29 @@ def check_log(log_file: str) -> TextIO: if os.access(log_file, os.W_OK): return log_file else: - raise OSError(f"The given log file {log_file} is not writable. Please check if it is accessible.") + raise OSError( + f"The given log file {log_file} is not writable. Please check if it is accessible." + ) else: - raise OSError(f"The given log file: {log_file} is a directory. Please provide a valid log file.") + raise OSError( + f"The given log file: {log_file} is a directory. Please provide a valid log file." + ) # target does not exist, check perms on parent dir parent_dir = os.path.dirname(log_file) if not parent_dir: - parent_dir = '.' + parent_dir = "." # target is creatable if parent dir is writable if os.access(parent_dir, os.W_OK): return log_file else: - raise OSError(f"The given log file {log_file} is not writable. Please check if it is accessible.") + raise OSError( + f"The given log file {log_file} is not writable. Please check if it is accessible." + ) def check_tsv_sanity(tsv: Path): - """ Check if the given tsv is readable for the next PPanGGOLiN step + """Check if the given tsv is readable for the next PPanGGOLiN step :param tsv: Path to the tsv containing organims information """ @@ -96,8 +142,10 @@ def check_tsv_sanity(tsv: Path): except OSError as ios_error: raise OSError(ios_error) except Exception as exception_error: - raise Exception(f"The following unexpected error happened when opening the list of genomes path: " - f"{exception_error}") + raise Exception( + f"The following unexpected error happened when opening the list of genomes path: " + f"{exception_error}" + ) else: name_set = set() duplicated_names = set() @@ -107,9 +155,11 @@ def check_tsv_sanity(tsv: Path): if len(elements) <= 1: raise Exception(f"No tabulation separator found in given file: {tsv}") if " " in elements[0]: - raise Exception(f"Your genome names contain spaces (The first encountered genome name that had " - f"this string: '{elements[0]}'). To ensure compatibility with all of the dependencies " - f"of PPanGGOLiN this is not allowed. Please remove spaces from your genome names.") + raise Exception( + f"Your genome names contain spaces (The first encountered genome name that had " + f"this string: '{elements[0]}'). To ensure compatibility with all of the dependencies " + f"of PPanGGOLiN this is not allowed. Please remove spaces from your genome names." + ) old_len = len(name_set) name_set.add(elements[0]) if len(name_set) == old_len: @@ -118,16 +168,20 @@ def check_tsv_sanity(tsv: Path): if not org_path.exists() and not tsv.parent.joinpath(org_path).exists(): non_existing_files.add(elements[1]) if len(non_existing_files) != 0: - raise Exception(f"Some of the given files do not exist. The non-existing files are the following : " - f"'{' '.join(non_existing_files)}'") + raise Exception( + f"Some of the given files do not exist. The non-existing files are the following : " + f"'{' '.join(non_existing_files)}'" + ) if len(duplicated_names) != 0: - raise Exception(f"Some of your genomes have identical names. The duplicated names are the following : " - f"'{' '.join(duplicated_names)}'") + raise Exception( + f"Some of your genomes have identical names. The duplicated names are the following : " + f"'{' '.join(duplicated_names)}'" + ) input_file.close() def check_input_files(file: Path, check_tsv: bool = False): - """ Checks if the provided input files exist and are of the proper format + """Checks if the provided input files exist and are of the proper format :param file: Path to the file :param check_tsv: Allow checking tsv file for annotation or fasta list @@ -136,7 +190,9 @@ def check_input_files(file: Path, check_tsv: bool = False): if check_tsv: check_tsv_sanity(file) else: - raise FileNotFoundError(f"No such file or directory: '{file.absolute().as_posix()}'") + raise FileNotFoundError( + f"No such file or directory: '{file.absolute().as_posix()}'" + ) def set_verbosity_level(args): @@ -151,26 +207,32 @@ def set_verbosity_level(args): elif args.verbose == 0: level = logging.WARNING # only warnings and errors - if args.log != sys.stdout and not args.disable_prog_bar: # if output is not to stdout we remove progress bars. + if ( + args.log != sys.stdout and not args.disable_prog_bar + ): # if output is not to stdout we remove progress bars. args.disable_prog_bar = True str_format = "%(asctime)s %(filename)s:l%(lineno)d %(levelname)s\t%(message)s" - datefmt = '%Y-%m-%d %H:%M:%S' + datefmt = "%Y-%m-%d %H:%M:%S" if args.log in [sys.stdout, sys.stderr]: # use stream - logging.basicConfig(stream=args.log, level=level, - format=str_format, - datefmt=datefmt) + logging.basicConfig( + stream=args.log, level=level, format=str_format, datefmt=datefmt + ) else: # log is written in a files. basic condif uses filename - logging.basicConfig(filename=args.log, level=level, - format=str_format, - datefmt=datefmt) - logging.getLogger("PPanGGOLiN").info("Command: " + " ".join(arg for arg in sys.argv)) - logging.getLogger("PPanGGOLiN").info(f"PPanGGOLiN version: {distribution('ppanggolin').version}") + logging.basicConfig( + filename=args.log, level=level, format=str_format, datefmt=datefmt + ) + logging.getLogger("PPanGGOLiN").info( + "Command: " + " ".join(arg for arg in sys.argv) + ) + logging.getLogger("PPanGGOLiN").info( + f"PPanGGOLiN version: {distribution('ppanggolin').version}" + ) def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: - """ Compute the jaccard similarities + """Compute the jaccard similarities :param mat: :param jaccard_similarity_th: threshold @@ -184,13 +246,15 @@ def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: # for columns bb = cols_sum[ab.indices] similarities = ab.copy() - similarities.data /= (aa + bb - ab.data) + similarities.data /= aa + bb - ab.data similarities.data[similarities.data < jaccard_similarity_th] = 0 similarities.eliminate_zeros() return similarities -def is_compressed(file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO]) -> Tuple[bool, Union[str, None]]: +def is_compressed( + file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO] +) -> Tuple[bool, Union[str, None]]: """ Detects if a file is compressed based on its file signature. @@ -201,10 +265,10 @@ def is_compressed(file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO :raises TypeError: If the file type is not supported. """ file_signatures = { - b'\x1f\x8b': 'gzip', - b'BZh': 'bz2', - b'\x50\x4b\x03\x04': 'zip', - b'\xfd\x37\x7a\x58\x5a\x00': 'xz' + b"\x1f\x8b": "gzip", + b"BZh": "bz2", + b"\x50\x4b\x03\x04": "zip", + b"\xfd\x37\x7a\x58\x5a\x00": "xz", } def check_file_signature(byte_stream) -> Tuple[bool, Union[str, None]]: @@ -222,7 +286,7 @@ def check_file_signature(byte_stream) -> Tuple[bool, Union[str, None]]: # Determine the type of file and read its first few bytes if isinstance(file_or_file_path, Path): - with file_or_file_path.open('rb') as file: + with file_or_file_path.open("rb") as file: first_bytes = file.read(4) else: if isinstance(file_or_file_path, BinaryIO): @@ -238,8 +302,9 @@ def check_file_signature(byte_stream) -> Tuple[bool, Union[str, None]]: return check_file_signature(first_bytes) -def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO]) \ - -> Union[TextIOWrapper, BinaryIO, TextIO]: +def read_compressed_or_not( + file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO] +) -> Union[TextIOWrapper, BinaryIO, TextIO]: """ Opens and reads a file, decompressing it if necessary. @@ -256,15 +321,19 @@ def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrappe is_comp, comp_type = is_compressed(file_or_file_path) if is_comp: if comp_type == "gzip": - return gzip.open(file_or_file_path, 'rt') + return gzip.open(file_or_file_path, "rt") elif comp_type == "bz2": - return bz2.open(file_or_file_path, 'rt') + return bz2.open(file_or_file_path, "rt") elif comp_type == "xz": - raise NotImplementedError("Unfortunately PPanGGOLiN does not support xz compressed files. " - "Please report an issue on our GitHub to let us know we should work on it.") + raise NotImplementedError( + "Unfortunately PPanGGOLiN does not support xz compressed files. " + "Please report an issue on our GitHub to let us know we should work on it." + ) elif comp_type == "zip": with zipfile.ZipFile(file_or_file_path, "r") as z: - logging.getLogger("PPanGGOLiN").warning("Assuming we want to read the first file in the ZIP archive") + logging.getLogger("PPanGGOLiN").warning( + "Assuming we want to read the first file in the ZIP archive" + ) file_list = z.namelist() if file_list: return TextIOWrapper(z.open(file_list[0], "r")) @@ -275,7 +344,9 @@ def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrappe return file_or_file_path -def write_compressed_or_not(file_path: Path, compress: bool = False) -> Union[gzip.GzipFile, TextIOWrapper]: +def write_compressed_or_not( + file_path: Path, compress: bool = False +) -> Union[gzip.GzipFile, TextIOWrapper]: """ Create a file-like object, compressed or not. @@ -285,13 +356,13 @@ def write_compressed_or_not(file_path: Path, compress: bool = False) -> Union[gz :return: file-like object, compressed or not """ if compress: - return gzip.open(file_path.parent / (file_path.name + '.gz'), mode="wt") + return gzip.open(file_path.parent / (file_path.name + ".gz"), mode="wt") else: return open(file_path, "w") def mk_outdir(output: Path, force: bool = False, exist_ok: bool = False): - """ Create a directory at the given output if it doesn't exist already + """Create a directory at the given output if it doesn't exist already :param output: Path where to create directory :param force: Force to write in the directory @@ -300,12 +371,15 @@ def mk_outdir(output: Path, force: bool = False, exist_ok: bool = False): :raise FileExistError: The current path already exist and force is false """ if not output.is_dir(): - logging.getLogger("PPanGGOLiN").debug(f"Create output directory {output.absolute().as_posix()}") + logging.getLogger("PPanGGOLiN").debug( + f"Create output directory {output.absolute().as_posix()}" + ) Path.mkdir(output, exist_ok=exist_ok) else: if not force: raise FileExistsError( - f"{output} already exists. Use -f if you want to overwrite the files in the directory") + f"{output} already exists. Use -f if you want to overwrite the files in the directory" + ) @contextmanager @@ -315,7 +389,8 @@ def create_tmpdir(main_dir, basename="tmpdir", keep_tmp=False): new_tmpdir = main_dir / dir_name logging.getLogger("PPanGGOLiN").debug( - f'Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained.') + f"Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained." + ) mk_outdir(new_tmpdir, force=True) yield new_tmpdir @@ -323,7 +398,8 @@ def create_tmpdir(main_dir, basename="tmpdir", keep_tmp=False): else: with tempfile.TemporaryDirectory(dir=main_dir, prefix=basename) as new_tmpdir: logging.getLogger("PPanGGOLiN").debug( - f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained.") + f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained." + ) yield Path(new_tmpdir) @@ -343,7 +419,9 @@ def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: mk_outdir(output, force) if filename.exists() and not force: - raise FileExistsError(f"{filename.name} already exists. Use -f if you want to overwrite the file") + raise FileExistsError( + f"{filename.name} already exists. Use -f if you want to overwrite the file" + ) return filename @@ -360,17 +438,20 @@ def detect_filetype(filename: Path) -> str: first_line = f.readline() if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file return "gbff" - elif re.match(r"##gff-version\s{1,3}3", - first_line): # prodigal gff header has two spaces between gff-version and 3... some gff user can have a tab - return 'gff' + elif re.match( + r"##gff-version\s{1,3}3", first_line + ): # prodigal gff header has two spaces between gff-version and 3... some gff user can have a tab + return "gff" elif first_line.startswith(">"): - return 'fasta' + return "fasta" elif "\t" in first_line: return "tsv" else: - raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') " - "nor gbff/gbk (file starts with 'LOCUS ') " - "nor fasta (file starts with '>') nor tsv (file has '\t' in the first line). ") + raise Exception( + f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') " + "nor gbff/gbk (file starts with 'LOCUS ') " + "nor fasta (file starts with '>') nor tsv (file has '\t' in the first line). " + ) def restricted_float(x: Union[int, float]) -> float: @@ -442,8 +523,10 @@ def _plain_bfs(g: nx.Graph, source: Any, removed: set, weight: float): edge_genes_v = g[v][n]["genes"][v] edge_genes_n = g[v][n]["genes"][n] # if the edge is indeed existent for most genes of both families, we use it - if len(edge_genes_n) / len(g.nodes[n]["genes"]) >= weight and len(edge_genes_v) / len( - g.nodes[v]["genes"]) >= weight: + if ( + len(edge_genes_n) / len(g.nodes[n]["genes"]) >= weight + and len(edge_genes_v) / len(g.nodes[v]["genes"]) >= weight + ): nextlevel.add(n) @@ -476,13 +559,17 @@ def check_option_workflow(args): :param args: list of arguments """ if args.clusters is not None and not any([args.fasta, args.anno]): - raise Exception("If you give --clusters option, you must give at least --fasta or --anno") + raise Exception( + "If you give --clusters option, you must give at least --fasta or --anno" + ) if not any([args.fasta, args.anno]): raise Exception("At least one of --fasta or --anno must be given") if args.infer_singletons and args.clusters is None: - logging.getLogger("PPanGGOLiN").warning("--infer_singleton works only with --clusters.") + logging.getLogger("PPanGGOLiN").warning( + "--infer_singleton works only with --clusters." + ) def parse_config_file(yaml_config_file: str) -> dict: @@ -491,7 +578,7 @@ def parse_config_file(yaml_config_file: str) -> dict: :param yaml_config_file: config file in yaml - :return: dict of config with key the command and as value another dict with param as key and value as value. + :return: dict of config with key the command and as value another dict with param as key and value as value. """ with yaml_config_file as yaml_fh: @@ -502,11 +589,15 @@ def parse_config_file(yaml_config_file: str) -> dict: # if config has a Parameters key. Update config with its content if config and "Parameters" in config: - config.update(config['Parameters']) - del config['Parameters'] + config.update(config["Parameters"]) + del config["Parameters"] # remove empty section that have no parameter specified in it. In this case they have a None value - config = {section: param_val_dict for section, param_val_dict in config.items() if param_val_dict is not None} + config = { + section: param_val_dict + for section, param_val_dict in config.items() + if param_val_dict is not None + } return config @@ -517,17 +608,44 @@ def add_common_arguments(subparser: argparse.ArgumentParser): :param subparser: A subparser object from any subcommand. """ - common = subparser._action_groups.pop(1) # get the 'optional arguments' action group. + common = subparser._action_groups.pop( + 1 + ) # get the 'optional arguments' action group. common.title = "Common arguments" - common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], - help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") - common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", - help="disables the progress bars") - common.add_argument('-f', '--force', action="store_true", - help="Force writing in output directory and in pangenome output file.") - common.add_argument("--config", required=False, type=argparse.FileType(), - help="Specify command arguments through a YAML configuration file.") + common.add_argument( + "--verbose", + required=False, + type=int, + default=1, + choices=[0, 1, 2], + help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)", + ) + common.add_argument( + "--log", + required=False, + type=check_log, + default="stdout", + help="log output file", + ) + common.add_argument( + "-d", + "--disable_prog_bar", + required=False, + action="store_true", + help="disables the progress bars", + ) + common.add_argument( + "-f", + "--force", + action="store_true", + help="Force writing in output directory and in pangenome output file.", + ) + common.add_argument( + "--config", + required=False, + type=argparse.FileType(), + help="Specify command arguments through a YAML configuration file.", + ) subparser._action_groups.append(common) @@ -545,7 +663,11 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper return arg_val -def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Namespace, cli_args: argparse.Namespace): +def overwrite_args( + default_args: argparse.Namespace, + config_args: argparse.Namespace, + cli_args: argparse.Namespace, +): """ Overwrite args objects. @@ -559,12 +681,12 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names :return: final arguments """ args = argparse.Namespace() - all_params = [arg for arg in dir(default_args) if not arg.startswith('_')] + all_params = [arg for arg in dir(default_args) if not arg.startswith("_")] for param in all_params: default_val = getattr(default_args, param) - cli_val = getattr(cli_args, param, 'unspecified') - config_val = getattr(config_args, param, 'unspecified') + cli_val = getattr(cli_args, param, "unspecified") + config_val = getattr(config_args, param, "unspecified") if param in cli_args and param not in config_args: # Use the value from the command line argument @@ -573,7 +695,8 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names if default_val != cli_val and param != "config": logging.getLogger("PPanGGOLiN").debug( f'The parameter "--{param}: {get_arg_name(cli_val)}" has been specified in the command line with a non-default value.' - f' Its value overwrites the default value ({get_arg_name(default_val)}).') + f" Its value overwrites the default value ({get_arg_name(default_val)})." + ) elif param not in cli_args and param in config_args: # Use the value from the config file @@ -582,7 +705,8 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names if default_val != config_val: logging.getLogger("PPanGGOLiN").debug( f'The parameter "--{param}: {get_arg_name(config_val)}" has been specified in the config file with a non-default value.' - f' Its value overwrites the default value ({get_arg_name(default_val)}).') + f" Its value overwrites the default value ({get_arg_name(default_val)})." + ) elif param in cli_args and param in config_args: # Use the value from the command line argument (cli) if it's different from the config file (config) @@ -591,15 +715,17 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names if cli_val == config_val and cli_val != default_val: logging.getLogger("PPanGGOLiN").debug( f'The parameter "--{param} {get_arg_name(cli_val)}" has been specified in both the command line ' - f'and the config file with the same values, but with non-default value. ' - f'Its value overwrites the default value ({get_arg_name(default_val)}).') + f"and the config file with the same values, but with non-default value. " + f"Its value overwrites the default value ({get_arg_name(default_val)})." + ) elif cli_val != config_val and param != "config": # Values in cli and config differ. Use the value from the command line argument (cli) logging.getLogger("PPanGGOLiN").debug( f'The parameter "--{param}" has been specified in both the command line ("{get_arg_name(cli_val)}") ' f'and the config file ("{get_arg_name(config_val)}") with different values. ' - f'The value from the command line argument is used.') + f"The value from the command line argument is used." + ) else: # Parameter is not defined in cli and in config. Use the default value. setattr(args, param, default_val) @@ -617,7 +743,7 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): :return: object with combined arguments """ - other_arg_names = [arg for arg in dir(another_args) if not arg.startswith('_')] + other_arg_names = [arg for arg in dir(another_args) if not arg.startswith("_")] for arg_name in other_arg_names: arg_val = getattr(another_args, arg_name) @@ -626,8 +752,11 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_differing_from_default( + default_args: argparse.Namespace, + final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None, +) -> dict: """ Get the parameters that have different value than default values. @@ -638,25 +767,35 @@ def get_args_differing_from_default(default_args: argparse.Namespace, final_args :return: A dict with param that differ from default as key and the final value of the param as value """ param_to_ignore = [] if param_to_ignore is None else param_to_ignore - all_params = [arg for arg in dir(final_args) if not arg.startswith('_') if arg not in param_to_ignore] - - params_that_differ = {param: getattr(final_args, param) for param in all_params if - getattr(default_args, param) != getattr(final_args, param)} + all_params = [ + arg + for arg in dir(final_args) + if not arg.startswith("_") + if arg not in param_to_ignore + ] + + params_that_differ = { + param: getattr(final_args, param) + for param in all_params + if getattr(default_args, param) != getattr(final_args, param) + } return params_that_differ -def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_subparser: dict) -> argparse.Namespace: +def manage_cli_and_config_args( + subcommand: str, config_file: str, subcommand_to_subparser: dict +) -> argparse.Namespace: """ Manage command line and config arguments for the given subcommand. This function parse arguments from the cmd line and config file and set up the following priority: cli > config > default - When the subcommand is a workflow, the subcommand used in workflows are also parsed in the config. + When the subcommand is a workflow, the subcommand used in workflows are also parsed in the config. :params subcommand: Name of the subcommand. :params config_file: Path to the config file given in argument. If None, only default and cli arguments value are used. - :params subcommand_to_subparser: Dict with subcommand name as key and the corresponding subparser function as value. + :params subcommand_to_subparser: Dict with subcommand name as key and the corresponding subparser function as value. """ if config_file: config = parse_config_file(config_file) @@ -672,32 +811,54 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ cli_args = get_cli_args(cmd_subparser) - all_cmd_param_names = {arg_name for arg_name in dir(default_args) if not arg_name.startswith('_')} + all_cmd_param_names = { + arg_name for arg_name in dir(default_args) if not arg_name.startswith("_") + } input_params = {param for param in all_cmd_param_names if param in ALL_INPUT_PARAMS} - general_params = {param for param in all_cmd_param_names if param in ALL_GENERAL_PARAMS} + general_params = { + param for param in all_cmd_param_names if param in ALL_GENERAL_PARAMS + } specific_params = all_cmd_param_names - (input_params | general_params) all_unspecific_params = ALL_INPUT_PARAMS + ALL_GENERAL_PARAMS # manage logging first to correctly set it up and to be able to log any issue when using config file later on - config_general_args = get_config_args(subcommand, cmd_subparser, config, "general_parameters", general_params, - strict_config_check=False) + config_general_args = get_config_args( + subcommand, + cmd_subparser, + config, + "general_parameters", + general_params, + strict_config_check=False, + ) general_args = overwrite_args(default_args, config_general_args, cli_args) set_verbosity_level(general_args) - config_input_args = get_config_args(subcommand, cmd_subparser, config, "input_parameters", input_params, - strict_config_check=True) + config_input_args = get_config_args( + subcommand, + cmd_subparser, + config, + "input_parameters", + input_params, + strict_config_check=True, + ) if subcommand in WORKFLOW_SUBCOMMANDS: # for workflow commands there is no section dedicated in the config: so no specific_args # only general_parameters and sections of commands launched in the worklow commands are used config_args = combine_args(config_general_args, config_input_args) else: - config_specific_args = get_config_args(subcommand, cmd_subparser, config, subcommand, specific_params, - strict_config_check=True) + config_specific_args = get_config_args( + subcommand, + cmd_subparser, + config, + subcommand, + specific_params, + strict_config_check=True, + ) config_args = combine_args(config_general_args, config_specific_args) config_args = combine_args(config_args, config_input_args) @@ -705,40 +866,64 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_differing_from_default(default_args, args, input_params) + params_that_differ = get_args_differing_from_default( + default_args, args, input_params + ) if params_that_differ: - params_that_differ_str = ', '.join(f'{p}={v}' for p, v in params_that_differ.items()) + params_that_differ_str = ", ".join( + f"{p}={v}" for p, v in params_that_differ.items() + ) logging.getLogger("PPanGGOLiN").debug( - f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}") + f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}" + ) # manage workflow command workflow_steps = [] if subcommand in WORKFLOW_SUBCOMMANDS: for workflow_step in ALL_WORKFLOW_DEPENDENCIES: - if (workflow_step in ["rgp", "spot"] and subcommand in ["workflow", "panmodule"]) or \ - (workflow_step == "module" and subcommand in ["workflow", "panrgp"]): + if ( + workflow_step in ["rgp", "spot"] + and subcommand in ["workflow", "panmodule"] + ) or (workflow_step == "module" and subcommand in ["workflow", "panrgp"]): continue - logging.getLogger("PPanGGOLiN").debug(f'Parsing {workflow_step} arguments in config file.') + logging.getLogger("PPanGGOLiN").debug( + f"Parsing {workflow_step} arguments in config file." + ) step_subparser = subcommand_to_subparser[workflow_step] - default_step_args = get_default_args(workflow_step, step_subparser, unwanted_args=all_unspecific_params) + default_step_args = get_default_args( + workflow_step, step_subparser, unwanted_args=all_unspecific_params + ) # remove general args - all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} - specific_step_params = {param_name for param_name in all_param_names if - param_name not in all_unspecific_params} - config_step_args = get_config_args(workflow_step, step_subparser, config, workflow_step, - specific_step_params, strict_config_check=True) + all_param_names = { + arg_name + for arg_name in dir(default_step_args) + if not arg_name.startswith("_") + } + specific_step_params = { + param_name + for param_name in all_param_names + if param_name not in all_unspecific_params + } + config_step_args = get_config_args( + workflow_step, + step_subparser, + config, + workflow_step, + specific_step_params, + strict_config_check=True, + ) # overwrite write and draw default when not specified in config - if workflow_step == 'write_pangenome': + if workflow_step == "write_pangenome": for out_flag in WRITE_PAN_FLAG_DEFAULT_IN_WF: if out_flag not in config[workflow_step]: setattr(default_step_args, out_flag, True) - if workflow_step == 'write_genomes': + if workflow_step == "write_genomes": for out_flag in WRITE_GENOME_FLAG_DEFAULT_IN_WF: if out_flag not in config[workflow_step]: setattr(default_step_args, out_flag, True) @@ -750,16 +935,24 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) + step_params_that_differ = get_args_differing_from_default( + default_step_args, step_args + ) if step_params_that_differ: - step_params_that_differ_str = ', '.join(f'{p}={v}' for p, v in step_params_that_differ.items()) - logging.getLogger("PPanGGOLiN").debug(f"{len(step_params_that_differ)} {workflow_step} parameters have " - f"a non-default value: {step_params_that_differ_str}") + step_params_that_differ_str = ", ".join( + f"{p}={v}" for p, v in step_params_that_differ.items() + ) + logging.getLogger("PPanGGOLiN").debug( + f"{len(step_params_that_differ)} {workflow_step} parameters have " + f"a non-default value: {step_params_that_differ_str}" + ) # add step name to differentiate the params - step_params_that_differ = {f'{workflow_step}:{param}': value for param, value in - step_params_that_differ.items()} + step_params_that_differ = { + f"{workflow_step}:{param}": value + for param, value in step_params_that_differ.items() + } params_that_differ.update(step_params_that_differ) @@ -767,7 +960,9 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ setattr(args, workflow_step, step_args) if params_that_differ: - logging.getLogger("PPanGGOLiN").info(f'{len(params_that_differ)} parameters have a non-default value.') + logging.getLogger("PPanGGOLiN").info( + f"{len(params_that_differ)} parameters have a non-default value." + ) check_config_consistency(config, workflow_steps) @@ -776,9 +971,9 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ def check_config_consistency(config: dict, workflow_steps: list): """ - Check that the same parameter used in different subcommand inside a workflow has the same value. + Check that the same parameter used in different subcommand inside a workflow has the same value. - If not, the function throw a logging.getLogger("PPanGGOLiN").warning. + If not, the function throw a logging.getLogger("PPanGGOLiN").warning. :params config_dict: config dict with as key the section of the config file and as value another dict pairing name and value of parameters. :params workflow_steps: list of subcommand names used in the workflow execution. @@ -798,17 +993,25 @@ def count_different_values(values: Iterable[Union[int, str, Tuple, List]]) -> in return len(hashable_values) # params used in multiple subcommands - all_params = [param for subcmd, param_to_value_dict in config.items() for param in param_to_value_dict if - subcmd in workflow_steps] + all_params = [ + param + for subcmd, param_to_value_dict in config.items() + for param in param_to_value_dict + if subcmd in workflow_steps + ] duplicate_params = [param for param in all_params if all_params.count(param) > 1] for duplicate_param in set(duplicate_params): - step_to_value = {step: param_to_value[duplicate_param] for step, param_to_value in config.items() if - duplicate_param in param_to_value} + step_to_value = { + step: param_to_value[duplicate_param] + for step, param_to_value in config.items() + if duplicate_param in param_to_value + } if count_different_values(step_to_value.values()) > 1: logging.getLogger("PPanGGOLiN").warning( - f'The parameter {duplicate_param} used in multiple subcommands of the workflow is specified with different values in config file: {step_to_value}.') + f"The parameter {duplicate_param} used in multiple subcommands of the workflow is specified with different values in config file: {step_to_value}." + ) def set_up_config_param_to_parser(config_param_val: dict) -> list: @@ -841,8 +1044,9 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: return arguments_to_parse -def get_subcommand_parser(subparser_fct: Callable, name: str = '') \ - -> Tuple[argparse._SubParsersAction, argparse.ArgumentParser]: +def get_subcommand_parser( + subparser_fct: Callable, name: str = "" +) -> Tuple[argparse._SubParsersAction, argparse.ArgumentParser]: """ Get subcommand parser object using the given subparser function. @@ -860,10 +1064,11 @@ def get_subcommand_parser(subparser_fct: Callable, name: str = '') \ prog = f"Parsing section {name} in config file" usage = "Yaml config file" - parser = argparse.ArgumentParser(prog=prog, - allow_abbrev=False, add_help=False) + parser = argparse.ArgumentParser(prog=prog, allow_abbrev=False, add_help=False) - subparsers = parser.add_subparsers(metavar="", dest="subcommand", title="subcommands", description="") + subparsers = parser.add_subparsers( + metavar="", dest="subcommand", title="subcommands", description="" + ) sub = subparser_fct(subparsers) sub.usage = usage @@ -876,7 +1081,9 @@ def get_subcommand_parser(subparser_fct: Callable, name: str = '') \ return parser, sub -def get_default_args(subcommand: str, subparser_fct: Callable, unwanted_args: list = None) -> argparse.Namespace: +def get_default_args( + subcommand: str, subparser_fct: Callable, unwanted_args: list = None +) -> argparse.Namespace: """ Get default value for the arguments for the given subparser function. @@ -884,21 +1091,29 @@ def get_default_args(subcommand: str, subparser_fct: Callable, unwanted_args: li :params subparser_fct: Subparser function to use. This subparser give the expected argument for the subcommand. :params unwanted_args: List of arguments to filter out. - :return args: arguments with default values. + :return args: arguments with default values. """ unwanted_args = [] if unwanted_args is None else unwanted_args parser, sub = get_subcommand_parser(subparser_fct, subcommand) # remove unwanted argumnents - sub._actions = [p_action for p_action in sub._actions if p_action.dest not in unwanted_args] + sub._actions = [ + p_action for p_action in sub._actions if p_action.dest not in unwanted_args + ] args = parser.parse_args([subcommand]) return args -def get_config_args(subcommand: str, subparser_fct: Callable, config_dict: dict, config_section: str, - expected_params: Union[List[str], Set[str]], strict_config_check: bool) -> argparse.Namespace: +def get_config_args( + subcommand: str, + subparser_fct: Callable, + config_dict: dict, + config_section: str, + expected_params: Union[List[str], Set[str]], + strict_config_check: bool, +) -> argparse.Namespace: """ Parsing parameters of a specific section of the config file. @@ -921,18 +1136,27 @@ def get_config_args(subcommand: str, subparser_fct: Callable, config_dict: dict, erase_default_value(sub) # Manage args - sub._actions = [p_action for p_action in sub._actions if p_action.dest in expected_params] + sub._actions = [ + p_action for p_action in sub._actions if p_action.dest in expected_params + ] if not strict_config_check: # remove param found in config that are not expected by parser. useful for general_parameters. expected_args_names = [p_action.dest for p_action in sub._actions] - unexpected_config = [f'{name}:{value}' for name, value in config.items() if name not in expected_args_names] - config = {name: value for name, value in config.items() if name in expected_args_names} + unexpected_config = [ + f"{name}:{value}" + for name, value in config.items() + if name not in expected_args_names + ] + config = { + name: value for name, value in config.items() if name in expected_args_names + } if unexpected_config: logging.getLogger("PPanGGOLiN").info( - f'While parsing {config_section} section in config file, {len(unexpected_config)} unexpected parameters ' - f'were ignored : {" ".join(unexpected_config)}') + f"While parsing {config_section} section in config file, {len(unexpected_config)} unexpected parameters " + f'were ignored : {" ".join(unexpected_config)}' + ) else: for param_name in config: if param_name not in expected_params: @@ -951,7 +1175,7 @@ def get_config_args(subcommand: str, subparser_fct: Callable, config_dict: dict, def get_cli_args(subparser_fct: Callable) -> argparse.Namespace: """ - Parse command line arguments using the specified parsing function. + Parse command line arguments using the specified parsing function. :params subparser_fct: Subparser function to use. This subparser give the expected argument for the subcommand. """ @@ -965,14 +1189,14 @@ def get_cli_args(subparser_fct: Callable) -> argparse.Namespace: # remove argument that have not been specified delete_unspecified_args(cli_args) - delattr(cli_args, 'subcommand') + delattr(cli_args, "subcommand") return cli_args def erase_default_value(parser: argparse.ArgumentParser): """ - Remove default action in the given list of argument parser actions. + Remove default action in the given list of argument parser actions. This is dnoe to distinguish specified arguments. @@ -996,8 +1220,12 @@ def delete_unspecified_args(args: argparse.Namespace): delattr(args, arg_name) -def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, - is_circular: bool = False): +def extract_contig_window( + contig_size: int, + positions_of_interest: Iterable[int], + window_size: int, + is_circular: bool = False, +): """ Extracts contiguous windows around positions of interest within a contig. @@ -1014,8 +1242,10 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] # Check if any position of interest is out of range if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: - raise IndexError(f'Positions of interest are out of range. ' - f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + raise IndexError( + f"Positions of interest are out of range. " + f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions" + ) if is_circular: first_position = sorted_positions[0] @@ -1053,7 +1283,9 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] return windows_coordinates -def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Path, List[str]]]]: +def parse_input_paths_file( + path_list_file: Path, +) -> Dict[str, Dict[str, Union[Path, List[str]]]]: """ Parse an input paths file to extract genome information. @@ -1066,7 +1298,9 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa :raises FileNotFoundError: If a specified genome file path does not exist. :raises Exception: If there are no genomes in the provided file. """ - logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process genome files") + logging.getLogger("PPanGGOLiN").info( + f"Reading {path_list_file} to process genome files" + ) genome_name_to_genome_path = {} for line in read_compressed_or_not(path_list_file): @@ -1081,13 +1315,14 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa if not genome_file_path_alt.exists(): raise FileNotFoundError( - f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") + f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist." + ) else: genome_file_path = genome_file_path_alt genome_name_to_genome_path[genome_name] = { "path": genome_file_path, - "circular_contigs": putative_circular_contigs + "circular_contigs": putative_circular_contigs, } if len(genome_name_to_genome_path) == 0: @@ -1096,7 +1331,9 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa return genome_name_to_genome_path -def flatten_nested_dict(nested_dict: Dict[str, Union[Dict, int, str, float]]) -> Dict[str, Union[int, str, float]]: +def flatten_nested_dict( + nested_dict: Dict[str, Union[Dict, int, str, float]] +) -> Dict[str, Union[int, str, float]]: """ Flattens a nested dictionary into a flat dictionary by concatenating keys at different levels. @@ -1105,7 +1342,7 @@ def flatten_nested_dict(nested_dict: Dict[str, Union[Dict, int, str, float]]) -> """ flat_dict = {} - def flatten(dictionary, parent_key=''): + def flatten(dictionary, parent_key=""): for key, val in dictionary.items(): new_key = f"{parent_key}_{key}" if parent_key else key if isinstance(val, dict): @@ -1126,7 +1363,7 @@ def get_major_version(version: str) -> int: :raises ValueError: If the input version does not have the expected format. """ try: - major_version = int(version.split('.')[0]) + major_version = int(version.split(".")[0]) except ValueError: raise ValueError(f"Version {version} does not have the expected format.") @@ -1140,27 +1377,31 @@ def check_version_compatibility(file_version: str) -> None: :param file_version: A string representing the version of the pangenome file. """ # Get the current PPanGGOLiN version - current_version = distribution('ppanggolin').version + current_version = distribution("ppanggolin").version current_version_major = get_major_version(current_version) file_major_version = get_major_version(file_version) # Check for compatibility issues if file_major_version != current_version_major: - logging.getLogger("PPanGGOLiN").error('Your pangenome file has been created with a different major version ' - 'of PPanGGOLiN than the one installed in the system. This mismatch may lead to compatibility issues.') + logging.getLogger("PPanGGOLiN").error( + "Your pangenome file has been created with a different major version " + "of PPanGGOLiN than the one installed in the system. This mismatch may lead to compatibility issues." + ) if file_major_version < 2 and current_version_major >= 2: - raise ValueError(f'The provided pangenome file was created by PPanGGOLiN version {file_version}, which is ' - f'incompatible with the current PPanGGOLiN version {current_version}.') + raise ValueError( + f"The provided pangenome file was created by PPanGGOLiN version {file_version}, which is " + f"incompatible with the current PPanGGOLiN version {current_version}." + ) def find_consecutive_sequences(sequence: List[int]) -> List[List[int]]: """ Find consecutive sequences in a list of integers. - + :param sequence: The input list of integers. - + :return: A list of lists containing consecutive sequences of integers. """ s_sequence = sorted(sequence) @@ -1177,39 +1418,47 @@ def find_consecutive_sequences(sequence: List[int]) -> List[List[int]]: return consecutive_sequences -def find_region_border_position(region_positions: List[int], contig_gene_count: int) -> Tuple[int, int]: +def find_region_border_position( + region_positions: List[int], contig_gene_count: int +) -> Tuple[int, int]: """ Find the start and stop integers of the region considering circularity of the contig. - + :param region_positions: List of positions that belong to the region. :param contig_gene_count: Number of gene in the contig. The contig is considered circular. - + :return: A tuple containing the start and stop integers of the region. """ - consecutive_region_positions = get_consecutive_region_positions(region_positions, contig_gene_count) + consecutive_region_positions = get_consecutive_region_positions( + region_positions, contig_gene_count + ) return consecutive_region_positions[0][0], consecutive_region_positions[-1][-1] -def get_consecutive_region_positions(region_positions: List[int], contig_gene_count: int) -> List[List[int]]: +def get_consecutive_region_positions( + region_positions: List[int], contig_gene_count: int +) -> List[List[int]]: """ Order integers position of the region considering circularity of the contig. - + :param region_positions: List of positions that belong to the region. :param contig_gene_count: Number of gene in the contig. The contig is considered circular. - + :return: An ordered list of integers of the region. - + :raises ValueError: If unexpected conditions are encountered. """ if len(region_positions) == 0: - raise ValueError('Region has no position. This is unexpected.') + raise ValueError("Region has no position. This is unexpected.") consecutive_sequences = sorted(find_consecutive_sequences(region_positions)) if len(consecutive_sequences) == 0: - raise ValueError('No consecutive sequences found in the region. This is unexpected.') + raise ValueError( + "No consecutive sequences found in the region. This is unexpected." + ) elif len(consecutive_sequences) == 1: return consecutive_sequences @@ -1217,22 +1466,32 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co elif len(consecutive_sequences) == 2: # Check for overlaps at the edge of the contig if consecutive_sequences[0][0] != 0: - raise ValueError(f'Two sequences of consecutive positions ({consecutive_sequences}) ' - f'indicate an overlap on the edge of the contig, but neither starts at the beginning of the contig (0).') + raise ValueError( + f"Two sequences of consecutive positions ({consecutive_sequences}) " + f"indicate an overlap on the edge of the contig, but neither starts at the beginning of the contig (0)." + ) elif consecutive_sequences[-1][-1] != contig_gene_count - 1: - raise ValueError(f'Two sequences of consecutive positions ({consecutive_sequences}) ' - f'indicate an overlap on the edge of the contig, but neither ends at the end of the contig ({contig_gene_count - 1}).') + raise ValueError( + f"Two sequences of consecutive positions ({consecutive_sequences}) " + f"indicate an overlap on the edge of the contig, but neither ends at the end of the contig ({contig_gene_count - 1})." + ) return [consecutive_sequences[-1], consecutive_sequences[0]] elif len(consecutive_sequences) > 2: - raise ValueError(f'More than two consecutive sequences found ({len(consecutive_sequences)}). ' - f'This is unexpected. Consecutive sequences: {consecutive_sequences}. ' - 'The region should consist of consecutive positions along the contig.') - - -def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + raise ValueError( + f"More than two consecutive sequences found ({len(consecutive_sequences)}). " + f"This is unexpected. Consecutive sequences: {consecutive_sequences}. " + "The region should consist of consecutive positions along the contig." + ) + + +def run_subprocess( + cmd: List[str], + output: Path = None, + msg: str = "Subprocess failed with the following error:\n", +): """Run a subprocess command and write the output to the given path. :param cmd: list of program arguments @@ -1252,11 +1511,10 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f raise Exception(msg + subprocess_err.stderr) else: if output is not None: - with open(output, 'w') as fout: + with open(output, "w") as fout: fout.write(result.stdout) - def has_non_ascii(string_to_test: str) -> bool: """ Check if a string contains any non-ASCII characters. @@ -1265,11 +1523,12 @@ def has_non_ascii(string_to_test: str) -> bool: :return: True if the string contains non-ASCII characters, False otherwise. """ try: - string_to_test.encode('ascii') + string_to_test.encode("ascii") except UnicodeEncodeError: return True return False + def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str: """ Replace all non-ASCII characters in a string with a specified replacement string. @@ -1278,4 +1537,4 @@ def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> :param replacement_string: The string to replace non-ASCII characters with (default is '_'). :return: A new string where all non-ASCII characters have been replaced. """ - return re.sub(r'[^\x00-\x7F]+', replacement_string, string_with_ascii) + return re.sub(r"[^\x00-\x7F]+", replacement_string, string_with_ascii) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index f3b8d922..3ce60b7c 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -10,9 +10,18 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import mk_file_name, mk_outdir, check_option_workflow, restricted_float -from ppanggolin.annotate.annotate import annotate_pangenome, read_annotations, get_gene_sequences_from_fastas, \ - check_annotate_args +from ppanggolin.utils import ( + mk_file_name, + mk_outdir, + check_option_workflow, + restricted_float, +) +from ppanggolin.annotate.annotate import ( + annotate_pangenome, + read_annotations, + get_gene_sequences_from_fastas, + check_annotate_args, +) from ppanggolin.cluster.cluster import clustering, read_clustering from ppanggolin.graph.makeGraph import compute_neighbors_graph from ppanggolin.nem.rarefaction import make_rarefaction_curve @@ -31,8 +40,9 @@ """a global workflow that does everything in one go.""" -def launch_workflow(args: argparse.Namespace, panrgp: bool = True, - panmodule: bool = True): +def launch_workflow( + args: argparse.Namespace, panrgp: bool = True, panmodule: bool = True +): """ Unified function to launch ppanggolin workflow. @@ -47,39 +57,70 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, filename = mk_file_name(args.basename, args.output, args.force) - writing_time, anno_time, clust_time, mod_time, desc_time = (None, None, None, None, None) + writing_time, anno_time, clust_time, mod_time, desc_time = ( + None, + None, + None, + None, + None, + ) if args.anno: # if the annotations are provided, we read from it start_anno = time.time() - read_annotations(pangenome, args.anno, pseudo=args.annotate.use_pseudo, - cpu=args.annotate.cpu, translation_table=args.annotate.translation_table, - disable_bar=args.disable_prog_bar) + read_annotations( + pangenome, + args.anno, + pseudo=args.annotate.use_pseudo, + cpu=args.annotate.cpu, + translation_table=args.annotate.translation_table, + disable_bar=args.disable_prog_bar, + ) anno_time = time.time() - start_anno start_writing = time.time() - write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) + write_pangenome( + pangenome, filename, args.force, disable_bar=args.disable_prog_bar + ) writing_time = time.time() - start_writing if args.clusters is not None: start_clust = time.time() - read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, - code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir, - keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar) + read_clustering( + pangenome, + args.clusters, + infer_singleton=args.cluster.infer_singletons, + code=args.cluster.translation_table, + cpu=args.cluster.cpu, + tmpdir=args.tmpdir, + keep_tmp=args.cluster.keep_tmp, + force=args.force, + disable_bar=args.disable_prog_bar, + ) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: - raise Exception("The gff/gbff provided did not have any sequence information, " - "you did not provide clusters and you did not provide fasta file. " - "Thus, we do not have the information we need to continue the analysis.") + raise Exception( + "The gff/gbff provided did not have any sequence information, " + "you did not provide clusters and you did not provide fasta file. " + "Thus, we do not have the information we need to continue the analysis." + ) else: get_gene_sequences_from_fastas(pangenome, args.fasta) start_clust = time.time() - clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cluster.cpu, force=args.force, - disable_bar=args.disable_prog_bar, - defrag=not args.cluster.no_defrag, code=args.cluster.translation_table, - coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=args.cluster.keep_tmp) + clustering( + pangenome, + tmpdir=args.tmpdir, + cpu=args.cluster.cpu, + force=args.force, + disable_bar=args.disable_prog_bar, + defrag=not args.cluster.no_defrag, + code=args.cluster.translation_table, + coverage=args.cluster.coverage, + identity=args.cluster.identity, + mode=args.cluster.mode, + keep_tmp_files=args.cluster.keep_tmp, + ) clust_time = time.time() - start_clust elif args.fasta is not None: @@ -92,51 +133,73 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, raise argparse.ArgumentError(argument=None, message=message) start_anno = time.time() - annotate_pangenome(pangenome, args.fasta, tmpdir=args.tmpdir, cpu=args.annotate.cpu, - disable_bar=args.disable_prog_bar, - procedure=args.annotate.prodigal_procedure, - translation_table=args.annotate.translation_table, kingdom=args.annotate.kingdom, - norna=args.annotate.norna, - allow_overlap=args.annotate.allow_overlap) + annotate_pangenome( + pangenome, + args.fasta, + tmpdir=args.tmpdir, + cpu=args.annotate.cpu, + disable_bar=args.disable_prog_bar, + procedure=args.annotate.prodigal_procedure, + translation_table=args.annotate.translation_table, + kingdom=args.annotate.kingdom, + norna=args.annotate.norna, + allow_overlap=args.annotate.allow_overlap, + ) anno_time = time.time() - start_anno start_writing = time.time() - write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) + write_pangenome( + pangenome, filename, args.force, disable_bar=args.disable_prog_bar + ) writing_time = time.time() - start_writing start_clust = time.time() - clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cluster.cpu, force=args.force, - disable_bar=args.disable_prog_bar, defrag=not args.cluster.no_defrag, - code=args.cluster.translation_table, coverage=args.cluster.coverage, - identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=args.cluster.keep_tmp) + clustering( + pangenome, + tmpdir=args.tmpdir, + cpu=args.cluster.cpu, + force=args.force, + disable_bar=args.disable_prog_bar, + defrag=not args.cluster.no_defrag, + code=args.cluster.translation_table, + coverage=args.cluster.coverage, + identity=args.cluster.identity, + mode=args.cluster.mode, + keep_tmp_files=args.cluster.keep_tmp, + ) clust_time = time.time() - start_clust write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) start_graph = time.time() - compute_neighbors_graph(pangenome, args.graph.remove_high_copy_number, args.force, - disable_bar=args.disable_prog_bar) + compute_neighbors_graph( + pangenome, + args.graph.remove_high_copy_number, + args.force, + disable_bar=args.disable_prog_bar, + ) graph_time = time.time() - start_graph start_part = time.time() - partition(pangenome, - tmpdir=args.tmpdir, - output=args.output, - beta=args.partition.beta, - sm_degree=args.partition.max_degree_smoothing, - free_dispersion=args.partition.free_dispersion, - chunk_size=args.partition.chunk_size, - kval=args.partition.nb_of_partitions, - krange=args.partition.krange, - icl_margin=args.partition.ICL_margin, - draw_icl=args.partition.draw_ICL, - seed=args.partition.seed, - keep_tmp_files=args.partition.keep_tmp_files, - cpu=args.partition.cpu, - force=args.force, - disable_bar=args.disable_prog_bar) + partition( + pangenome, + tmpdir=args.tmpdir, + output=args.output, + beta=args.partition.beta, + sm_degree=args.partition.max_degree_smoothing, + free_dispersion=args.partition.free_dispersion, + chunk_size=args.partition.chunk_size, + kval=args.partition.nb_of_partitions, + krange=args.partition.krange, + icl_margin=args.partition.ICL_margin, + draw_icl=args.partition.draw_ICL, + seed=args.partition.seed, + keep_tmp_files=args.partition.keep_tmp_files, + cpu=args.partition.cpu, + force=args.force, + disable_bar=args.disable_prog_bar, + ) part_time = time.time() - start_part start_writing = time.time() @@ -146,23 +209,44 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, regions_time, spot_time = (0, 0) if panrgp: start_regions = time.time() - predict_rgp(pangenome, persistent_penalty=args.rgp.persistent_penalty, variable_gain=args.rgp.variable_gain, - min_length=args.rgp.min_length, min_score=args.rgp.min_score, dup_margin=args.rgp.dup_margin, - force=args.force, disable_bar=args.disable_prog_bar) + predict_rgp( + pangenome, + persistent_penalty=args.rgp.persistent_penalty, + variable_gain=args.rgp.variable_gain, + min_length=args.rgp.min_length, + min_score=args.rgp.min_score, + dup_margin=args.rgp.dup_margin, + force=args.force, + disable_bar=args.disable_prog_bar, + ) regions_time = time.time() - start_regions start_spots = time.time() - predict_hotspots(pangenome, args.output, force=args.force, spot_graph=args.spot.spot_graph, - overlapping_match=args.spot.overlapping_match, set_size=args.spot.set_size, - exact_match=args.spot.exact_match_size, disable_bar=args.disable_prog_bar) + predict_hotspots( + pangenome, + args.output, + force=args.force, + spot_graph=args.spot.spot_graph, + overlapping_match=args.spot.overlapping_match, + set_size=args.spot.set_size, + exact_match=args.spot.exact_match_size, + disable_bar=args.disable_prog_bar, + ) spot_time = time.time() - start_spots if panmodule: start_mods = time.time() - predict_modules(pangenome=pangenome, dup_margin=args.module.dup_margin, size=args.module.size, - min_presence=args.module.min_presence, transitive=args.module.transitive, - jaccard=args.module.jaccard, force=args.force, disable_bar=args.disable_prog_bar) + predict_modules( + pangenome=pangenome, + dup_margin=args.module.dup_margin, + size=args.module.size, + min_presence=args.module.min_presence, + transitive=args.module.transitive, + jaccard=args.module.jaccard, + force=args.force, + disable_bar=args.disable_prog_bar, + ) mod_time = time.time() - start_mods @@ -171,116 +255,201 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, writing_time = writing_time + time.time() - start_writing if args.rarefaction_flag: - make_rarefaction_curve(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, - beta=args.rarefaction.beta, - depth=args.rarefaction.depth, - min_sampling=args.rarefaction.min, - max_sampling=args.rarefaction.max, - sm_degree=args.rarefaction.max_degree_smoothing, - free_dispersion=args.rarefaction.free_dispersion, - chunk_size=args.rarefaction.chunk_size, - kval=args.rarefaction.nb_of_partitions, - krange=args.rarefaction.krange, - seed=args.rarefaction.seed, - kestimate=args.rarefaction.reestimate_K, - soft_core=args.rarefaction.soft_core, - cpu=args.rarefaction.cpu, disable_bar=args.disable_prog_bar) + make_rarefaction_curve( + pangenome=pangenome, + output=args.output, + tmpdir=args.tmpdir, + beta=args.rarefaction.beta, + depth=args.rarefaction.depth, + min_sampling=args.rarefaction.min, + max_sampling=args.rarefaction.max, + sm_degree=args.rarefaction.max_degree_smoothing, + free_dispersion=args.rarefaction.free_dispersion, + chunk_size=args.rarefaction.chunk_size, + kval=args.rarefaction.nb_of_partitions, + krange=args.rarefaction.krange, + seed=args.rarefaction.seed, + kestimate=args.rarefaction.reestimate_K, + soft_core=args.rarefaction.soft_core, + cpu=args.rarefaction.cpu, + disable_bar=args.disable_prog_bar, + ) if not args.no_flat_files: if panrgp and args.draw.spots: start_spot_drawing = time.time() - mk_outdir(args.output / 'spot_figures', force=True) - draw_spots(pangenome=pangenome, output=args.output / 'spot_figures', spot_list='all', - disable_bar=args.disable_prog_bar) + mk_outdir(args.output / "spot_figures", force=True) + draw_spots( + pangenome=pangenome, + output=args.output / "spot_figures", + spot_list="all", + disable_bar=args.disable_prog_bar, + ) spot_time += time.time() - start_spot_drawing if args.draw.tile_plot: - if pangenome.number_of_organisms < 65000 or pangenome.number_of_gene_families < 65000: - nocloud = args.draw.nocloud if pangenome.number_of_organisms < 32767 or pangenome.number_of_gene_families < 32767 else True - draw_tile_plot(pangenome, args.output, nocloud=nocloud, disable_bar=args.disable_prog_bar, - draw_dendrogram=args.draw.add_dendrogram, add_metadata=True) + if ( + pangenome.number_of_organisms < 65000 + or pangenome.number_of_gene_families < 65000 + ): + nocloud = ( + args.draw.nocloud + if pangenome.number_of_organisms < 32767 + or pangenome.number_of_gene_families < 32767 + else True + ) + draw_tile_plot( + pangenome, + args.output, + nocloud=nocloud, + disable_bar=args.disable_prog_bar, + draw_dendrogram=args.draw.add_dendrogram, + add_metadata=True, + ) else: logging.getLogger("PPanGGOLiN").warning( - 'Tile plot output have been requested but there are too many genomes or families to produce a viewable tile plot.') + "Tile plot output have been requested but there are too many genomes or families to produce a viewable tile plot." + ) if args.draw.ucurve: - draw_ucurve(pangenome, args.output, disable_bar=args.disable_prog_bar, soft_core=args.draw.soft_core) + draw_ucurve( + pangenome, + args.output, + disable_bar=args.disable_prog_bar, + soft_core=args.draw.soft_core, + ) start_desc = time.time() - write_pangenome_arguments = ["gexf", "light_gexf", 'json', "csv", "Rtab", "stats", "partitions", "families_tsv"] + write_pangenome_arguments = [ + "gexf", + "light_gexf", + "json", + "csv", + "Rtab", + "stats", + "partitions", + "families_tsv", + ] # Check that we don't ask write to output something not computed. - borders, spots, spot_modules, modules, regions = (False, False, False, False, False) + borders, spots, spot_modules, modules, regions = ( + False, + False, + False, + False, + False, + ) if panmodule: modules = args.write_pangenome.modules - write_pangenome_arguments.append('modules') + write_pangenome_arguments.append("modules") if panrgp: borders, spots, regions = ( - args.write_pangenome.borders, args.write_pangenome.spots, args.write_pangenome.regions) + args.write_pangenome.borders, + args.write_pangenome.spots, + args.write_pangenome.regions, + ) write_pangenome_arguments += ["borders", "spots", "regions"] if panmodule and panrgp: spot_modules = args.write_pangenome.spot_modules - write_pangenome_arguments.append('spot_modules') + write_pangenome_arguments.append("spot_modules") # check that at least one output file is requested. if not write is not call. - if any(getattr(args.write_pangenome, arg) is True for arg in write_pangenome_arguments): + if any( + getattr(args.write_pangenome, arg) is True + for arg in write_pangenome_arguments + ): # some parameters are set to false because they have not been computed in this workflow - write_pangenome_flat_files(pangenome, args.output, cpu=args.write_pangenome.cpu, - disable_bar=args.disable_prog_bar, - soft_core=args.write_pangenome.soft_core, - dup_margin=args.write_pangenome.dup_margin, - csv=args.write_pangenome.csv, gene_pa=args.write_pangenome.Rtab, - gexf=args.write_pangenome.gexf, - light_gexf=args.write_pangenome.light_gexf, - stats=args.write_pangenome.stats, json=args.write_pangenome.json, - partitions=args.write_pangenome.partitions, - families_tsv=args.write_pangenome.families_tsv, regions=regions, - compress=args.write_pangenome.compress, - spot_modules=spot_modules, modules=modules, spots=spots, borders=borders) + write_pangenome_flat_files( + pangenome, + args.output, + cpu=args.write_pangenome.cpu, + disable_bar=args.disable_prog_bar, + soft_core=args.write_pangenome.soft_core, + dup_margin=args.write_pangenome.dup_margin, + csv=args.write_pangenome.csv, + gene_pa=args.write_pangenome.Rtab, + gexf=args.write_pangenome.gexf, + light_gexf=args.write_pangenome.light_gexf, + stats=args.write_pangenome.stats, + json=args.write_pangenome.json, + partitions=args.write_pangenome.partitions, + families_tsv=args.write_pangenome.families_tsv, + regions=regions, + compress=args.write_pangenome.compress, + spot_modules=spot_modules, + modules=modules, + spots=spots, + borders=borders, + ) else: logging.getLogger("PPanGGOLiN").info( - 'No flat file describing the pangenome has been requested in config file. ' - 'Writing output pangenome flat file is skipped.') - - write_genomes_arguments = ['proksee', "table", "gff"] - if any(getattr(args.write_genomes, arg) is True for arg in write_genomes_arguments): - write_flat_genome_files(pangenome, args.output, - proksee=args.write_genomes.proksee, - table=args.write_genomes.table, - gff=args.write_genomes.gff, - add_metadata=True, - compress=args.write_genomes.compress, - disable_bar=args.disable_prog_bar, cpu=args.write_genomes.cpu) + "No flat file describing the pangenome has been requested in config file. " + "Writing output pangenome flat file is skipped." + ) + + write_genomes_arguments = ["proksee", "table", "gff"] + if any( + getattr(args.write_genomes, arg) is True for arg in write_genomes_arguments + ): + write_flat_genome_files( + pangenome, + args.output, + proksee=args.write_genomes.proksee, + table=args.write_genomes.table, + gff=args.write_genomes.gff, + add_metadata=True, + compress=args.write_genomes.compress, + disable_bar=args.disable_prog_bar, + cpu=args.write_genomes.cpu, + ) else: logging.getLogger("PPanGGOLiN").info( - 'No flat file of genomes with pangenome annotation has been requested in config file. ' - 'Writing output genomes flat file is skipped.') + "No flat file of genomes with pangenome annotation has been requested in config file. " + "Writing output genomes flat file is skipped." + ) desc_time = time.time() - start_desc - logging.getLogger("PPanGGOLiN").info(f"Annotation took : {round(anno_time, 2)} seconds") - logging.getLogger("PPanGGOLiN").info(f"Clustering took : {round(clust_time, 2)} seconds") - logging.getLogger("PPanGGOLiN").info(f"Building the graph took : {round(graph_time, 2)} seconds") - logging.getLogger("PPanGGOLiN").info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"Annotation took : {round(anno_time, 2)} seconds" + ) + logging.getLogger("PPanGGOLiN").info( + f"Clustering took : {round(clust_time, 2)} seconds" + ) + logging.getLogger("PPanGGOLiN").info( + f"Building the graph took : {round(graph_time, 2)} seconds" + ) + logging.getLogger("PPanGGOLiN").info( + f"Partitioning the pangenome took : {round(part_time, 2)} seconds" + ) if panrgp: - logging.getLogger("PPanGGOLiN").info(f"Predicting RGP took : {round(regions_time, 2)} seconds") - logging.getLogger("PPanGGOLiN").info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"Predicting RGP took : {round(regions_time, 2)} seconds" + ) + logging.getLogger("PPanGGOLiN").info( + f"Gathering RGP into spots took : {round(spot_time, 2)} seconds" + ) if panmodule: - logging.getLogger("PPanGGOLiN").info(f"Predicting modules took : {round(mod_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"Predicting modules took : {round(mod_time, 2)} seconds" + ) - logging.getLogger("PPanGGOLiN").info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info( + f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds" + ) if not args.no_flat_files: logging.getLogger("PPanGGOLiN").info( - f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") + f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds" + ) print_info(filename, content=True) @@ -310,74 +479,157 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser def add_workflow_args(parser: argparse.ArgumentParser): """ - Parser for important arguments that can be changed in CLI. + Parser for important arguments that can be changed in CLI. Other (less important) arguments that are step specific can be changed in the config file. :param parser: parser for workflow argument """ date = time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) - required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") - - required.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the genome names, " - "and the fasta filepath of its genomic sequence(s) (the fastas can be compressed). " - "One line per genome. This option can be used alone.") - - required.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the genome names, and the gff filepath of " - "its annotations (the gffs can be compressed). One line per genome. " - "This option can be used alone IF the fasta sequences are in the gff files, " - "otherwise --fasta needs to be used.") - - required.add_argument("--clusters", required=False, type=Path, - help="a tab-separated file listing the cluster names, the gene IDs, " - "and optionally whether they are a fragment or not.") + required = parser.add_argument_group( + title="Input arguments", description="The possible input arguments :" + ) + + required.add_argument( + "--fasta", + required=False, + type=Path, + help="A tab-separated file listing the genome names, " + "and the fasta filepath of its genomic sequence(s) (the fastas can be compressed). " + "One line per genome. This option can be used alone.", + ) + + required.add_argument( + "--anno", + required=False, + type=Path, + help="A tab-separated file listing the genome names, and the gff filepath of " + "its annotations (the gffs can be compressed). One line per genome. " + "This option can be used alone IF the fasta sequences are in the gff files, " + "otherwise --fasta needs to be used.", + ) + + required.add_argument( + "--clusters", + required=False, + type=Path, + help="a tab-separated file listing the cluster names, the gene IDs, " + "and optionally whether they are a fragment or not.", + ) optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=Path, - default=Path(f'ppanggolin_output{date}_PID{str(os.getpid())}'), - help="Output directory") - - optional.add_argument("--basename", required=False, default="pangenome", - help="basename for the output file") - - optional.add_argument("--rarefaction", required=False, action="store_true", dest="rarefaction_flag", - help="Use to compute the rarefaction curves (WARNING: can be time consuming)") - - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - - optional.add_argument("--translation_table", required=False, type=int, default=11, - help="Translation table (genetic code) to use.") - - optional.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", - choices=["bacteria", "archaea"], - help="Kingdom to which the prokaryota belongs to, " - "to know which models to use for rRNA annotation.") - - optional.add_argument("--mode", required=False, default="1", choices=["0", "1", "2", "3"], - help="the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component)," - " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)") - - optional.add_argument("--coverage", required=False, type=restricted_float, default=0.8, - help="Minimal coverage of the alignment for two proteins to be in the same cluster") - - optional.add_argument("--identity", required=False, type=restricted_float, default=0.8, - help="Minimal identity percent for two proteins to be in the same cluster") - - optional.add_argument("--infer_singletons", required=False, action="store_true", - help="Use this option together with --clusters. " - "If a gene is not present in the provided clustering result file, " - "it will be assigned to its own unique cluster as a singleton.") - - optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, - help="Number of partitions to use. Must be at least 2. If under 2, " - "it will be detected automatically.") + optional.add_argument( + "-o", + "--output", + required=False, + type=Path, + default=Path(f"ppanggolin_output{date}_PID{str(os.getpid())}"), + help="Output directory", + ) + + optional.add_argument( + "--basename", + required=False, + default="pangenome", + help="basename for the output file", + ) + + optional.add_argument( + "--rarefaction", + required=False, + action="store_true", + dest="rarefaction_flag", + help="Use to compute the rarefaction curves (WARNING: can be time consuming)", + ) + + optional.add_argument( + "-c", + "--cpu", + required=False, + default=1, + type=int, + help="Number of available cpus", + ) + + optional.add_argument( + "--translation_table", + required=False, + type=int, + default=11, + help="Translation table (genetic code) to use.", + ) + + optional.add_argument( + "--kingdom", + required=False, + type=str.lower, + default="bacteria", + choices=["bacteria", "archaea"], + help="Kingdom to which the prokaryota belongs to, " + "to know which models to use for rRNA annotation.", + ) + + optional.add_argument( + "--mode", + required=False, + default="1", + choices=["0", "1", "2", "3"], + help="the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component)," + " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)", + ) + + optional.add_argument( + "--coverage", + required=False, + type=restricted_float, + default=0.8, + help="Minimal coverage of the alignment for two proteins to be in the same cluster", + ) + + optional.add_argument( + "--identity", + required=False, + type=restricted_float, + default=0.8, + help="Minimal identity percent for two proteins to be in the same cluster", + ) + + optional.add_argument( + "--infer_singletons", + required=False, + action="store_true", + help="Use this option together with --clusters. " + "If a gene is not present in the provided clustering result file, " + "it will be assigned to its own unique cluster as a singleton.", + ) + + optional.add_argument( + "-K", + "--nb_of_partitions", + required=False, + default=-1, + type=int, + help="Number of partitions to use. Must be at least 2. If under 2, " + "it will be detected automatically.", + ) # This ensures compatibility with workflows built with the old option "defrag" when it was not the default - optional.add_argument("--no_defrag", required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with their non-fragmented gene family.") - - optional.add_argument("--no_flat_files", required=False, action="store_true", - help="Generate only the HDF5 pangenome file.") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") + optional.add_argument( + "--no_defrag", + required=False, + action="store_true", + help="DO NOT Realign gene families to link fragments with their non-fragmented gene family.", + ) + + optional.add_argument( + "--no_flat_files", + required=False, + action="store_true", + help="Generate only the HDF5 pangenome file.", + ) + optional.add_argument( + "--tmpdir", + required=False, + type=str, + default=Path(tempfile.gettempdir()), + help="directory for storing temporary files", + ) diff --git a/ppanggolin/workflow/panModule.py b/ppanggolin/workflow/panModule.py index 9105da05..bac562a0 100644 --- a/ppanggolin/workflow/panModule.py +++ b/ppanggolin/workflow/panModule.py @@ -25,7 +25,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :param sub_parser : sub_parser for all command :return : parser arguments for all command """ - parser = sub_parser.add_parser("panmodule", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "panmodule", formatter_class=argparse.RawTextHelpFormatter + ) add_workflow_args(parser) diff --git a/ppanggolin/workflow/panRGP.py b/ppanggolin/workflow/panRGP.py index 7f024ec3..e2fd112a 100644 --- a/ppanggolin/workflow/panRGP.py +++ b/ppanggolin/workflow/panRGP.py @@ -26,7 +26,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :param sub_parser : sub_parser for all command :return : parser arguments for all command """ - parser = sub_parser.add_parser("panrgp", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "panrgp", formatter_class=argparse.RawTextHelpFormatter + ) add_workflow_args(parser) diff --git a/ppanggolin/workflow/workflow.py b/ppanggolin/workflow/workflow.py index bf2e44e4..269eab2f 100644 --- a/ppanggolin/workflow/workflow.py +++ b/ppanggolin/workflow/workflow.py @@ -8,6 +8,7 @@ """ a global workflow that does everything in one go. """ + def launch(args: argparse.Namespace): """ Command launcher @@ -24,7 +25,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :param sub_parser : sub_parser for all command :return : parser arguments for all command """ - parser = sub_parser.add_parser("workflow", formatter_class=argparse.RawTextHelpFormatter) + parser = sub_parser.add_parser( + "workflow", formatter_class=argparse.RawTextHelpFormatter + ) add_workflow_args(parser) diff --git a/pyproject.toml b/pyproject.toml index 3707014e..8c8e101e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ doc = [ "sphinxcontrib.mermaid==0.9.2", ] test = [ - "pytest==7" + "pytest==7", + "black==24.*" ] python_deps = [ "tqdm==4.*", diff --git a/setup.py b/setup.py index 5b922969..b762db14 100755 --- a/setup.py +++ b/setup.py @@ -7,16 +7,20 @@ setup( ext_modules=[ Extension( - extra_compile_args=['-fcommon', '-Wno-int-conversion'], + extra_compile_args=["-fcommon", "-Wno-int-conversion"], name="nem_stats", - sources=[NEM_DIR_PATH + 'nem_stats.pyx', - NEM_DIR_PATH + 'nem_exe.c', - NEM_DIR_PATH + 'nem_alg.c', - NEM_DIR_PATH + 'nem_nei.c', - NEM_DIR_PATH + 'nem_mod.c', - NEM_DIR_PATH + 'nem_rnd.c', - NEM_DIR_PATH + 'lib_io.c', - NEM_DIR_PATH + 'nem_hlp.c', - NEM_DIR_PATH + 'genmemo.c'], - include_dirs=[NEM_DIR_PATH])] + sources=[ + NEM_DIR_PATH + "nem_stats.pyx", + NEM_DIR_PATH + "nem_exe.c", + NEM_DIR_PATH + "nem_alg.c", + NEM_DIR_PATH + "nem_nei.c", + NEM_DIR_PATH + "nem_mod.c", + NEM_DIR_PATH + "nem_rnd.c", + NEM_DIR_PATH + "lib_io.c", + NEM_DIR_PATH + "nem_hlp.c", + NEM_DIR_PATH + "genmemo.c", + ], + include_dirs=[NEM_DIR_PATH], + ) + ] ) diff --git a/testingDataset/compare_results.py b/testingDataset/compare_results.py index f33a05c0..d6d6f217 100644 --- a/testingDataset/compare_results.py +++ b/testingDataset/compare_results.py @@ -19,11 +19,12 @@ def ordered(obj): return sorted(ordered(x) for x in obj) else: return obj - + + def read_json_file(json_file): proper_open_1 = gzip.open if json_file.suffix == ".gz" else open - with proper_open_1(json_file.as_posix(), 'rt') as f1: + with proper_open_1(json_file.as_posix(), "rt") as f1: return json.load(f1) @@ -32,7 +33,7 @@ def are_json_files_identical(file1, file2): data1 = read_json_file(file1) data2 = read_json_file(file2) # Load data from the second file - + return ordered(data1) == ordered(data2), [] @@ -48,57 +49,96 @@ def are_text_files_identical(expected_file, file2, outdir=Path("./")): proper_open_e = gzip.open if expected_file.suffix == ".gz" else open proper_open_t = gzip.open if file2.suffix == ".gz" else open - with proper_open_e(expected_file, 'rt') as f1, proper_open_t(file2, 'rt') as f2: + with proper_open_e(expected_file, "rt") as f1, proper_open_t(file2, "rt") as f2: f1_content, f2_content = sorted(f1.readlines()), sorted(f2.readlines()) f1_line_count = len(f1_content) f2_line_count = len(f2_content) - diff = difflib.unified_diff(f1_content, f2_content, fromfile=expected_file.as_posix(), tofile=file2.as_posix()) + diff = difflib.unified_diff( + f1_content, + f2_content, + fromfile=expected_file.as_posix(), + tofile=file2.as_posix(), + ) diff = [line.rstrip() for line in diff] if len(diff) == 0: return True, diff_details - - diff_event_counter = {"+":0, "-":0} - + + diff_event_counter = {"+": 0, "-": 0} + diff_file = outdir / f"{common_file}.diff" - with open(diff_file, 'w') as out: - out.write('\n'.join(diff) + '\n') + with open(diff_file, "w") as out: + out.write("\n".join(diff) + "\n") # gather stat on diff - diff_event_counter = Counter((line[0] for line in diff[2:] if line[0] in ['+', '-'])) - + diff_event_counter = Counter( + (line[0] for line in diff[2:] if line[0] in ["+", "-"]) + ) + prct_inserted = 100 * diff_event_counter["+"] / f1_line_count prct_deleted = 100 * diff_event_counter["-"] / f2_line_count diff_prct = max(prct_inserted, prct_deleted) - diff_details.append(f'{diff_event_counter["+"]} insertions(+), {diff_event_counter["-"]} deletions(-) - {diff_prct:.1f}% difference') - diff_details.append(f'Check out diff in {diff_file}') + diff_details.append( + f'{diff_event_counter["+"]} insertions(+), {diff_event_counter["-"]} deletions(-) - {diff_prct:.1f}% difference' + ) + diff_details.append(f"Check out diff in {diff_file}") # vs code command to diff the files - # diff_details.append(f'code --diff {expected_file} {file2}') + # diff_details.append(f'code --diff {expected_file} {file2}') return False, diff_details + def add_subdir_to_files(subdir, files): - return [os.path.join(subdir, file) for file in files] + return [os.path.join(subdir, file) for file in files] + def compare_dir_recursively(expected_dir, tested_dir, ignored_files): dcmp_result = filecmp.dircmp(expected_dir, tested_dir, ignore=ignored_files) - for subdir in dcmp_result.common_dirs: - sub_dcmp_result = compare_dir_recursively(expected_dir/subdir, tested_dir/subdir, ignored_files) - for files_attr in ['right_only', 'left_only', 'common_files', "same_files", "diff_files"]: + for subdir in dcmp_result.common_dirs: + sub_dcmp_result = compare_dir_recursively( + expected_dir / subdir, tested_dir / subdir, ignored_files + ) + for files_attr in [ + "right_only", + "left_only", + "common_files", + "same_files", + "diff_files", + ]: files_list = getattr(dcmp_result, files_attr) - files_list += add_subdir_to_files(subdir, getattr(sub_dcmp_result, files_attr)) - + files_list += add_subdir_to_files( + subdir, getattr(sub_dcmp_result, files_attr) + ) + return dcmp_result + def get_suffix_except_gz(path: Path): for ext in path.suffixes[::-1]: if ext != ".gz": return ext - -def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, report_identical_files, extension_to_compare=[".tsv", ".aln", ".json", '.gff', '.txt', '.csv', '.faa', '.fasta', ".yaml"]): + +def compare_directories( + expected_dir, + tested_dir, + ignored_files, + diff_outdir, + report_identical_files, + extension_to_compare=[ + ".tsv", + ".aln", + ".json", + ".gff", + ".txt", + ".csv", + ".faa", + ".fasta", + ".yaml", + ], +): # Define directory information with color expected_dir_info = f"- Expected directory: {expected_dir}" tested_dir_info = f"- Tested directory: {tested_dir}" @@ -106,11 +146,17 @@ def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, re # Create the panel print("\n===Comparison of Directories===") print("\n".join([expected_dir_info, tested_dir_info])) - print('===============================') + print("===============================") # Compare directories - dcmp = compare_dir_recursively(expected_dir, tested_dir, ignored_files=ignored_files) - ignored_files_ext = [common_file for common_file in dcmp.common_files if get_suffix_except_gz(Path(common_file)) not in extension_to_compare] + dcmp = compare_dir_recursively( + expected_dir, tested_dir, ignored_files=ignored_files + ) + ignored_files_ext = [ + common_file + for common_file in dcmp.common_files + if get_suffix_except_gz(Path(common_file)) not in extension_to_compare + ] if ignored_files: print("\nFiles ignored for comparison:") for ignored_file in ignored_files: @@ -135,8 +181,7 @@ def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, re different_files = [] identical_files = [] - - files_to_compare = list( set(dcmp.common_files) - set(ignored_files_ext)) + files_to_compare = list(set(dcmp.common_files) - set(ignored_files_ext)) for common_file in files_to_compare: file1 = expected_dir / common_file @@ -145,7 +190,9 @@ def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, re if get_suffix_except_gz(Path(common_file)) == ".json": identical_tables, details = are_json_files_identical(file1, file2) else: - identical_tables, details = are_text_files_identical(file1, file2, outdir=diff_outdir) + identical_tables, details = are_text_files_identical( + file1, file2, outdir=diff_outdir + ) if identical_tables: identical_files.append(common_file) @@ -159,12 +206,11 @@ def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, re if different_files: print("\nDifferent files:") - for file, details in different_files: + for file, details in different_files: print(f" - {file}") for detail in details: print(f"{detail}") print() - # Generate summary report print("\nSummary:") @@ -182,34 +228,56 @@ def compare_directories(expected_dir, tested_dir, ignored_files, diff_outdir, re # Display different files count print(f"{len(different_files)} file(s) differ.") - - if different_files or dcmp.left_only or dcmp.right_only: - print('\nSome difference exist between the tested and the expected result directories') + print( + "\nSome difference exist between the tested and the expected result directories" + ) exit(1) else: - print('\nNo difference have been found between the tested and the expected result directories') + print( + "\nNo difference have been found between the tested and the expected result directories" + ) + def parse_arguments(): """Parse script arguments.""" - parser = ArgumentParser(description="...", - formatter_class=ArgumentDefaultsHelpFormatter) - - - parser.add_argument('-e', '--expected_dir', help="Expected result directory", required=True, type=Path) - - parser.add_argument('-t', '--tested_dir', help="Tested result directory", required=True, type=Path) - - parser.add_argument('-o', '--outdir', help="Directories where to write diff files", default='out_diff', type=Path) + parser = ArgumentParser( + description="...", formatter_class=ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "-e", + "--expected_dir", + help="Expected result directory", + required=True, + type=Path, + ) + + parser.add_argument( + "-t", "--tested_dir", help="Tested result directory", required=True, type=Path + ) + + parser.add_argument( + "-o", + "--outdir", + help="Directories where to write diff files", + default="out_diff", + type=Path, + ) + + parser.add_argument( + "-i", + "--ignored_files", + nargs="+", + help="File to ignore for the comparison", + default=["pangenomeGraph.json", "pangenomeGraph.json.gz"], + ) - parser.add_argument('-i', '--ignored_files', nargs="+", help="File to ignore for the comparison", default=['pangenomeGraph.json', "pangenomeGraph.json.gz"]) - - - args = parser.parse_args() return args + def main(): args = parse_arguments() @@ -217,12 +285,17 @@ def main(): report_identical_files = True logging.basicConfig(level=logging.INFO, format="%(message)s") - + Path.mkdir(args.outdir, exist_ok=True) + compare_directories( + expected_dir=args.expected_dir, + tested_dir=args.tested_dir, + diff_outdir=args.outdir, + ignored_files=args.ignored_files, + report_identical_files=report_identical_files, + ) - compare_directories(expected_dir=args.expected_dir, tested_dir=args.tested_dir, diff_outdir=args.outdir, ignored_files=args.ignored_files, report_identical_files=report_identical_files) -if __name__ == '__main__': +if __name__ == "__main__": main() - diff --git a/testingDataset/expected_info_files/myannopang_info.yaml b/testingDataset/expected_info_files/myannopang_info.yaml index 045c6205..3b7c1b5b 100644 --- a/testingDataset/expected_info_files/myannopang_info.yaml +++ b/testingDataset/expected_info_files/myannopang_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: false Spots_Predicted: false Modules_Predicted: false - PPanGGOLiN_Version: 2.1.2 + PPanGGOLiN_Version: 2.2.0 Content: Genes: 47986 diff --git a/testingDataset/expected_info_files/mybasicpangenome_info.yaml b/testingDataset/expected_info_files/mybasicpangenome_info.yaml index 466ddb63..b0bedf6f 100644 --- a/testingDataset/expected_info_files/mybasicpangenome_info.yaml +++ b/testingDataset/expected_info_files/mybasicpangenome_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.1.2 + PPanGGOLiN_Version: 2.2.0 Content: Genes: 45429 diff --git a/testingDataset/expected_info_files/stepbystep_info.yaml b/testingDataset/expected_info_files/stepbystep_info.yaml index ecebca05..ef4525fc 100644 --- a/testingDataset/expected_info_files/stepbystep_info.yaml +++ b/testingDataset/expected_info_files/stepbystep_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.1.2 + PPanGGOLiN_Version: 2.2.0 Content: Genes: 45429 diff --git a/testingDataset/launch_test_locally.py b/testingDataset/launch_test_locally.py index 4bac2eee..eaf4ba28 100644 --- a/testingDataset/launch_test_locally.py +++ b/testingDataset/launch_test_locally.py @@ -11,15 +11,17 @@ from pathlib import Path import yaml + def parse_yaml_file(yaml_file): # Load the YAML file - with open(yaml_file, 'r') as stream: + with open(yaml_file, "r") as stream: workflow = yaml.safe_load(stream) return workflow + def create_symbolic_links(source_dir, target_dir): # Convert paths to Path objects source_dir = Path(source_dir) @@ -38,31 +40,59 @@ def create_symbolic_links(source_dir, target_dir): except FileExistsError: pass + def parse_arguments(default_ci_yaml, testing_datadir): """Parse script arguments.""" - parser = ArgumentParser(description="...", - formatter_class=ArgumentDefaultsHelpFormatter) - - - parser.add_argument('--ci_yaml', help="increase output verbosity", default=default_ci_yaml, type=Path,) - - parser.add_argument('--data_dir', help="Directory where dataset files are located", default=testing_datadir, type=Path,) - - parser.add_argument('-o', '--outdir', help="increase output verbosity", default='local_CI', type=Path) - - parser.add_argument('-c', '--cpu', type=int, default=4, - help="Use this amount of cpu when number of cpu is specified in the command.") - - parser.add_argument("-v", "--verbose", help="increase output verbosity", - action="store_true") - - parser.add_argument('--skip_msa', action="store_true", - help="Skip msa command as it takes quite some time to complete.") - - - parser.add_argument('-f', '--force', action="store_true", - help="Force writing in output directory if exists.") - + parser = ArgumentParser( + description="...", formatter_class=ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--ci_yaml", + help="increase output verbosity", + default=default_ci_yaml, + type=Path, + ) + + parser.add_argument( + "--data_dir", + help="Directory where dataset files are located", + default=testing_datadir, + type=Path, + ) + + parser.add_argument( + "-o", + "--outdir", + help="increase output verbosity", + default="local_CI", + type=Path, + ) + + parser.add_argument( + "-c", + "--cpu", + type=int, + default=4, + help="Use this amount of cpu when number of cpu is specified in the command.", + ) + + parser.add_argument( + "-v", "--verbose", help="increase output verbosity", action="store_true" + ) + + parser.add_argument( + "--skip_msa", + action="store_true", + help="Skip msa command as it takes quite some time to complete.", + ) + + parser.add_argument( + "-f", + "--force", + action="store_true", + help="Force writing in output directory if exists.", + ) args = parser.parse_args() return args @@ -70,82 +100,82 @@ def parse_arguments(default_ci_yaml, testing_datadir): def main(): - script_path = Path(__file__).resolve() ppanggolin_main_dir = script_path.parent.parent default_ci_yaml = ppanggolin_main_dir / ".github/workflows/main.yml" testing_datadir = ppanggolin_main_dir / "testingDataset" - args = parse_arguments(default_ci_yaml, testing_datadir) if args.verbose: logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG) - logging.info('Mode verbose ON') + logging.info("Mode verbose ON") else: logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) - if not args.outdir.is_dir(): logging.debug(f"Create output directory {args.outdir.absolute().as_posix()}") Path.mkdir(args.outdir) - + elif not args.force: raise FileExistsError( - f"{args.outdir} already exists. Use -f if you want to overwrite the files in the directory") + f"{args.outdir} already exists. Use -f if you want to overwrite the files in the directory" + ) # setup test dir # execution_dir = args.outdir # / args.data_dir.name create_symbolic_links(args.data_dir, args.outdir) - workflow = parse_yaml_file(args.ci_yaml) - excluded_steps = ['Install ppanggolin', "Get core number on linux", "Get core number on macos"] + excluded_steps = [ + "Install ppanggolin", + "Get core number on linux", + "Get core number on macos", + ] - test_script = args.outdir / 'launch_test_command.sh' + test_script = args.outdir / "launch_test_command.sh" with open(test_script, "w") as fl: - - fl.write('#!/bin/bash\nset -e\n') - # Iterate over jobs and steps - for job in workflow['jobs'].values(): + fl.write("#!/bin/bash\nset -e\n") + + # Iterate over jobs and steps + for job in workflow["jobs"].values(): - if 'steps' not in job: + if "steps" not in job: continue - - for step in job['steps']: - if 'run' not in step: + + for step in job["steps"]: + if "run" not in step: continue - if step['name'] in excluded_steps: - logging.info(f'Ignoring: {step}') + if step["name"] in excluded_steps: + logging.info(f"Ignoring: {step}") continue # Execute the command line - command = step['run'].strip() + command = step["run"].strip() # process the command - command = command.replace('$NUM_CPUS', f"{args.cpu}") - command = command.replace('cd ', "# cd ") + command = command.replace("$NUM_CPUS", f"{args.cpu}") + command = command.replace("cd ", "# cd ") if args.skip_msa: - command = command.replace('ppanggolin msa', "# ppanggolin msa") + command = command.replace("ppanggolin msa", "# ppanggolin msa") if command == "pytest": command = f"pytest {ppanggolin_main_dir}" - - + # log the step name logging.info(f'Executing: {step["name"]}') logging.debug(f" {command}\n") - + # write the command in the script fl.write(f'\n# {step["name"]}\n\n') fl.write(command) - print(f"(cd {args.outdir}; bash {test_script.name})") -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/tests/align/test_align.py b/tests/align/test_align.py index 42d1cd5f..99840aff 100644 --- a/tests/align/test_align.py +++ b/tests/align/test_align.py @@ -7,76 +7,82 @@ @pytest.fixture def single_line_fasta_nt() -> List: - - return ['>Gene_1 seq_description\n', - 'ATGCGTTGTCGTTG\n', - ">Gene_2\n", - "TGTGACCTGCT\n" - ] + + return [ + ">Gene_1 seq_description\n", + "ATGCGTTGTCGTTG\n", + ">Gene_2\n", + "TGTGACCTGCT\n", + ] + @pytest.fixture def single_line_fasta_aa() -> List: - - return ['>Gene_1 seq_description\n', - 'YWTPRPFFYAAEYNN\n', - ">Gene_2\n", - "YWTPRPSYWTPAAEYNN\n" - ] + + return [ + ">Gene_1 seq_description\n", + "YWTPRPFFYAAEYNN\n", + ">Gene_2\n", + "YWTPRPSYWTPAAEYNN\n", + ] + @pytest.fixture def multi_line_fasta_nt() -> List: - - return ['>Gene_1 seq_description\n', - 'ATGCGT\n', - 'TGTCGTTG\n', - ">Gene_2\n", - "TGTGACCTGCT\n" - ] + + return [ + ">Gene_1 seq_description\n", + "ATGCGT\n", + "TGTCGTTG\n", + ">Gene_2\n", + "TGTGACCTGCT\n", + ] + @pytest.fixture def multi_line_fasta_aa() -> List: - - return ['>Gene_1 seq_description\n', - 'AAEYNN\n', - 'YWTPRPFFY\n', - ">Gene_2\n", - "YWTPRPS\n", - "YWTPAAEYNN\n" - ] + + return [ + ">Gene_1 seq_description\n", + "AAEYNN\n", + "YWTPRPFFY\n", + ">Gene_2\n", + "YWTPRPS\n", + "YWTPAAEYNN\n", + ] def test_get_seq_ids_single_line_nt(single_line_fasta_nt): - - + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_nt) assert len(seq_set) == 2 - assert seq_set == {'Gene_1', 'Gene_2'} + assert seq_set == {"Gene_1", "Gene_2"} assert is_nucleotide assert single_line_fasta + def test_get_seq_ids_single_line_aa(single_line_fasta_aa): seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_aa) assert len(seq_set) == 2 - assert seq_set == {'Gene_1', 'Gene_2'} + assert seq_set == {"Gene_1", "Gene_2"} assert not is_nucleotide assert single_line_fasta + def test_get_seq_ids_multi_line_nt(multi_line_fasta_nt): seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_nt) assert len(seq_set) == 2 - assert seq_set == {'Gene_1', 'Gene_2'} + assert seq_set == {"Gene_1", "Gene_2"} assert is_nucleotide assert not single_line_fasta + def test_get_seq_ids_multi_line_aa(multi_line_fasta_aa): seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_aa) assert len(seq_set) == 2 - assert seq_set == {'Gene_1', 'Gene_2'} + assert seq_set == {"Gene_1", "Gene_2"} assert not is_nucleotide assert not single_line_fasta - - - diff --git a/tests/annotate/test_annotate.py b/tests/annotate/test_annotate.py index e52868c2..377dab8f 100644 --- a/tests/annotate/test_annotate.py +++ b/tests/annotate/test_annotate.py @@ -2,34 +2,87 @@ from pathlib import Path from ppanggolin.genome import Contig -from ppanggolin.annotate.annotate import extract_positions, read_anno_file, parse_contig_header_lines, \ - parse_gbff_by_contig, parse_feature_lines, parse_dna_seq_lines, read_org_gbff, combine_contigs_metadata, \ - fix_partial_gene_coordinates, shift_start_coordinates, shift_end_coordinates +from ppanggolin.annotate.annotate import ( + extract_positions, + read_anno_file, + parse_contig_header_lines, + parse_gbff_by_contig, + parse_feature_lines, + parse_dna_seq_lines, + read_org_gbff, + combine_contigs_metadata, + fix_partial_gene_coordinates, + shift_start_coordinates, + shift_end_coordinates, +) + +from ppanggolin.annotate.synta import check_sequence_tuple, parse_fasta @pytest.mark.parametrize( - "input_string, expected_positions, expected_complement, expected_partialgene_start, expected_partialgene_end", [ - ("join(190..7695,7695..12071)", [(190, 7695), (7695, 12071)], False, False, False), - ("order(190..7695,7995..12071)", [(190, 7695), (7995, 12071)], False, False, False), - ("complement(join(4359800..4360707,4360707..4360962,1..100))", - [(4359800, 4360707), (4360707, 4360962), (1, 100)], - True, False, False), - ("complement(order(4359800..4360707,4360707..4360962,1..100))", - [(4359800, 4360707), (4360707, 4360962), (1, 100)], - True, False, False), - ("join(6835405..6835731,1..1218)", [(6835405, 6835731), (1, 1218)], False, False, False), - ("join(1375484..1375555,1375557..1376579)", [(1375484, 1375555), (1375557, 1376579)], False, False, False), + "input_string, expected_positions, expected_complement, expected_partialgene_start, expected_partialgene_end", + [ + ( + "join(190..7695,7695..12071)", + [(190, 7695), (7695, 12071)], + False, + False, + False, + ), + ( + "order(190..7695,7995..12071)", + [(190, 7695), (7995, 12071)], + False, + False, + False, + ), + ( + "complement(join(4359800..4360707,4360707..4360962,1..100))", + [(4359800, 4360707), (4360707, 4360962), (1, 100)], + True, + False, + False, + ), + ( + "complement(order(4359800..4360707,4360707..4360962,1..100))", + [(4359800, 4360707), (4360707, 4360962), (1, 100)], + True, + False, + False, + ), + ( + "join(6835405..6835731,1..1218)", + [(6835405, 6835731), (1, 1218)], + False, + False, + False, + ), + ( + "join(1375484..1375555,1375557..1376579)", + [(1375484, 1375555), (1375557, 1376579)], + False, + False, + False, + ), ("complement(6815492..6816265)", [(6815492, 6816265)], True, False, False), ("6811501..6812109", [(6811501, 6812109)], False, False, False), ("complement(6792573..>6795461)", [(6792573, 6795461)], True, False, True), ("complement(<6792573..6795461)", [(6792573, 6795461)], True, True, False), ("complement(<6792573..>6795461)", [(6792573, 6795461)], True, True, True), ("join(1038313,1..1016)", [(1038313, 1038313), (1, 1016)], False, False, False), - ("1038313", [(1038313, 1038313)], False, False, False) - ]) -def test_extract_positions(input_string, expected_positions, expected_complement, expected_partialgene_start, - expected_partialgene_end): - positions, is_complement, has_partial_start, has_partial_end = extract_positions(input_string) + ("1038313", [(1038313, 1038313)], False, False, False), + ], +) +def test_extract_positions( + input_string, + expected_positions, + expected_complement, + expected_partialgene_start, + expected_partialgene_end, +): + positions, is_complement, has_partial_start, has_partial_end = extract_positions( + input_string + ) assert positions == expected_positions assert is_complement == expected_complement assert has_partial_start == expected_partialgene_start @@ -43,23 +96,33 @@ def test_extract_positions_with_wrong_positions_format(): def test_extract_positions_with_strange_chevrons(): with pytest.raises(ValueError): - extract_positions("complement(join(4359800..>4360707,1..100))") # chevron in inner position + extract_positions( + "complement(join(4359800..>4360707,1..100))" + ) # chevron in inner position with pytest.raises(ValueError): - extract_positions("complement(join(4359800..4360707,<1..100))") # chevron in inner position + extract_positions( + "complement(join(4359800..4360707,<1..100))" + ) # chevron in inner position with pytest.raises(ValueError): - extract_positions("complement(join(4359800..4360707,1..<100))") # start chevron in ending position + extract_positions( + "complement(join(4359800..4360707,1..<100))" + ) # start chevron in ending position def test_extract_positions_with_wrong_positions_format2(): with pytest.raises(ValueError): extract_positions("start..stop") # start and stop are not integer with pytest.raises(ValueError): - extract_positions("complement(join(start..6816265, 1..stop))") # start and stop are not integer + extract_positions( + "complement(join(start..6816265, 1..stop))" + ) # start and stop are not integer with pytest.raises(ValueError): extract_positions("start..stop") # start and stop are not integer with pytest.raises(ValueError): - extract_positions("complement(join(start..6816265, 1..stop))") # start and stop are not integer + extract_positions( + "complement(join(start..6816265, 1..stop))" + ) # start and stop are not integer @pytest.fixture @@ -70,7 +133,10 @@ def genome_data(): script_path = Path(__file__).resolve() ppanggolin_main_dir = script_path.parent.parent.parent - genome_path = ppanggolin_main_dir / "testingDataset/GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz" + genome_path = ( + ppanggolin_main_dir + / "testingDataset/GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz" + ) circular_contigs = [] genome_name = "GCF_000026905" return genome_name, genome_path, circular_contigs @@ -84,7 +150,10 @@ def genome_data_with_joined_genes(): script_path = Path(__file__).resolve() ppanggolin_main_dir = script_path.parent.parent.parent - genome_path = ppanggolin_main_dir / "testingDataset/GBFF/GCF_002776845.1_ASM277684v1_genomic.gbff.gz" + genome_path = ( + ppanggolin_main_dir + / "testingDataset/GBFF/GCF_002776845.1_ASM277684v1_genomic.gbff.gz" + ) circular_contigs = [] genome_name = "GCF_002776845" return genome_name, genome_path, circular_contigs @@ -97,7 +166,9 @@ def test_read_anno_file(genome_data): genome_name, genome_path, circular_contigs = genome_data use_pseudogene = False - genome, has_sequence = read_anno_file(genome_name, genome_path, circular_contigs, use_pseudogene) + genome, has_sequence = read_anno_file( + genome_name, genome_path, circular_contigs, use_pseudogene + ) assert has_sequence is True assert genome.name == genome_name @@ -112,7 +183,9 @@ def test_read_anno_file_with_pseudo_enable(genome_data): genome_name, genome_path, circular_contigs = genome_data use_pseudogene = True - genome, has_sequence = read_anno_file(genome_name, genome_path, circular_contigs, use_pseudogene) + genome, has_sequence = read_anno_file( + genome_name, genome_path, circular_contigs, use_pseudogene + ) assert has_sequence is True assert genome.name == genome_name @@ -123,17 +196,21 @@ def test_read_anno_file_with_pseudo_enable(genome_data): def test_with_joined_genes(genome_data_with_joined_genes): genome_name, genome_path, circular_contigs = genome_data_with_joined_genes use_pseudogene = True - genome, _ = read_anno_file(genome_name, genome_path, circular_contigs, use_pseudogene) + genome, _ = read_anno_file( + genome_name, genome_path, circular_contigs, use_pseudogene + ) - # this genome has 2 genes that are joined. + # this genome has 2 genes that are joined. assert genome.number_of_genes() == 917 def test_read_org_gbff(genome_data_with_joined_genes): genome_name, genome_path, circular_contigs = genome_data_with_joined_genes - genome, _ = read_org_gbff(genome_name, genome_path, circular_contigs, use_pseudogenes=True) + genome, _ = read_org_gbff( + genome_name, genome_path, circular_contigs, use_pseudogenes=True + ) - # this genome has 2 genes that are joined. + # this genome has 2 genes that are joined. assert genome.number_of_genes() == 917 @@ -155,7 +232,7 @@ def test_gbff_header_parser(): "DEFINITION": "Chlamydia trachomatis strain D/14-96 genome.", "VERSION": "NC_022109.1", "SOURCE": "Chlamydia trachomatis", - "ORGANISM": "Chlamydia trachomatis\nBacteria; Chlamydiae; Chlamydiales; Chlamydiaceae;\nChlamydia/Chlamydophila group; Chlamydia." + "ORGANISM": "Chlamydia trachomatis\nBacteria; Chlamydiae; Chlamydiales; Chlamydiaceae;\nChlamydia/Chlamydophila group; Chlamydia.", } @@ -215,196 +292,280 @@ def test_parse_gbff_by_contig(sample_gbff_path): # Check second contig header_2, feature_2, sequence_2 = contigs[1] assert len(header_2) == 5 - assert list(feature_2) == [{"feature_type": "source", "location": "1..2041595", - "organism": 'Chlamydia trachomatis', "mol_type": "genomic DNA"}] + assert list(feature_2) == [ + { + "feature_type": "source", + "location": "1..2041595", + "organism": "Chlamydia trachomatis", + "mol_type": "genomic DNA", + } + ] assert sequence_2 == "AAACCGGGTTCCAAATTTGGGGCCCCTTTT" # Define test cases -@pytest.mark.parametrize("input_lines, expected_output", [ - ([" gene 123..456", - " /locus_tag=ABC123", - " /note=Some note", - " CDS 789..1011", - " /protein_id=DEF456", - " /translation=ATGCTAGCATCG"], - [{"feature_type": "gene", "location": "123..456", "locus_tag": "ABC123", "note": "Some note"}, - {"feature_type": "CDS", "location": "789..1011", "protein_id": "DEF456", "translation": "ATGCTAGCATCG"}]), - ([" gene 123..456", - " /locus_tag=ABC123", - " /note=Some note", - " CDS 789..1011", - ' /protein_id="DEF456"', - ' /translation="ATGCTAGCATCG"', - " gene 789..1011", - " /locus_tag=DEF789", - " /note=Another note"], - [{"feature_type": "gene", "location": "123..456", "locus_tag": "ABC123", "note": "Some note"}, - {"feature_type": "CDS", "location": "789..1011", "protein_id": "DEF456", "translation": "ATGCTAGCATCG"}, - {"feature_type": "gene", "location": "789..1011", "locus_tag": "DEF789", "note": "Another note"}]), - # Add more test cases as needed -]) +@pytest.mark.parametrize( + "input_lines, expected_output", + [ + ( + [ + " gene 123..456", + " /locus_tag=ABC123", + " /note=Some note", + " CDS 789..1011", + " /protein_id=DEF456", + " /translation=ATGCTAGCATCG", + ], + [ + { + "feature_type": "gene", + "location": "123..456", + "locus_tag": "ABC123", + "note": "Some note", + }, + { + "feature_type": "CDS", + "location": "789..1011", + "protein_id": "DEF456", + "translation": "ATGCTAGCATCG", + }, + ], + ), + ( + [ + " gene 123..456", + " /locus_tag=ABC123", + " /note=Some note", + " CDS 789..1011", + ' /protein_id="DEF456"', + ' /translation="ATGCTAGCATCG"', + " gene 789..1011", + " /locus_tag=DEF789", + " /note=Another note", + ], + [ + { + "feature_type": "gene", + "location": "123..456", + "locus_tag": "ABC123", + "note": "Some note", + }, + { + "feature_type": "CDS", + "location": "789..1011", + "protein_id": "DEF456", + "translation": "ATGCTAGCATCG", + }, + { + "feature_type": "gene", + "location": "789..1011", + "locus_tag": "DEF789", + "note": "Another note", + }, + ], + ), + # Add more test cases as needed + ], +) def test_parse_feature_lines(input_lines, expected_output): assert list(parse_feature_lines(input_lines)) == expected_output def test_parse_dna_seq_lines(): - lines = [" 1 aaacc gggtt", - " 11 ccaaa tttgg", - " 21 ggccc ctttt"] + lines = [" 1 aaacc gggtt", " 11 ccaaa tttgg", " 21 ggccc ctttt"] assert parse_dna_seq_lines(lines) == "AAACCGGGTTCCAAATTTGGGGCCCCTTTT" def test_combine_contigs_metadata(): - contig1, contig2, contig3 = Contig(1, "contig1"), Contig(2, "contig2"), Contig(3, "contig3") + contig1, contig2, contig3 = ( + Contig(1, "contig1"), + Contig(2, "contig2"), + Contig(3, "contig3"), + ) contig_to_metadata = { contig1: {"sp": "spA", "strain": "123", "contig_feat": "ABC"}, contig2: {"sp": "spA", "strain": "123", "contig_feat": "XYZ"}, - contig3: {"sp": "spA", "strain": "123"} + contig3: {"sp": "spA", "strain": "123"}, } genome_metadata, contig_metadata = combine_contigs_metadata(contig_to_metadata) assert genome_metadata == {"sp": "spA", "strain": "123"} - assert contig_metadata == {contig1: {"contig_feat": "ABC"}, contig2: {"contig_feat": "XYZ"}, } - - -@pytest.mark.parametrize("coordinates, is_complement, start_shift, expected", [ - # Case 1: No partial start or end, expect no change in non-complement strand - # Coordinates are already correct, no need to modify anything. - ([(11, 40)], False, 0, [(11, 40)]), - - # Case 2: Partial start, no partial end (Non-complement) - # A shift of 1 is added to the start coordinate. - ([(10, 40)], False, 1, [(11, 40)]), # start_shift of 1 added to start - - # Case 2: Partial start, no partial end (Non-complement) - # A shift of 2 is added to the start coordinate. - ([(2, 33)], False, 2, [(4, 33)]), # start_shift of 2 added to start - - # Case 3: No partial start, partial end (Non-complement) - # Adjust last coordinate to make gene length a multiple of 3. - ([(11, 41)], False, 0, [(11, 40)]), # last end adjusted to be a multiple of 3 - - # Case 3: No partial start, partial end (Non-complement) - # Gene length is already a multiple of 3, so no changes needed. - ([(11, 40)], False, 0, [(11, 40)]), # gene length already a multiple of 3 so no change is made - - # Case 4: Partial start and end (Non-complement) - # Both start and end need adjustment: add shift to start and adjust end to make gene length a multiple of 3. - ([(10, 41)], False, 1, [(11, 40)]), # start_shift added to start, and length adjusted - - # Case 5: Partial start and no partial end on complement strand - # Adjust start since we are on the complement strand. - ([(9, 40)], True, 0, [(11, 40)]), # length adjusted - - # Case 5: No partial start but partial end on complement strand - # Shift removed from the last end on the complement strand. - ([(9, 40)], True, 2, [(9, 38)]), # start_shift removed - - # Case 5: Partial start and end on complement strand - # Adjust both start and end since we are on the complement strand, ensuring gene length is a multiple of 3. - ([(8, 40)], True, 2, [(9, 38)]), # start_shift removed and length adjusted - - # Case 5: Joined coordinates without partial start or end - # Nothing to adjust as the gene is properly framed. - ([(1, 9), (7, 12)], False, 0, [(1, 9), (7, 12)]), # nothing to do - - # Case 5: Joined coordinates with partial start - # Adjust the first start coordinate by the shift. - ([(3, 9), (7, 12)], False, 1, [(4, 9), (7, 12)]), # adjust start - - # Case 5: Joined coordinates with partial end - # Adjust the last end to ensure the gene length is a multiple of 3. - ([(3, 9), (7, 12)], False, 0, [(3, 9), (7, 11)]), # adjust end - - # Case 5: Joined coordinates with partial start and partial end - # Adjust both start and end for correct gene length and frame shift. - ([(3, 9), (7, 12)], False, 2, [(5, 9), (7, 10)]), # adjust start and end - - # Case 5: Joined coordinates with partial start and end on complement strand - # Adjust both start and end on the complement strand. - ([(4, 9), (7, 12)], True, 2, [(5, 9), (7, 10)]), # adjust start and end in complement mode - - # Real tricky case from GCF_000623275.1 - ([(4681814, 4682911), (1, 1)], False, 0, [(4681814, 4682911)]), - # ajust the end by removing one nt. In this case that remove the second part of the coordinates - - # Tricky case inspired by last case - ([(30, 60), (1, 1)], False, 0, [(30, 59)]), - # ajust the end by removing two nt. In this case that remove the second part of the coordinates and one nt in the first part - - # Tricky case inspired by last case - ([(60, 60), (1, 9)], False, 1, [(1, 9)]), - # ajust the end by removing one nt. In this case that remove the second part of the coordinates - - # Tricky case inspired by last case - ([(60, 60), (1, 10)], False, 2, [(2, 10)]), - # ajust the end by removing one nt. In this case that remove the second part of the coordinates - - # Very tricky case inspired by last case - ([(59, 60), (60, 60), (1, 9)], False, 3, [(1, 9)]), - # ajust the end by removing one nt. In this case that remove the second part of the coordinates + assert contig_metadata == { + contig1: {"contig_feat": "ABC"}, + contig2: {"contig_feat": "XYZ"}, + } - # Very tricky case inspired by last case - ([(60, 61), (1, 8)], True, 3, [(61, 61), (1, 5)]), # -]) -def test_fix_partial_gene_coordinates(coordinates, is_complement, start_shift, expected): +@pytest.mark.parametrize( + "coordinates, is_complement, start_shift, expected", + [ + # Case 1: No partial start or end, expect no change in non-complement strand + # Coordinates are already correct, no need to modify anything. + ([(11, 40)], False, 0, [(11, 40)]), + # Case 2: Partial start, no partial end (Non-complement) + # A shift of 1 is added to the start coordinate. + ([(10, 40)], False, 1, [(11, 40)]), # start_shift of 1 added to start + # Case 2: Partial start, no partial end (Non-complement) + # A shift of 2 is added to the start coordinate. + ([(2, 33)], False, 2, [(4, 33)]), # start_shift of 2 added to start + # Case 3: No partial start, partial end (Non-complement) + # Adjust last coordinate to make gene length a multiple of 3. + ([(11, 41)], False, 0, [(11, 40)]), # last end adjusted to be a multiple of 3 + # Case 3: No partial start, partial end (Non-complement) + # Gene length is already a multiple of 3, so no changes needed. + ( + [(11, 40)], + False, + 0, + [(11, 40)], + ), # gene length already a multiple of 3 so no change is made + # Case 4: Partial start and end (Non-complement) + # Both start and end need adjustment: add shift to start and adjust end to make gene length a multiple of 3. + ( + [(10, 41)], + False, + 1, + [(11, 40)], + ), # start_shift added to start, and length adjusted + # Case 5: Partial start and no partial end on complement strand + # Adjust start since we are on the complement strand. + ([(9, 40)], True, 0, [(11, 40)]), # length adjusted + # Case 5: No partial start but partial end on complement strand + # Shift removed from the last end on the complement strand. + ([(9, 40)], True, 2, [(9, 38)]), # start_shift removed + # Case 5: Partial start and end on complement strand + # Adjust both start and end since we are on the complement strand, ensuring gene length is a multiple of 3. + ([(8, 40)], True, 2, [(9, 38)]), # start_shift removed and length adjusted + # Case 5: Joined coordinates without partial start or end + # Nothing to adjust as the gene is properly framed. + ([(1, 9), (7, 12)], False, 0, [(1, 9), (7, 12)]), # nothing to do + # Case 5: Joined coordinates with partial start + # Adjust the first start coordinate by the shift. + ([(3, 9), (7, 12)], False, 1, [(4, 9), (7, 12)]), # adjust start + # Case 5: Joined coordinates with partial end + # Adjust the last end to ensure the gene length is a multiple of 3. + ([(3, 9), (7, 12)], False, 0, [(3, 9), (7, 11)]), # adjust end + # Case 5: Joined coordinates with partial start and partial end + # Adjust both start and end for correct gene length and frame shift. + ([(3, 9), (7, 12)], False, 2, [(5, 9), (7, 10)]), # adjust start and end + # Case 5: Joined coordinates with partial start and end on complement strand + # Adjust both start and end on the complement strand. + ( + [(4, 9), (7, 12)], + True, + 2, + [(5, 9), (7, 10)], + ), # adjust start and end in complement mode + # Real tricky case from GCF_000623275.1 + ([(4681814, 4682911), (1, 1)], False, 0, [(4681814, 4682911)]), + # ajust the end by removing one nt. In this case that remove the second part of the coordinates + # Tricky case inspired by last case + ([(30, 60), (1, 1)], False, 0, [(30, 59)]), + # ajust the end by removing two nt. In this case that remove the second part of the coordinates and one nt in the first part + # Tricky case inspired by last case + ([(60, 60), (1, 9)], False, 1, [(1, 9)]), + # ajust the end by removing one nt. In this case that remove the second part of the coordinates + # Tricky case inspired by last case + ([(60, 60), (1, 10)], False, 2, [(2, 10)]), + # ajust the end by removing one nt. In this case that remove the second part of the coordinates + # Very tricky case inspired by last case + ([(59, 60), (60, 60), (1, 9)], False, 3, [(1, 9)]), + # ajust the end by removing one nt. In this case that remove the second part of the coordinates + # Very tricky case inspired by last case + ([(60, 61), (1, 8)], True, 3, [(61, 61), (1, 5)]), # + ], +) +def test_fix_partial_gene_coordinates( + coordinates, is_complement, start_shift, expected +): result = fix_partial_gene_coordinates(coordinates, is_complement, start_shift) assert result == expected def test_fix_partial_gene_coordinates_with_wrong_coordinates(): with pytest.raises(ValueError): - fix_partial_gene_coordinates(coordinates=[(1, 1)], is_complement=False, - start_shift=0) # gene is too small, the length adjustement at the end lead to no gene + fix_partial_gene_coordinates( + coordinates=[(1, 1)], is_complement=False, start_shift=0 + ) # gene is too small, the length adjustement at the end lead to no gene with pytest.raises(ValueError): - fix_partial_gene_coordinates([(1, 1)], False, - 1) # gene is too small, the length adjustement at the start lead to no gene + fix_partial_gene_coordinates( + [(1, 1)], False, 1 + ) # gene is too small, the length adjustement at the start lead to no gene with pytest.raises(ValueError): - fix_partial_gene_coordinates([(60, 60), (1, 1)], False, - 1) # gene is too small, the length adjustement at the start and at the end lead to no gene + fix_partial_gene_coordinates( + [(60, 60), (1, 1)], False, 1 + ) # gene is too small, the length adjustement at the start and at the end lead to no gene with pytest.raises(ValueError): fix_partial_gene_coordinates([], False, 1) # chevron in inner position -@pytest.mark.parametrize("coordinates, shift, expected", [ +@pytest.mark.parametrize( + "coordinates, shift, expected", + [ + ([(11, 40)], 0, [(11, 40)]), + ([(1, 2)], 1, [(2, 2)]), + ([(1, 1), (1, 4)], 1, [(1, 4)]), + ([(1, 1), (1, 1), (1, 4)], 2, [(1, 4)]), + ([(1, 1), (1, 2), (1, 4)], 2, [(2, 2), (1, 4)]), + ], +) +def test_shift_start_coordinates(coordinates, shift, expected): + result = shift_start_coordinates(coordinates, shift) + assert result == expected - ([(11, 40)], 0, [(11, 40)]), - ([(1, 2)], 1, [(2, 2)]), +@pytest.mark.parametrize( + "coordinates, shift, expected", + [ + ([(11, 40)], 0, [(11, 40)]), + ([(1, 2)], 1, [(1, 1)]), + ([(1, 1), (1, 4)], 1, [(1, 1), (1, 3)]), + ([(1, 4), (4, 4), (4, 4)], 2, [(1, 4)]), + ([(18, 18), (1, 4)], 4, [(18, 18)]), + ], +) +def test_shift_end_coordinates(coordinates, shift, expected): + result = shift_end_coordinates(coordinates, shift) + assert result == expected - ([(1, 1), (1, 4)], 1, [(1, 4)]), - ([(1, 1), (1, 1), (1, 4)], 2, [(1, 4)]), +def test_check_sequence_tuple_valid(): + name, sequence = check_sequence_tuple("seq1", "ATGC") + assert name == "seq1" + assert sequence == "ATGC" - ([(1, 1), (1, 2), (1, 4)], 2, [(2, 2), (1, 4)]), -]) -def test_shift_start_coordinates(coordinates, shift, expected): - result = shift_start_coordinates(coordinates, shift) - assert result == expected +def test_check_sequence_tuple_empty_name(): + with pytest.raises(ValueError): + check_sequence_tuple("", "ATGC") -@pytest.mark.parametrize("coordinates, shift, expected", [ +def test_check_sequence_tuple_empty_sequence(): + with pytest.raises(ValueError): + check_sequence_tuple("seq1", "") - ([(11, 40)], 0, [(11, 40)]), - ([(1, 2)], 1, [(1, 1)]), +def test_parse_fasta_valid(): + fasta_data = ">seq1\nATGC\n>seq2\nGCTA" - ([(1, 1), (1, 4)], 1, [(1, 1), (1, 3)]), + result = list(parse_fasta(fasta_data.split("\n"))) - ([(1, 4), (4, 4), (4, 4)], 2, [(1, 4)]), + assert result == [("seq1", "ATGC"), ("seq2", "GCTA")] - ([(18, 18), (1, 4)], 4, [(18, 18)]), -]) -def test_shift_end_coordinates(coordinates, shift, expected): - result = shift_end_coordinates(coordinates, shift) - assert result == expected +def test_parse_fasta_empty_sequence(): + fasta_data = ">seq1\n>seq2\nGCTA" + with pytest.raises(ValueError): + list(parse_fasta(fasta_data.split("\n"))) + + +def test_parse_fasta_no_header(): + fasta_data = "seq1\nATGC\nseq2\nGCTA".split("\n") + with pytest.raises(ValueError): + list(parse_fasta(fasta_data)) diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 453a1f5d..cd6440e8 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,8 +1,12 @@ #! /usr/bin/env python3 import pytest -from ppanggolin.context.searchGeneContext import (extract_contig_window, get_n_next_genes_index, - add_edges_to_context_graph, compute_gene_context_graph) +from ppanggolin.context.searchGeneContext import ( + extract_contig_window, + get_n_next_genes_index, + add_edges_to_context_graph, + compute_gene_context_graph, +) from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig, Organism @@ -11,38 +15,71 @@ def test_extract_contig_window(): - #TODO try to use @pytest.mark.parametrize to test different combinations - assert extract_contig_window(contig_size=15, positions_of_interest={8}, window_size=1) == [(7, 9)] + # TODO try to use @pytest.mark.parametrize to test different combinations + assert extract_contig_window( + contig_size=15, positions_of_interest={8}, window_size=1 + ) == [(7, 9)] # check that extracted window is inside contig limit - assert extract_contig_window(contig_size=16, positions_of_interest={15}, window_size=4) == [(11, 15)] + assert extract_contig_window( + contig_size=16, positions_of_interest={15}, window_size=4 + ) == [(11, 15)] - assert extract_contig_window(contig_size=10, positions_of_interest={2, 8}, window_size=2) == [(0, 4), (6, 9)] + assert extract_contig_window( + contig_size=10, positions_of_interest={2, 8}, window_size=2 + ) == [(0, 4), (6, 9)] # 12 window is (9,15) # 19 window is (16,22) # so when 12 and 19 are of interest window merge (9,22) - assert extract_contig_window(contig_size=200, positions_of_interest={12}, window_size=3) == [(9, 15)] - assert extract_contig_window(contig_size=200, positions_of_interest={19}, window_size=3) == [(16, 22)] - assert extract_contig_window(contig_size=200, positions_of_interest={12, 19}, window_size=3) == [(9, 22)] + assert extract_contig_window( + contig_size=200, positions_of_interest={12}, window_size=3 + ) == [(9, 15)] + assert extract_contig_window( + contig_size=200, positions_of_interest={19}, window_size=3 + ) == [(16, 22)] + assert extract_contig_window( + contig_size=200, positions_of_interest={12, 19}, window_size=3 + ) == [(9, 22)] - assert extract_contig_window(contig_size=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0, 9)] + assert extract_contig_window( + contig_size=10, positions_of_interest={2, 5, 8}, window_size=2 + ) == [(0, 9)] def test_extract_contig_window_with_circular_contig(): # TODO try to use @pytest.mark.parametrize to test different combinations # # check that circularity is properly taken into account - assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0, 3), (11, 11)] - assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0, 4), (10, 11)] - assert extract_contig_window(contig_size=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0, 1), (7, 11)] - - assert extract_contig_window(contig_size=12, positions_of_interest={6}, window_size=6, is_circular=True) == [(0, 11)] - assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=True) == [(0, 11)] - assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=False) == [(0, 7)] - - assert extract_contig_window(contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0, 2), (7, 11)] - - assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)] + assert extract_contig_window( + contig_size=12, positions_of_interest={1}, window_size=2, is_circular=True + ) == [(0, 3), (11, 11)] + assert extract_contig_window( + contig_size=12, positions_of_interest={1}, window_size=3, is_circular=True + ) == [(0, 4), (10, 11)] + assert extract_contig_window( + contig_size=12, positions_of_interest={10}, window_size=3, is_circular=True + ) == [(0, 1), (7, 11)] + + assert extract_contig_window( + contig_size=12, positions_of_interest={6}, window_size=6, is_circular=True + ) == [(0, 11)] + assert extract_contig_window( + contig_size=12, positions_of_interest={1}, window_size=6, is_circular=True + ) == [(0, 11)] + assert extract_contig_window( + contig_size=12, positions_of_interest={1}, window_size=6, is_circular=False + ) == [(0, 7)] + + assert extract_contig_window( + contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False + ) == [(0, 2), (7, 11)] + + assert extract_contig_window( + contig_size=894, + positions_of_interest=[151, 152, 153, 893], + window_size=4, + is_circular=True, + ) == [(0, 3), (147, 157), (889, 893)] def test_extract_contig_window_out_of_range(): @@ -55,20 +92,43 @@ def test_extract_contig_window_out_of_range(): def test_get_n_next_genes_index(): - assert list(get_n_next_genes_index(current_index=6, next_genes_count=3, contig_size=100, is_circular=False)) == [7, 8, 9] + assert list( + get_n_next_genes_index( + current_index=6, next_genes_count=3, contig_size=100, is_circular=False + ) + ) == [7, 8, 9] # there is no next gene because the current index is at the end of a non circular contig - assert list(get_n_next_genes_index(current_index=11, next_genes_count=2, contig_size=12, is_circular=False)) == [] + assert ( + list( + get_n_next_genes_index( + current_index=11, next_genes_count=2, contig_size=12, is_circular=False + ) + ) + == [] + ) def test_get_n_next_genes_index_circular(): - assert list(get_n_next_genes_index(current_index=10, next_genes_count=3, contig_size=12, is_circular=True)) == [11, 0, 1] - assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=12, is_circular=True)) == [11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + assert list( + get_n_next_genes_index( + current_index=10, next_genes_count=3, contig_size=12, is_circular=True + ) + ) == [11, 0, 1] + assert list( + get_n_next_genes_index( + current_index=10, next_genes_count=16, contig_size=12, is_circular=True + ) + ) == [11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] def test_get_n_next_genes_index_out_of_range(): with pytest.raises(IndexError): - assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False)) + assert list( + get_n_next_genes_index( + current_index=10, next_genes_count=16, contig_size=8, is_circular=False + ) + ) @pytest.fixture() @@ -79,10 +139,10 @@ def simple_contig(): contig_size = 6 contig.length = contig_size genes = [Gene(str(i)) for i in range(contig_size)] - organism = Organism('organism_A') - for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): - family = GeneFamily(i, family_name) - gene.fill_annotations(start=i+1, stop=i+2, strand="+", position=i) + organism = Organism("organism_A") + for i, (gene, family_name) in enumerate(zip(genes, "ABCDEFGHIJKLMNOP")): + family = GeneFamily(i, family_name) + gene.fill_annotations(start=i + 1, stop=i + 2, strand="+", position=i) gene.fill_parents(organism, contig) @@ -100,7 +160,7 @@ def simple_circular_contig(): contig_size = 6 genes = [Gene(str(i)) for i in range(contig_size)] - for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): + for i, (gene, family_name) in enumerate(zip(genes, "ABCDEFGHIJKLMNOP")): family = GeneFamily(i, family_name) gene.fill_annotations(start=0, stop=0, strand=0, position=i) @@ -115,20 +175,15 @@ def test_add_edges_to_context_graph(simple_contig): # simple_contig families : ABCDEF - add_edges_to_context_graph(context_graph, - contig=simple_contig, - contig_windows=[(0, 3)], - transitivity=1) + add_edges_to_context_graph( + context_graph, contig=simple_contig, contig_windows=[(0, 3)], transitivity=1 + ) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} - assert nodes == ['A', "B", "C", "D"] - assert edges == {('A', 'B'), - ('A', 'C'), - ('B', 'C'), - ('B', 'D'), - ('C', 'D')} + assert nodes == ["A", "B", "C", "D"] + assert edges == {("A", "B"), ("A", "C"), ("B", "C"), ("B", "D"), ("C", "D")} def test_add_edges_to_context_graph_2(simple_contig): @@ -136,17 +191,15 @@ def test_add_edges_to_context_graph_2(simple_contig): # simple_contig families : A B-C-D E F - add_edges_to_context_graph(context_graph, - contig=simple_contig, - contig_windows=[(1, 3)], - transitivity=0) + add_edges_to_context_graph( + context_graph, contig=simple_contig, contig_windows=[(1, 3)], transitivity=0 + ) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} assert nodes == ["B", "C", "D"] - assert edges == {('B', 'C'), - ('C', 'D')} + assert edges == {("B", "C"), ("C", "D")} def test_add_edges_to_context_graph_linear(simple_contig): @@ -157,19 +210,22 @@ def test_add_edges_to_context_graph_linear(simple_contig): context_graph = nx.Graph() - add_edges_to_context_graph(context_graph, - contig=simple_contig, - contig_windows=[(4, 5), (0, 2)], - transitivity=0) + add_edges_to_context_graph( + context_graph, + contig=simple_contig, + contig_windows=[(4, 5), (0, 2)], + transitivity=0, + ) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} assert nodes == ["A", "B", "C", "E", "F"] - assert edges == {('A', 'B'), - ('B', 'C'), - ('E', "F"), - } + assert edges == { + ("A", "B"), + ("B", "C"), + ("E", "F"), + } def test_add_edges_to_context_graph_circular(simple_contig): @@ -180,19 +236,23 @@ def test_add_edges_to_context_graph_circular(simple_contig): context_graph = nx.Graph() simple_contig.is_circular = True - add_edges_to_context_graph(context_graph, - contig=simple_contig, - contig_windows=[(4, 5), (0, 2)], - transitivity=0) + add_edges_to_context_graph( + context_graph, + contig=simple_contig, + contig_windows=[(4, 5), (0, 2)], + transitivity=0, + ) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} assert nodes == ["A", "B", "C", "E", "F"] - assert edges == {('A', 'B'), - ('B', 'C'), - ('E', "F"), - ('A', 'F')} # circular so F and A are linked + assert edges == { + ("A", "B"), + ("B", "C"), + ("E", "F"), + ("A", "F"), + } # circular so F and A are linked def test_compute_gene_context_graph(simple_contig): @@ -206,16 +266,15 @@ def test_compute_gene_context_graph(simple_contig): families_in_contigs = [g.family for g in simple_contig.genes] family_names_of_interest = ["C"] - families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest} + families_of_interest = { + f for f in families_in_contigs if f.name in family_names_of_interest + } - context_graph, _ = compute_gene_context_graph(families_of_interest, - transitive=0, - window_size=2) + context_graph, _ = compute_gene_context_graph( + families_of_interest, transitive=0, window_size=2 + ) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} assert nodes == ["A", "B", "C", "D", "E"] - assert edges == {('A', 'B'), - ('B', 'C'), - ('C', "D"), - ('D', 'E')} + assert edges == {("A", "B"), ("B", "C"), ("C", "D"), ("D", "E")} diff --git a/tests/formats/test_writeFlatGenomes.py b/tests/formats/test_writeFlatGenomes.py index a028152b..d5197b65 100644 --- a/tests/formats/test_writeFlatGenomes.py +++ b/tests/formats/test_writeFlatGenomes.py @@ -1,39 +1,56 @@ from ppanggolin.formats.writeFlatGenomes import convert_overlapping_coordinates_for_gff + def test_convert_overlapping_coordinates_for_gff(): # test case where coordinates are have no frameshift and no edge overlap coordinates = [(7, 10)] contig_length = 15 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(7, 10)] - + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (7, 10) + ] # test case where coordinates are simply a frameshift with not edge overlap coordinates = [(1, 5), (7, 10)] contig_length = 15 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(1, 5), (7, 10)] + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (1, 5), + (7, 10), + ] # Test case where the gene overlaps. Coordinates are adjusted for GFF format coordinates = [(4, 8), (1, 2)] contig_length = 8 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(4, 10)] + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (4, 10) + ] # Test case where coordinates overlap and has frameshift coordinates = [(4, 8), (7, 13), (1, 2)] contig_length = 13 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(4, 8), (7, 15)] + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (4, 8), + (7, 15), + ] # Test case where coordinates overlap and has frameshift at the edge of the contig coordinates = [(12, 18), (1, 4)] contig_length = 20 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(12, 18), (21, 24)] + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (12, 18), + (21, 24), + ] # Test case where coordinates overlap and has frameshift at the edge of the contig coordinates = [(12, 20), (2, 4)] contig_length = 20 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(12, 20), (22, 24)] - + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (12, 20), + (22, 24), + ] # Test case where coordinates have just the last nt of the contig and the rest is at th ebegining of the contig coordinates = [(20, 20), (1, 10)] contig_length = 20 - assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [(20, 30)] \ No newline at end of file + assert convert_overlapping_coordinates_for_gff(coordinates, contig_length) == [ + (20, 30) + ] diff --git a/tests/region/test_genomicIsland.py b/tests/region/test_genomicIsland.py index bfece3b0..fce5c6c6 100644 --- a/tests/region/test_genomicIsland.py +++ b/tests/region/test_genomicIsland.py @@ -1,13 +1,23 @@ -from ppanggolin.utils import find_consecutive_sequences, find_region_border_position, get_consecutive_region_positions +from ppanggolin.utils import ( + find_consecutive_sequences, + find_region_border_position, + get_consecutive_region_positions, +) import pytest + def test_find_consecutive_sequences_single_sequence(): sequence = [1, 2, 3, 4, 5] assert find_consecutive_sequences(sequence) == [[1, 2, 3, 4, 5]] + def test_find_consecutive_sequences_multiple_sequences(): sequence = [1, 2, 3, 7, 8, 9, 11, 12, 13, 0] - assert find_consecutive_sequences(sequence) == [[0, 1, 2, 3], [7, 8, 9], [11, 12, 13]] + assert find_consecutive_sequences(sequence) == [ + [0, 1, 2, 3], + [7, 8, 9], + [11, 12, 13], + ] def test_find_region_border_position_single_sequence(): @@ -15,23 +25,27 @@ def test_find_region_border_position_single_sequence(): contig_length = 10 assert find_region_border_position(region_positions, contig_length) == (1, 5) + def test_find_region_border_position_edge_overlap(): region_positions = [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14] contig_length = 15 assert find_region_border_position(region_positions, contig_length) == (7, 3) + def test_find_region_border_position_empty_sequence(): region_positions = [] contig_length = 10 with pytest.raises(ValueError): find_region_border_position(region_positions, contig_length) + def test_find_region_border_position_multiple_fragments(): region_positions = [1, 3, 4, 5, 8, 9] contig_length = 10 with pytest.raises(ValueError): find_region_border_position(region_positions, contig_length) + def test_find_region_border_position_fragmented_but_no_zero(): # region is in two pieces but it miss position 0 to be correct overlap region_positions = [8, 9, 1, 2, 3, 4] @@ -39,6 +53,7 @@ def test_find_region_border_position_fragmented_but_no_zero(): with pytest.raises(ValueError): find_region_border_position(region_positions, contig_length) + def test_find_region_border_position_fragmented_but_wrong_max_po(): # region does not reach the end of the contig. It misses position 9 region_positions = [7, 8, 0, 1, 2, 3, 4] @@ -46,18 +61,27 @@ def test_find_region_border_position_fragmented_but_wrong_max_po(): with pytest.raises(ValueError): find_region_border_position(region_positions, contig_length) + def test_get_consecutive_region_positions_regular(): region_positions = [2, 3, 4, 5, 6] contig_length = 15 - assert get_consecutive_region_positions(region_positions, contig_length) == [[2, 3, 4, 5, 6]] + assert get_consecutive_region_positions(region_positions, contig_length) == [ + [2, 3, 4, 5, 6] + ] def test_get_consecutive_region_positions_overlap(): region_positions = [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14] contig_length = 15 - assert get_consecutive_region_positions(region_positions, contig_length) == [[7, 8, 9, 10, 11, 12, 13, 14], [0, 1, 2, 3]] + assert get_consecutive_region_positions(region_positions, contig_length) == [ + [7, 8, 9, 10, 11, 12, 13, 14], + [0, 1, 2, 3], + ] + def test_get_consecutive_region_positions_all_genes(): region_positions = [4, 5, 6, 7, 0, 1, 2, 3] contig_length = 8 - assert get_consecutive_region_positions(region_positions, contig_length) == [[0, 1, 2, 3, 4, 5, 6, 7]] \ No newline at end of file + assert get_consecutive_region_positions(region_positions, contig_length) == [ + [0, 1, 2, 3, 4, 5, 6, 7] + ] diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py index 51cf6eb7..d8b914a7 100644 --- a/tests/region/test_rgp_cluster.py +++ b/tests/region/test_rgp_cluster.py @@ -12,14 +12,15 @@ @pytest.fixture def genes() -> Generator[Set[Gene], None, None]: - """Create a set of genes to fill gene families - """ + """Create a set of genes to fill gene families""" organism = Organism("organism") contig = Contig(0, "contig") genes = [] for i in range(randint(11, 20)): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, stop=10 * (i + 1), strand="+", position=i, genetic_code=4 + ) gene.fill_parents(organism, contig) contig[gene.start] = gene genes.append(gene) @@ -28,8 +29,7 @@ def genes() -> Generator[Set[Gene], None, None]: @pytest.fixture def families(genes) -> Generator[Set[GeneFamily], None, None]: - """Create a set of gene families fill with genes to test edges - """ + """Create a set of gene families fill with genes to test edges""" families = set() genes = list(genes) nb_families = randint(9, 20) @@ -59,8 +59,7 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: @pytest.fixture def identical_rgps(genes, families) -> Generator[Set[Region], None, None]: - """Create a set of identical rgps - """ + """Create a set of identical rgps""" identical_rgps = set() for i in range(1, randint(6, 21)): rgp = Region(f"RGP_{i}") @@ -74,61 +73,54 @@ def identical_rgps(genes, families) -> Generator[Set[Region], None, None]: class TestIdenticalRegions: def test_init_with_valid_inputs(self, identical_rgps, families): - """Tests that the IdenticalRegions object is initialized correctly with valid inputs. - """ + """Tests that the IdenticalRegions object is initialized correctly with valid inputs.""" is_contig_border = True - identical_regions = IdenticalRegions("IdenticalRegions", identical_rgps, families, is_contig_border) + identical_regions = IdenticalRegions( + "IdenticalRegions", identical_rgps, families, is_contig_border + ) assert identical_regions.name == "IdenticalRegions" assert identical_regions.rgps == identical_rgps assert identical_regions.families == families assert identical_regions.is_contig_border == is_contig_border - @pytest.mark.parametrize("wrong_type", - ["string", - 1, - 0.8, - list(), - dict()]) + @pytest.mark.parametrize("wrong_type", ["string", 1, 0.8, list(), dict()]) def test_init_with_identical_rgps_not_isintance_set(self, wrong_type, families): - """Tests that the IdenticalRegions object cannot be initialized with a not instance set for identical_rgps. - """ + """Tests that the IdenticalRegions object cannot be initialized with a not instance set for identical_rgps.""" with pytest.raises(TypeError): IdenticalRegions("IdenticalRegions", wrong_type, families, True) - def test_init_with_rgp_is_not_instance_region_in_identical_rgps(self, identical_rgps, families): - """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. - """ + def test_init_with_rgp_is_not_instance_region_in_identical_rgps( + self, identical_rgps, families + ): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region.""" with pytest.raises(TypeError): - IdenticalRegions("IdenticalRegions", identical_rgps.union({1}), families, True) + IdenticalRegions( + "IdenticalRegions", identical_rgps.union({1}), families, True + ) def test_init_with_empty_identical_rgps(self, families): - """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. - """ + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions.""" with pytest.raises(ValueError): IdenticalRegions("IdenticalRegions", set(), families, True) - @pytest.mark.parametrize("wrong_type", - ["string", - 1, - 0.8, - list(), - dict()]) + @pytest.mark.parametrize("wrong_type", ["string", 1, 0.8, list(), dict()]) def test_init_with_families_not_isintance_set(self, wrong_type, identical_rgps): - """Tests that the IdenticalRegions object cannot be initialized with a not instance set. - """ + """Tests that the IdenticalRegions object cannot be initialized with a not instance set.""" with pytest.raises(TypeError): IdenticalRegions("IdenticalRegions", identical_rgps, wrong_type, True) - def test_init_with_family_is_not_instance_genefamilies_in_families(self, identical_rgps, families): - """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. - """ + def test_init_with_family_is_not_instance_genefamilies_in_families( + self, identical_rgps, families + ): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region.""" with pytest.raises(TypeError): - IdenticalRegions("IdenticalRegions", identical_rgps, families.union({1}), True) + IdenticalRegions( + "IdenticalRegions", identical_rgps, families.union({1}), True + ) def test_init_with_empty_families(self, identical_rgps): - """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. - """ + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions.""" with pytest.raises(ValueError): IdenticalRegions("IdenticalRegions", identical_rgps, set(), True) @@ -146,8 +138,12 @@ def test_eq_with_equal_identical_regions(self): families2 = {family1, family2} is_contig_border = True - identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) - identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + identical_regions1 = IdenticalRegions( + "IdenticalRegions", identical_rgps1, families1, is_contig_border + ) + identical_regions2 = IdenticalRegions( + "IdenticalRegions", identical_rgps2, families2, is_contig_border + ) assert identical_regions1 == identical_regions2 @@ -165,15 +161,18 @@ def test_eq_with_non_identical_regions(self): families2 = {family1} is_contig_border = True - identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) - identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + identical_regions1 = IdenticalRegions( + "IdenticalRegions", identical_rgps1, families1, is_contig_border + ) + identical_regions2 = IdenticalRegions( + "IdenticalRegions", identical_rgps2, families2, is_contig_border + ) assert identical_regions1 != identical_regions2 def test_compute_grr(): - """Tests that compute_grr returns the correct value when there is a non-zero intersection between families - """ + """Tests that compute_grr returns the correct value when there is a non-zero intersection between families""" set1 = {1, 2, 3, 4, 5} set2 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} @@ -186,10 +185,12 @@ def test_dereplicate_rgp(identical_rgps): rgp1 = list_identical_rgps[0] assert rgp_cluster.dereplicate_rgp({rgp1}) == [rgp1] - identical_region_obj = rgp_cluster.IdenticalRegions(name="identical_rgps_0", - identical_rgps=identical_rgps, - families=set(list_identical_rgps[0].families), - is_contig_border=True) + identical_region_obj = rgp_cluster.IdenticalRegions( + name="identical_rgps_0", + identical_rgps=identical_rgps, + families=set(list_identical_rgps[0].families), + is_contig_border=True, + ) assert rgp_cluster.dereplicate_rgp(rgps=identical_rgps)[0] == identical_region_obj @@ -207,16 +208,27 @@ def test_compute_rgp_metric(genes, families): assert not RGP_b.is_contig_border shared_families = len(set(RGP_a.families).intersection(set(RGP_b.families))) - expected_grr = (RGP_a.ID, RGP_b.ID, {'incomplete_aware_grr': shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), - "min_grr": shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), - 'max_grr': shared_families / max(len(set(RGP_a.families)), len(set(RGP_b.families))), - 'shared_family': shared_families}) + expected_grr = ( + RGP_a.ID, + RGP_b.ID, + { + "incomplete_aware_grr": shared_families + / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + "min_grr": shared_families + / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + "max_grr": shared_families + / max(len(set(RGP_a.families)), len(set(RGP_b.families))), + "shared_family": shared_families, + }, + ) # min_grr min_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "min_grr") assert min_result == expected_grr - # incomplete_aware_grr: same as min grr as rgp1 is incomplete - incomplete_aware_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "incomplete_aware_grr") + # incomplete_aware_grr: same as min grr as rgp1 is incomplete + incomplete_aware_result = rgp_cluster.compute_rgp_metric( + RGP_a, RGP_b, 0, "incomplete_aware_grr" + ) assert incomplete_aware_result == expected_grr diff --git a/tests/test_edge.py b/tests/test_edge.py index 0702cdc7..cdf43499 100644 --- a/tests/test_edge.py +++ b/tests/test_edge.py @@ -11,20 +11,19 @@ class TestEdge: @pytest.fixture def organism(self) -> Generator[Organism, None, None]: - """Generate a basic organism object - """ + """Generate a basic organism object""" yield Organism("organism") @pytest.fixture def families_pair(self) -> Generator[Tuple[GeneFamily, GeneFamily], None, None]: - """Generate a families pair - """ + """Generate a families pair""" yield GeneFamily(1, "family1"), GeneFamily(2, "family2") @pytest.fixture - def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], None, None]: - """Generate genes_pair - """ + def genes_pair( + self, organism, families_pair + ) -> Generator[Tuple[Gene, Gene], None, None]: + """Generate genes_pair""" gene1, gene2 = Gene("gene1"), Gene("gene2") gene1.fill_parents(organism, None) gene2.fill_parents(organism, None) @@ -33,14 +32,12 @@ def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], No @pytest.fixture def edge(self, genes_pair): - """Generate a basic edge - """ + """Generate a basic edge""" edge = Edge(*genes_pair) yield edge def test_constructor(self, genes_pair, organism, families_pair): - """Tests that an Edge object can be created with two genes belonging to different families - """ + """Tests that an Edge object can be created with two genes belonging to different families""" gene1, gene2 = genes_pair edge = Edge(gene1, gene2) assert edge.source == gene1.family @@ -54,9 +51,9 @@ def test_constructor_attribute_error(self): Tests that an AttributeError is raised when creating an Edge object with a gene that does not belong to any family """ - gene1 = Gene('gene1') - gene1.family = GeneFamily(0, 'test') - gene2 = Gene('gene2') + gene1 = Gene("gene1") + gene1.family = GeneFamily(0, "test") + gene2 = Gene("gene2") with pytest.raises(AttributeError): # Test target attribute error Edge(gene1, gene2) @@ -65,44 +62,40 @@ def test_constructor_attribute_error(self): Edge(gene2, gene1) def test_gene_pairs(self, edge, genes_pair): - """Tests that gene pairs' generator return what's expected - """ + """Tests that gene pairs' generator return what's expected""" assert set(edge.gene_pairs) == {genes_pair} def test_get_organisms(self, edge, organism): - """Tests that organism generator return what's expected - """ + """Tests that organism generator return what's expected""" assert set(edge.organisms) == {organism} def test_get_number_of_organisms(self, edge): - """Tests that the good number of organism is returned - """ + """Tests that the good number of organism is returned""" assert isinstance(edge.number_of_organisms, int) assert edge.number_of_organisms == 1 def test_get_organisms_dict(self, edge, organism, genes_pair): - """Tests that organism-gene_pairs dict is built as expected - """ + """Tests that organism-gene_pairs dict is built as expected""" assert edge.get_organisms_dict() == {organism: [genes_pair]} def test_get_organism_genes_pairs(self, edge, organism, genes_pair): - """Tests that the gene pairs corresponding to the organism is returned - """ + """Tests that the gene pairs corresponding to the organism is returned""" assert edge.get_organism_genes_pairs(organism) == [genes_pair] def test_edge_add_genes_same_organism(self, edge, genes_pair, organism): - """Tests that genes can be added to the edge that are on the same organism - """ - gene1, gene2, gene3, gene4 = *genes_pair, Gene('gene3'), Gene('gene4') + """Tests that genes can be added to the edge that are on the same organism""" + gene1, gene2, gene3, gene4 = *genes_pair, Gene("gene3"), Gene("gene4") gene3.fill_parents(organism, None) gene4.fill_parents(organism, None) edge.add_genes(gene3, gene4) - assert edge.get_organism_genes_pairs(organism) == [(gene1, gene2), (gene3, gene4)] + assert edge.get_organism_genes_pairs(organism) == [ + (gene1, gene2), + (gene3, gene4), + ] def test_edge_add_genes_different_organisms(self, edge, organism): - """Tests that an Exception is raised when adding genes to the edge that are not on the same organism - """ - gene1, gene2 = Gene('gene3'), Gene('gene4') + """Tests that an Exception is raised when adding genes to the edge that are not on the same organism""" + gene1, gene2 = Gene("gene3"), Gene("gene4") gene1.fill_parents(organism, None) org = Organism("org") gene2.fill_parents(org, None) @@ -110,9 +103,8 @@ def test_edge_add_genes_different_organisms(self, edge, organism): edge.add_genes(gene1, gene2) def test_edge_add_genes_one_none_gene(self, edge, organism): - """Tests that a TypeError is raised when adding genes to the edge where one gene is None - """ - gene1 = Gene('gene1') + """Tests that a TypeError is raised when adding genes to the edge where one gene is None""" + gene1 = Gene("gene1") gene1.fill_parents(organism) with pytest.raises(TypeError): edge.add_genes(gene1, None) @@ -120,9 +112,8 @@ def test_edge_add_genes_one_none_gene(self, edge, organism): edge.add_genes(None, gene1) def test_edge_add_genes_without_organisms(self, edge, organism): - """Tests that a ValueError is raised when adding genes not filled with organism - """ - gene1, gene2 = Gene('gene1'), Gene('gene2') + """Tests that a ValueError is raised when adding genes not filled with organism""" + gene1, gene2 = Gene("gene1"), Gene("gene2") gene1.fill_parents(organism, None) with pytest.raises(ValueError): edge.add_genes(gene1, gene2) diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py index 594ad8d5..d985a0c0 100644 --- a/tests/test_genefamily.py +++ b/tests/test_genefamily.py @@ -12,26 +12,53 @@ class TestGeneFamily: - """Tests the gene family class - """ + """Tests the gene family class""" + @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: - """Create a gene family for all tests - """ + """Create a gene family for all tests""" yield GeneFamily(1, "test") def test_construct_gene_family(self, family): - """Tests that a GeneFamily object can be created with valid family_id and name - """ + """Tests that a GeneFamily object can be created with valid family_id and name""" assert isinstance(family, GeneFamily) - assert all(attr in ["ID", "name", "_edges_getter", "_genePerOrg", "_genes_getter", "_representative", - "removed", "sequence", "_partition", "_spots", "_module", "bitarray", "_metadata_getter"] - for attr in family.__dict__) # Check that no attribute was added else it should be tested - assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges_getter", "_genePerOrg", "_genes_getter", - "removed", "sequence", "_partition", "_spots", "_module", - "bitarray"]) # Check that no attribute was removed else it should be tested + assert all( + attr + in [ + "ID", + "name", + "_edges_getter", + "_genePerOrg", + "_genes_getter", + "_representative", + "removed", + "sequence", + "_partition", + "_spots", + "_module", + "bitarray", + "_metadata_getter", + ] + for attr in family.__dict__ + ) # Check that no attribute was added else it should be tested + assert all( + hasattr(family, attr) + for attr in [ + "ID", + "name", + "_edges_getter", + "_genePerOrg", + "_genes_getter", + "removed", + "sequence", + "_partition", + "_spots", + "_module", + "bitarray", + ] + ) # Check that no attribute was removed else it should be tested assert family.ID == 1 - assert family.name == 'test' + assert family.name == "test" assert family._edges_getter == {} assert family._genePerOrg == {} assert family._genes_getter == dict() @@ -42,52 +69,49 @@ def test_construct_gene_family(self, family): assert family._module is None assert family.bitarray is None - @pytest.mark.parametrize("partition, name", - [ - ("P", "persistent"), - ("Pp", "persistent"), - ("P whatever, only first letter is important", "persistent"), - ("C", "cloud"), - ("C loud", "cloud"), - ("C whatever, only first letter is important", "cloud"), - ("S", "shell"), - ("Shut", "shell"), - ("S whatever, only first letter is important", "shell"), - ("un de troa kvar", "undefined"), - ("1", "undefined"), - ("p", "undefined"), - ("c", "undefined"), - ("s", "undefined"), - ]) + @pytest.mark.parametrize( + "partition, name", + [ + ("P", "persistent"), + ("Pp", "persistent"), + ("P whatever, only first letter is important", "persistent"), + ("C", "cloud"), + ("C loud", "cloud"), + ("C whatever, only first letter is important", "cloud"), + ("S", "shell"), + ("Shut", "shell"), + ("S whatever, only first letter is important", "shell"), + ("un de troa kvar", "undefined"), + ("1", "undefined"), + ("p", "undefined"), + ("c", "undefined"), + ("s", "undefined"), + ], + ) def test_get_named_partition_of_gene_family_object(self, family, partition, name): - """Tests that the named partition of a GeneFamily object can be retrieved - """ + """Tests that the named partition of a GeneFamily object can be retrieved""" family.partition = partition assert family.named_partition == name def test_get_named_partition_error_partition_empty(self, family): - """Tests that if no partition given to gene family, raise a ValueError - """ + """Tests that if no partition given to gene family, raise a ValueError""" with pytest.raises(ValueError): _ = family.named_partition def test_add_sequence_to_gene_family(self, family): - """Tests that a sequence can be added to a GeneFamily object - """ - family.add_sequence('ATCG') - assert family.sequence == 'ATCG' + """Tests that a sequence can be added to a GeneFamily object""" + family.add_sequence("ATCG") + assert family.sequence == "ATCG" def test_add_gene_to_gene_family(self, family): - """Tests that a Gene object can be added to a GeneFamily object - """ - gene = Gene('gene1') + """Tests that a Gene object can be added to a GeneFamily object""" + gene = Gene("gene1") family.add(gene) assert gene in family.genes assert gene.family == family def test_add_gene_error(self, family): - """Tests that a non-gene object can't be added to a GeneFamily as gene - """ + """Tests that a non-gene object can't be added to a GeneFamily as gene""" with pytest.raises(TypeError): family.add(33) @@ -103,7 +127,7 @@ def test_get_representative_gene(self, family): def test_raise_typeerror_with_no_gene_type_as_representative(self, family): with pytest.raises(TypeError): - family.representative = 'test' + family.representative = "test" def test_raise_exception_if_representative_not_set(self, family): with pytest.raises(Exception): @@ -111,18 +135,22 @@ def test_raise_exception_if_representative_not_set(self, family): @pytest.fixture def genes(self) -> Generator[Set[Gene], None, None]: - """Creeate a set of genes to fill gene families - """ + """Creeate a set of genes to fill gene families""" genes = set() for i in range(1, randint(11, 20)): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10*(i-1) + 1, stop=10*i, strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * (i - 1) + 1, + stop=10 * i, + strand="+", + position=i, + genetic_code=4, + ) genes.add(gene) yield genes def test_get_number_of_genes(self, family, genes): - """Tests that the number of genes can be retrieved - """ + """Tests that the number of genes can be retrieved""" for gene in genes: family.add(gene) assert isinstance(len(family), int) @@ -130,8 +158,7 @@ def test_get_number_of_genes(self, family, genes): @pytest.fixture def organisms(self, genes) -> Generator[Set[Organism], None, None]: - """Create a set of organisms fill with genes to test edges - """ + """Create a set of organisms fill with genes to test edges""" organisms = set() genes = list(genes) nb_organisms = randint(2, 10) @@ -165,59 +192,58 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]: yield organisms def test_get_org_dict(self, family, genes, organisms): - """Tests that all organisms and genes are retrieved as expected - """ + """Tests that all organisms and genes are retrieved as expected""" for gene in genes: family.add(gene) org_dict = family.get_org_dict() assert isinstance(org_dict, dict) assert all(isinstance(org, Organism) for org in org_dict.keys()) - assert all(isinstance(gene, Gene) for gene_set in org_dict.values() for gene in gene_set) + assert all( + isinstance(gene, Gene) + for gene_set in org_dict.values() + for gene in gene_set + ) assert set(org_dict.keys()) == organisms - assert set([gene for gene_set in org_dict.values() for gene in gene_set]) == genes + assert ( + set([gene for gene_set in org_dict.values() for gene in gene_set]) == genes + ) def test_get_org_dict_with_no_organism_fill_to_genes(self, family, genes): - """Tests that if genes are not fill with organism an AttributeError is returned - """ + """Tests that if genes are not fill with organism an AttributeError is returned""" for gene in genes: family.add(gene) with pytest.raises(AttributeError): _ = family.get_org_dict() def test_organisms(self, family, organisms, genes): - """Tests that all organisms are retrieved as expected - """ + """Tests that all organisms are retrieved as expected""" for gene in genes: family.add(gene) assert set(family.organisms) == organisms def test_number_of_organism(self, family, organisms, genes): - """Tests that the expected number of organisms is found - """ + """Tests that the expected number of organisms is found""" for gene in genes: family.add(gene) assert isinstance(family.number_of_organisms, int) assert family.number_of_organisms == len(organisms) def test_get_genes_per_org(self, family, organisms, genes): - """Tests that for a giver organism, all the genes are retrieved as expected - """ + """Tests that for a giver organism, all the genes are retrieved as expected""" for gene in genes: family.add(gene) for organism in organisms: assert set(family.get_genes_per_org(organism)) == set(organism.genes) def test_get_genes_per_org_if_org_not_in_family(self, family): - """Test that a KeyError is generated if an organism not belonging to the family is given - """ + """Test that a KeyError is generated if an organism not belonging to the family is given""" with pytest.raises(KeyError): org = Organism("organism") _ = set(family.get_genes_per_org(org)) @pytest.fixture def families(self, genes) -> Generator[Set[GeneFamily], None, None]: - """Create a set of gene families fill with genes to test edges - """ + """Create a set of gene families fill with genes to test edges""" families = set() genes = list(genes) nb_families = randint(2, 10) @@ -246,11 +272,12 @@ def families(self, genes) -> Generator[Set[GeneFamily], None, None]: @pytest.fixture def edges(self, families, genes, organisms) -> Generator[Set[Edge], None, None]: - """Create a set of edges fill with genes and gene families to test edges - """ + """Create a set of edges fill with genes and gene families to test edges""" edges = {} - pair_genes = filter(lambda x: x[0] != x[1] and x[0].organism == x[1].organism, - combinations_with_replacement(genes, 2)) + pair_genes = filter( + lambda x: x[0] != x[1] and x[0].organism == x[1].organism, + combinations_with_replacement(genes, 2), + ) for pair in pair_genes: key = frozenset([pair[0].family, pair[1].family]) edge = edges.get(key) @@ -264,65 +291,71 @@ def edges(self, families, genes, organisms) -> Generator[Set[Edge], None, None]: yield set(edges.values()) def test_get_neighbors_of_gene_family(self, families, edges): - """Tests get all the expected neighbor of the family in the graph - """ + """Tests get all the expected neighbor of the family in the graph""" for family in families: - assert all(isinstance(neighbor, GeneFamily) for neighbor in family.neighbors) - expected_neighbors = set([edge.source for edge in edges - if edge.target == family]).union(set([edge.target for edge in edges - if edge.source == family])) + assert all( + isinstance(neighbor, GeneFamily) for neighbor in family.neighbors + ) + expected_neighbors = set( + [edge.source for edge in edges if edge.target == family] + ).union(set([edge.target for edge in edges if edge.source == family])) assert set(family.neighbors) == expected_neighbors def test_get_number_of_neighbors(self, families, edges): - """Tests that the expected number of neighbors is found - """ + """Tests that the expected number of neighbors is found""" for family in families: - expected_neighbors = set([edge.source for edge in edges - if edge.target == family]).union(set([edge.target for edge in edges - if edge.source == family])) + expected_neighbors = set( + [edge.source for edge in edges if edge.target == family] + ).union(set([edge.target for edge in edges if edge.source == family])) assert isinstance(family.number_of_neighbors, int) assert family.number_of_neighbors == len(expected_neighbors) # Tests that the edges of a GeneFamily object can be retrieved def test_get_edges_of_gene_family(self, families, edges): - """Tests that all the edges belonging to the family are retrieved - """ + """Tests that all the edges belonging to the family are retrieved""" for family in families: - expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + expected_edges = set( + [ + edge + for edge in edges + if edge.source == family or edge.target == family + ] + ) assert all(isinstance(edge, Edge) for edge in family.edges) assert set(family.edges) == expected_edges def test_get_number_of_edges(self, families, edges): - """Tests that the expected number of edges is found - """ + """Tests that the expected number of edges is found""" for family in families: - expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + expected_edges = set( + [ + edge + for edge in edges + if edge.source == family or edge.target == family + ] + ) assert isinstance(family.number_of_edges, int) assert family.number_of_neighbors == len(expected_edges) def test_add_spot_to_gene_family(self, family): - """Tests that a Spot object can be added to a GeneFamily object - """ + """Tests that a Spot object can be added to a GeneFamily object""" spot = Spot(1) family.add_spot(spot) assert spot in family.spots def test_add_non_spot_as_spot_in_family(self, family): - """Tests that a non-spot object cannot be added to Gene Family - """ + """Tests that a non-spot object cannot be added to Gene Family""" with pytest.raises(TypeError): family.add_spot(323) def test_add_module_to_gene_family(self, family): - """Tests that a Module object can be added to a GeneFamily object - """ + """Tests that a Module object can be added to a GeneFamily object""" module = Module(1) family.set_module(module) assert module == family.module def test_add_non_module_as_module_in_family(self, family): - """Tests that a non-module object cannot be added to Gene Family - """ + """Tests that a non-module object cannot be added to Gene Family""" with pytest.raises(TypeError): family.set_module(323) diff --git a/tests/test_genome.py b/tests/test_genome.py index 9e29133f..42fa16b9 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -10,21 +10,18 @@ class TestFeature: - """Tests Feature class - """ + """Tests Feature class""" @pytest.fixture def feature(self) -> Generator[Feature, None, None]: - """Generate a basic feature for tests - """ - yield Feature('test_id') + """Generate a basic feature for tests""" + yield Feature("test_id") def test_creation(self, feature): - """Tests that 'Feature' is created successfully with the given identifier - """ - assert feature.ID == 'test_id' + """Tests that 'Feature' is created successfully with the given identifier""" + assert feature.ID == "test_id" assert not feature.is_fragment - assert feature.type == '' + assert feature.type == "" assert feature.start is None assert feature.stop is None assert feature.strand is None @@ -36,187 +33,176 @@ def test_creation(self, feature): assert feature.dna is None def test_create_feature_with_identifier_not_instance_string(self): - """Tests that a Feature object cannot be created with a non-string type identifier - """ + """Tests that a Feature object cannot be created with a non-string type identifier""" with pytest.raises(AssertionError): Feature(4) def test_create_feature_empty_identifier(self): - """Tests that a Feature object cannot be created with an empty identifier - """ + """Tests that a Feature object cannot be created with an empty identifier""" with pytest.raises(ValueError): - Feature('') + Feature("") def tests_write_organism(self, feature): - """Tests that write feature return feature name as string - """ + """Tests that write feature return feature name as string""" assert str(feature) == "test_id" def test_fill_annotations(self, feature): - """Tests that 'fill_annotations' method fills the attributes correctly - """ - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + """Tests that 'fill_annotations' method fills the attributes correctly""" + feature.fill_annotations(1, 10, "+", "gene_type", "name", "product", "local_id") assert feature.start == 1 assert feature.stop == 10 - assert feature.type == 'gene_type' - assert feature.strand == '+' - assert feature.product == 'product' - assert feature.name == 'name' - assert feature.local_identifier == 'local_id' + assert feature.type == "gene_type" + assert feature.strand == "+" + assert feature.product == "product" + assert feature.name == "name" + assert feature.local_identifier == "local_id" def test_fill_annotations_type_error(self, feature): - """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with the correct type - """ + """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with the correct type""" with pytest.raises(TypeError): - feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') + feature.fill_annotations( + "1", 10, "+", "gene_type", "name", "product", "local_id" + ) with pytest.raises(TypeError): - feature.fill_annotations(1, "10", '+', 'gene_type', 'name', 'product', 'local_id') + feature.fill_annotations( + 1, "10", "+", "gene_type", "name", "product", "local_id" + ) with pytest.raises(TypeError): - feature.fill_annotations(1, 10, 4, 'gene_type', 'name', 'product', 'local_id') + feature.fill_annotations( + 1, 10, 4, "gene_type", "name", "product", "local_id" + ) with pytest.raises(TypeError): - feature.fill_annotations(1, 10, "+", 4, 'name', 'product', 'local_id') + feature.fill_annotations(1, 10, "+", 4, "name", "product", "local_id") with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 4, 'product', 'local_id') + feature.fill_annotations(1, 10, "+", "gene_type", 4, "product", "local_id") with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 4, 'local_id') + feature.fill_annotations(1, 10, "+", "gene_type", "name", 4, "local_id") with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 4) + feature.fill_annotations(1, 10, "+", "gene_type", "name", "product", 4) def test_fill_annotations_value_error(self, feature): - """Tests that 'fill_annotations' method raises a TypeError if strand is not '+' or '-' - """ + """Tests that 'fill_annotations' method raises a TypeError if strand is not '+' or '-'""" with pytest.raises(ValueError): - feature.fill_annotations(1, 10, '4', 'gene_type', 'name', 'product', 'local_id') + feature.fill_annotations( + 1, 10, "4", "gene_type", "name", "product", "local_id" + ) def test_fill_parents(self, feature): - """Tests that 'fill_parents' method associates the object with the given organism and contig - """ - organism = Organism('org_id') - contig = Contig(0, 'contig_name') - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + """Tests that 'fill_parents' method associates the object with the given organism and contig""" + organism = Organism("org_id") + contig = Contig(0, "contig_name") + feature.fill_annotations(1, 10, "+", "gene_type", "name", "product", "local_id") feature.fill_parents(organism, contig) assert feature.organism == organism assert feature.contig == contig def test_fill_parents_with_organism_or_contig_only(self, feature): - """Tests that Gene can be filled with only an organism or a contig - """ - organism = Organism('org') + """Tests that Gene can be filled with only an organism or a contig""" + organism = Organism("org") contig = Contig(0, "ctg") - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + feature.fill_annotations(1, 10, "+", "gene_type", "name", "product", "local_id") feature.fill_parents(organism=organism) assert feature.organism == organism feature.fill_parents(contig=contig) assert feature.contig == contig def test_fill_parents_with_nothing(self, feature): - """Tests that Gene cannot be filled with neither an organism and a contig - """ + """Tests that Gene cannot be filled with neither an organism and a contig""" with pytest.raises(AssertionError): feature.fill_parents() def test_set_organism(self, feature): - """Tests that organism setter sets organism with the valid type - """ - organism = Organism('organism') + """Tests that organism setter sets organism with the valid type""" + organism = Organism("organism") feature.organism = organism assert feature.organism == organism def test_set_organism_not_isinstance_organism(self, feature): - """Tests that organism setter return TypeError if sets organism with the invalid type - """ + """Tests that organism setter return TypeError if sets organism with the invalid type""" with pytest.raises(TypeError): feature.organism = 4 def test_set_contig(self, feature): - """Tests that contig setter sets contig with the valid type - """ - contig = Contig(0, 'contig') + """Tests that contig setter sets contig with the valid type""" + contig = Contig(0, "contig") feature.contig = contig assert feature.contig == contig def test_set_contig_not_isinstance_contig(self, feature): - """Tests that contig setter return TypeError if sets contig with the invalid type - """ + """Tests that contig setter return TypeError if sets contig with the invalid type""" with pytest.raises(TypeError): feature.contig = 4 def test_add_dna(self, feature): - """Tests that 'add_dna' method adds the DNA sequence to the object successfully - """ - feature.add_sequence('ATCG') - assert feature.dna == 'ATCG' + """Tests that 'add_dna' method adds the DNA sequence to the object successfully""" + feature.add_sequence("ATCG") + assert feature.dna == "ATCG" def test_add_dna_type_error(self, feature): - """Tests that 'add_dna' method raises a TypeError if the DNA sequence is not a string - """ + """Tests that 'add_dna' method raises a TypeError if the DNA sequence is not a string""" with pytest.raises(AssertionError): feature.add_sequence(123) def test_length(self, feature): - """Tests len method - """ - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + """Tests len method""" + feature.fill_annotations(1, 10, "+", "gene_type", "name", "product", "local_id") assert isinstance(len(feature), int) assert len(feature) == 10 def test_length_start_or_stop_are_not_known(self): - """Tests that len raises ValueError when start is not known - """ + """Tests that len raises ValueError when start is not known""" with pytest.raises(ValueError): - feature = Feature('test') + feature = Feature("test") feature.stop = 10 len(feature) with pytest.raises(ValueError): - feature = Feature('test') + feature = Feature("test") feature.start = 1 len(feature) - @pytest.mark.parametrize("coordinates, expected_overlaps_contig_edge_flag", [ - ([(1, 4), (3, 10)], False), - ([(2, 4), (1, 1)], True), - ([(1, 4), (1, 10)], False), - ([(1, 4), (6, 10), (1, 2)], True), - ([(5, 10), (9, 10), (1, 4)], True), - - ]) - def test_overlaps_contig_edge(self, coordinates, expected_overlaps_contig_edge_flag): - feature = Feature('ID') - feature.fill_annotations(start=1, stop=10, strand='+', coordinates=coordinates) + @pytest.mark.parametrize( + "coordinates, expected_overlaps_contig_edge_flag", + [ + ([(1, 4), (3, 10)], False), + ([(2, 4), (1, 1)], True), + ([(1, 4), (1, 10)], False), + ([(1, 4), (6, 10), (1, 2)], True), + ([(5, 10), (9, 10), (1, 4)], True), + ], + ) + def test_overlaps_contig_edge( + self, coordinates, expected_overlaps_contig_edge_flag + ): + feature = Feature("ID") + feature.fill_annotations(start=1, stop=10, strand="+", coordinates=coordinates) assert feature.overlaps_contig_edge == expected_overlaps_contig_edge_flag class TestRNA: - """Tests RNA Class - """ + """Tests RNA Class""" @pytest.fixture def rna(self) -> Generator[RNA, None, None]: - """Generate a basic gene for tests - """ - yield RNA('rna') + """Generate a basic gene for tests""" + yield RNA("rna") def test_create_gene_object(self, rna): - """Tests that a Gene object can be created with a valid gene_id - """ - assert rna.ID == 'rna' + """Tests that a Gene object can be created with a valid gene_id""" + assert rna.ID == "rna" class TestGene: - """Tests Gene class - """ + """Tests Gene class""" @pytest.fixture def gene(self) -> Generator[Gene, None, None]: - """Generate a basic gene for tests - """ - yield Gene('gene') + """Generate a basic gene for tests""" + yield Gene("gene") def test_create_gene_object(self, gene): - """Tests that a Gene object can be created with a valid gene_id - """ - assert gene.ID == 'gene' + """Tests that a Gene object can be created with a valid gene_id""" + assert gene.ID == "gene" assert gene.position is None assert gene._family is None assert gene._RGP is None @@ -226,155 +212,151 @@ def test_create_gene_object(self, gene): assert gene._frame is None def test_fill_annotations(self, gene): - """Tests that Gene annotations can be filled with valid parameters - """ - gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code=4) + """Tests that Gene annotations can be filled with valid parameters""" + gene.fill_annotations(start=1, stop=10, strand="+", position=10, genetic_code=4) assert gene.position == 10 assert gene.genetic_code == 4 def test_fill_annotations_type_error(self, gene): - """Tests that Gene annotations cannot be filled with invalid parameters - """ + """Tests that Gene annotations cannot be filled with invalid parameters""" with pytest.raises(TypeError): - gene.fill_annotations(start=1, stop=10, strand='+', position='10', genetic_code=4) + gene.fill_annotations( + start=1, stop=10, strand="+", position="10", genetic_code=4 + ) with pytest.raises(TypeError): - gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code="4") + gene.fill_annotations( + start=1, stop=10, strand="+", position=10, genetic_code="4" + ) @pytest.mark.parametrize("frame", [0, 1, 2]) def test_set_frame(self, frame): - """Tests that frame can be set - """ - gene = Gene('gene') + """Tests that frame can be set""" + gene = Gene("gene") gene.frame = frame assert gene._frame == frame @pytest.mark.parametrize("frame", [0, 1, 2]) def test_get_frame(self, frame): - """Tests that frame can be getting - """ - gene = Gene('gene') + """Tests that frame can be getting""" + gene = Gene("gene") gene.frame = frame assert gene.frame == frame def test_raise_assertion_error_if_frame_not_set(self): - """Tests that frame cannot be return if it has not been set - """ - gene = Gene('gene') + """Tests that frame cannot be return if it has not been set""" + gene = Gene("gene") with pytest.raises(AssertionError): _ = gene.frame def test_raise_assertion_error_if_frame_already_set(self): - """Tests that frame cannot be set if it has already been set - """ - gene = Gene('gene') + """Tests that frame cannot be set if it has already been set""" + gene = Gene("gene") gene.frame = 1 with pytest.raises(AssertionError): gene.frame = 2 @pytest.mark.parametrize("frame", [3, "1", 1.5]) def test_raise_value_error_if_frame_not_0_1_or_2(self, frame): - """Tests that frame cannot be set with value different from 0, 1 or 2 - """ - gene = Gene('gene') + """Tests that frame cannot be set with value different from 0, 1 or 2""" + gene = Gene("gene") with pytest.raises(ValueError): gene.frame = frame @pytest.mark.parametrize("frame", [0, 1, 2]) def test_fill_partial_gene(self, frame): - """Tests that Gene annotations can be filled with partial genes - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', is_partial=True, frame=frame) + """Tests that Gene annotations can be filled with partial genes""" + gene = Gene("gene") + gene.fill_annotations( + start=1, stop=10, strand="+", is_partial=True, frame=frame + ) assert gene.is_partial is True assert gene.frame == frame def test_add_protein(self, gene): - """Tests that a protein sequence can be added to a Gene object - """ - gene.add_protein('MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA') - assert gene.protein == 'MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA' + """Tests that a protein sequence can be added to a Gene object""" + gene.add_protein( + "MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA" + ) + assert ( + gene.protein + == "MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA" + ) def test_add_protein_non_string(self, gene): - """Tests that a non-string protein sequence cannot be added to a Gene object - """ + """Tests that a non-string protein sequence cannot be added to a Gene object""" with pytest.raises(TypeError): gene.add_protein(123) def test_set_family(self, gene): - """Tests that family setter sets family with the valid type - """ - family = GeneFamily(0, 'family') + """Tests that family setter sets family with the valid type""" + family = GeneFamily(0, "family") gene.family = family assert gene.family == family def test_set_family_not_instance_gene_family(self, gene): - """Tests that family setter return TypeError if sets family is not instance GeneFamily - """ + """Tests that family setter return TypeError if sets family is not instance GeneFamily""" with pytest.raises(TypeError): gene.family = 4 def test_set_rgp(self, gene): - """Tests that RGP setter sets family with the valid type - """ + """Tests that RGP setter sets family with the valid type""" region = Region(0) gene.RGP = region assert gene.RGP == region def test_set_rgp_not_instance_region(self, gene): - """Tests that family setter return TypeError if sets rgp is not instance Region - """ + """Tests that family setter return TypeError if sets rgp is not instance Region""" with pytest.raises(TypeError): gene.RGP = 4 class TestContig: - """Tests Contig class - """ + """Tests Contig class""" @pytest.fixture def contig(self) -> Generator[Contig, None, None]: - """Generate basic contig for tests - """ + """Generate basic contig for tests""" yield Contig(0, "contig") @pytest.fixture def gene(self) -> Generator[Gene, None, None]: - """Generate basic gene for tests - """ - gene = Gene('test_gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + """Generate basic gene for tests""" + gene = Gene("test_gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0, genetic_code=4) yield gene @pytest.fixture def genes(self) -> Generator[Tuple[Gene, Gene, Gene], None, None]: - """Generate three basic genes for tests - """ - gene1 = Gene('test_gene1') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) - gene2 = Gene('test_gene2') - gene2.fill_annotations(start=11, stop=20, strand='+', position=1, genetic_code=4) - gene3 = Gene('test_gene3') - gene3.fill_annotations(start=21, stop=30, strand='+', position=2, genetic_code=4) + """Generate three basic genes for tests""" + gene1 = Gene("test_gene1") + gene1.fill_annotations(start=1, stop=10, strand="+", position=0, genetic_code=4) + gene2 = Gene("test_gene2") + gene2.fill_annotations( + start=11, stop=20, strand="+", position=1, genetic_code=4 + ) + gene3 = Gene("test_gene3") + gene3.fill_annotations( + start=21, stop=30, strand="+", position=2, genetic_code=4 + ) yield gene1, gene2, gene3 def test_create_contig(self, contig): - """Tests that a contig is correctly created - """ + """Tests that a contig is correctly created""" assert contig.name == "contig" assert not contig.is_circular - assert contig._rna_getter == set() # Saving the rna annotations. We're not using them in the vast majority of cases. + assert ( + contig._rna_getter == set() + ) # Saving the rna annotations. We're not using them in the vast majority of cases. assert contig._genes_getter == {} assert contig._genes_position == [] assert contig._organism is None def tests_write_contig(self, contig): - """Tests that write contig return contig name as string - """ + """Tests that write contig return contig name as string""" assert str(contig) == "contig" def test_add_gene(self, gene, contig): - """Tests that a gene can be added to the contig - """ + """Tests that a gene can be added to the contig""" contig.add(gene) assert len(contig._genes_getter) == 1 assert len(contig._genes_position) == 1 @@ -382,44 +364,45 @@ def test_add_gene(self, gene, contig): assert contig._genes_position[0] == gene def test_add_gene_at_far_position(self, gene, contig): - """Tests that a gene can be added at each position and between position are fill with None - """ + """Tests that a gene can be added at each position and between position are fill with None""" contig.add(gene) new_gene = Gene("Gene2") - new_gene.fill_annotations(start=50, stop=72, strand='+', position=6, genetic_code=4) + new_gene.fill_annotations( + start=50, stop=72, strand="+", position=6, genetic_code=4 + ) contig.add(new_gene) assert len(contig._genes_position) == 7 assert contig._genes_position[1:6] == [None] * 5 def test_add_gene_not_instance_gene(self, contig): - """Tests that the contig cannot be fill with a non-gene object - """ + """Tests that the contig cannot be fill with a non-gene object""" with pytest.raises(TypeError): contig.add(1) with pytest.raises(TypeError): - contig[1] = '4' + contig[1] = "4" def test_add_gene_with_start_already_taken(self, contig): - """Tests that the contig cannot be fill with a non-gene object - """ - initial_gene = Gene('test_gene') - initial_gene.fill_annotations(start=1, stop=12, strand='+', position=4, genetic_code=4) + """Tests that the contig cannot be fill with a non-gene object""" + initial_gene = Gene("test_gene") + initial_gene.fill_annotations( + start=1, stop=12, strand="+", position=4, genetic_code=4 + ) contig.add(initial_gene) with pytest.raises(ValueError): - new_identical_gene = Gene('test_gene') - new_identical_gene.fill_annotations(start=1, stop=12, strand='+', position=2, genetic_code=4) + new_identical_gene = Gene("test_gene") + new_identical_gene.fill_annotations( + start=1, stop=12, strand="+", position=2, genetic_code=4 + ) contig.add(new_identical_gene) def test_add_gene_without_position(self, contig): - """Test that adding a gene not fill with position raise an AttributeError - """ + """Test that adding a gene not fill with position raise an AttributeError""" with pytest.raises(AttributeError): - gene = Gene('test_gene') + gene = Gene("test_gene") contig.add(gene) def test_number_of_genes(self, genes, contig): - """Tests len method - """ + """Tests len method""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) @@ -428,14 +411,12 @@ def test_number_of_genes(self, genes, contig): assert contig.number_of_genes == 3 def test_get_gene(self, gene, contig): - """Tests that a gene can be retrieved by its position - """ + """Tests that a gene can be retrieved by its position""" contig.add(gene) assert contig[0] == gene def test_get_genes(self, genes, contig): - """Tests that a list of genes within a range can be retrieved - """ + """Tests that a list of genes within a range can be retrieved""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) @@ -443,28 +424,25 @@ def test_get_genes(self, genes, contig): assert set(contig.get_genes(0, 2)) == set(genes) def test_get_gene_with_non_integer_index(self, contig): - """Tests that a gene cannot be retrieved with an index that is not an integer - """ + """Tests that a gene cannot be retrieved with an index that is not an integer""" with pytest.raises(TypeError): - _ = contig['a'] + _ = contig["a"] def test_get_genes_with_non_integer_begin_and_end_positions(self, genes, contig): - """Tests that genes cannot be retrieved with non-integer begin and end positions - """ + """Tests that genes cannot be retrieved with non-integer begin and end positions""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) contig.add(gene3) with pytest.raises(TypeError): - contig.get_genes('a', 2) + contig.get_genes("a", 2) with pytest.raises(TypeError): - contig.get_genes(5, 'b') + contig.get_genes(5, "b") with pytest.raises(TypeError): - contig.get_genes('a', 'b') + contig.get_genes("a", "b") def test_get_genes_with_end_position_lower_than_begin_position(self, genes, contig): - """Tests that genes cannot be retrieved with end position lower than begin position - """ + """Tests that genes cannot be retrieved with end position lower than begin position""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) @@ -472,9 +450,10 @@ def test_get_genes_with_end_position_lower_than_begin_position(self, genes, cont with pytest.raises(ValueError): contig.get_genes(2, 0) - def test_get_genes_with_end_position_greater_than_last_position(self, genes, contig): - """Tests that genes cannot be retrieved with given end position greater than last gene position in the contig - """ + def test_get_genes_with_end_position_greater_than_last_position( + self, genes, contig + ): + """Tests that genes cannot be retrieved with given end position greater than last gene position in the contig""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) @@ -482,7 +461,9 @@ def test_get_genes_with_end_position_greater_than_last_position(self, genes, con with pytest.raises(IndexError): contig.get_genes(0, 3) - def test_get_genes_with_end_position_greater_than_last_position_with_outrange_ok(self, genes, contig): + def test_get_genes_with_end_position_greater_than_last_position_with_outrange_ok( + self, genes, contig + ): gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) @@ -490,134 +471,116 @@ def test_get_genes_with_end_position_greater_than_last_position_with_outrange_ok assert set(contig.get_genes(0, 5, outrange_ok=True)) == set(genes) def test_iterate_over_genes(self, genes, contig): - """Tests that all genes in the contig can be iterated over - """ + """Tests that all genes in the contig can be iterated over""" gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) contig.add(gene3) - assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position) + assert list(contig.genes) == sorted( + [gene1, gene2, gene3], key=lambda x: x.position + ) def test_add_rna(self, contig): - """Tests that an RNA can be added to the contig - """ - rna = RNA('test_rna') + """Tests that an RNA can be added to the contig""" + rna = RNA("test_rna") contig.add_rna(rna) assert list(contig.RNAs) == [rna] def test_set_organism(self, contig): - """Tests that an organism can be set to the contig - """ + """Tests that an organism can be set to the contig""" organism = Organism("organism") contig.organism = organism assert contig.organism == organism def test_set_organism_with_not_instance_organism(self, contig): - """Tests that the contig cannot be fill with a non-organism object - """ + """Tests that the contig cannot be fill with a non-organism object""" with pytest.raises(TypeError): contig.organism = 4 class TestOrganism: - """Tests Contig class - """ + """Tests Contig class""" @pytest.fixture def organism(self) -> Generator[Organism, None, None]: - """Generate a basic organism for test - """ - yield Organism('organism') + """Generate a basic organism for test""" + yield Organism("organism") @pytest.fixture def contig(self) -> Generator[Contig, None, None]: - """Generate a basic contig for test - """ + """Generate a basic contig for test""" yield Contig(0, "contig") @pytest.fixture def gene(self) -> Generator[Gene, None, None]: - """Generate a basic gene for test - """ - gene = Gene('test_gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + """Generate a basic gene for test""" + gene = Gene("test_gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0, genetic_code=4) yield gene def test_create_organism(self, organism): - """Tests that an Organism instance can be created with a valid name - """ - assert organism.name == 'organism' + """Tests that an Organism instance can be created with a valid name""" + assert organism.name == "organism" assert organism._contigs_getter == {} assert organism._families is None assert organism.bitarray is None def test_create_organism_empty_name(self): - """Tests that an Organism instance cannot be created with an empty name - """ + """Tests that an Organism instance cannot be created with an empty name""" with pytest.raises(AssertionError): - Organism('') + Organism("") def test_create_organism_with_name_not_string(self): - """Tests that an Organism instance cannot be created with a name not instance string - """ + """Tests that an Organism instance cannot be created with a name not instance string""" with pytest.raises(AssertionError): Organism(4) def tests_write_organism(self, organism): - """Tests that write organism return organism name as string - """ + """Tests that write organism return organism name as string""" assert str(organism) == "organism" def test_add_contig(self, organism, contig): - """Tests that a contig can be added to an Organism instance - """ + """Tests that a contig can be added to an Organism instance""" organism.add(contig) - assert organism._contigs_getter['contig'] == contig + assert organism._contigs_getter["contig"] == contig def test_add_contig_not_instance_contig(self, organism): - """Tests that a non Contig object cannot be added to an Organism instance - """ + """Tests that a non Contig object cannot be added to an Organism instance""" with pytest.raises(AssertionError): organism.add(4) def test_add_contig_existing_name(self, organism, contig): - """Tests that a contig with an existing name cannot be added to an Organism instance - """ + """Tests that a contig with an existing name cannot be added to an Organism instance""" organism.add(contig) with pytest.raises(KeyError): - organism.add(Contig(0, 'contig')) + organism.add(Contig(0, "contig")) def test_get_contig(self, organism, contig): - """Tests that a contig can be retrieved from an Organism instance - """ + """Tests that a contig can be retrieved from an Organism instance""" organism.add(contig) - assert organism.get('contig') == contig + assert organism.get("contig") == contig def test_get_contig_not_instance_string(self, organism): - """Tests that a non Contig object cannot be added to an Organism instance - """ + """Tests that a non Contig object cannot be added to an Organism instance""" with pytest.raises(TypeError): organism.get(4) def test_get_nonexistent_contig(self, organism): - """Tests that a non-existent contig cannot be retrieved from an Organism instance - """ + """Tests that a non-existent contig cannot be retrieved from an Organism instance""" with pytest.raises(KeyError): - organism.get('contig1') + organism.get("contig1") def test_number_of_contigs(self, organism): - """Tests that the number of contigs in an organism instance can be retrieved - """ - organism.add(Contig(1, 'contig1')) - organism.add(Contig(2, 'contig2')) + """Tests that the number of contigs in an organism instance can be retrieved""" + organism.add(Contig(1, "contig1")) + organism.add(Contig(2, "contig2")) assert organism.number_of_contigs == 2 assert isinstance(len(organism), int) assert len(organism) == 2 def test_get_families(self, organism, contig, gene): - """Tests that gene families in an organism can be retrieved - """ + """Tests that gene families in an organism can be retrieved""" family = GeneFamily(0, "fam") family.add(gene) gene.fill_parents(organism, contig) @@ -626,8 +589,7 @@ def test_get_families(self, organism, contig, gene): assert set(organism.families) == {family} def test_number_of_families(self, organism, contig, gene): - """Tests that the number of gene families in an organism instance can be retrieved - """ + """Tests that the number of gene families in an organism instance can be retrieved""" family = GeneFamily(0, "fam") family.add(gene) gene.fill_parents(organism, contig) @@ -636,30 +598,29 @@ def test_number_of_families(self, organism, contig, gene): assert organism.number_of_families() == 1 def tests_get_genes(self, organism, contig, gene): - """Tests that genes in an organism can be retrieved - """ + """Tests that genes in an organism can be retrieved""" gene.fill_parents(organism, contig) organism.add(contig) contig.add(gene) assert set(organism.genes) == {gene} def test_number_of_genes(self, organism, contig, gene): - """Tests that the number of genes in an organism instance can be retrieved - """ + """Tests that the number of genes in an organism instance can be retrieved""" gene.fill_parents(organism, contig) organism.add(contig) contig.add(gene) assert organism.number_of_genes() == 1 def test_mk_bitarray(self, organism, contig): - """Tests that a bitarray can be created for an Organism instance - """ - fam1 = GeneFamily(1, 'fam1') - fam2 = GeneFamily(2, 'fam2') - gene1 = Gene('gene1') - gene2 = Gene('gene2') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) - gene2.fill_annotations(start=11, stop=19, strand='+', position=1, genetic_code=4) + """Tests that a bitarray can be created for an Organism instance""" + fam1 = GeneFamily(1, "fam1") + fam2 = GeneFamily(2, "fam2") + gene1 = Gene("gene1") + gene2 = Gene("gene2") + gene1.fill_annotations(start=1, stop=10, strand="+", position=0, genetic_code=4) + gene2.fill_annotations( + start=11, stop=19, strand="+", position=1, genetic_code=4 + ) fam1.add(gene1) fam2.add(gene2) contig[gene1.start] = gene1 diff --git a/tests/test_metadata.py b/tests/test_metadata.py index dff4ad05..9a823402 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -10,54 +10,45 @@ class TestMetadata: @pytest.fixture def metadata(self) -> Generator[Metadata, None, None]: - """Create a simple metadata - """ + """Create a simple metadata""" yield Metadata("source", attribute1="value1", attribute2=["value2", "value3"]) def test_constructor(self, metadata): - """Tests that the Metadata object is created successfully with a valid source and attributes - """ + """Tests that the Metadata object is created successfully with a valid source and attributes""" assert metadata.source == "source" assert metadata.attribute1 == "value1" assert metadata.attribute2 == "value2,value3" def test_constructor_with_empty_source_name(self): - """Tests that a ValueError is raised when creating a Metadata object with an empty source name - """ + """Tests that a ValueError is raised when creating a Metadata object with an empty source name""" with pytest.raises(ValueError): Metadata("", attribute="value") def test_constructor_with_non_string_source_name(self): - """Tests that a TypeError is raised when creating a Metadata object with a non-string source name - """ + """Tests that a TypeError is raised when creating a Metadata object with a non-string source name""" with pytest.raises(TypeError): Metadata(123, attribute="value") def test_constructor_with_no_attributes(self): - """Tests that an Exception is raised when creating a Metadata object with no attributes - """ + """Tests that an Exception is raised when creating a Metadata object with no attributes""" with pytest.raises(Exception): Metadata("source") def test_get_existing_attribute_value(self, metadata): - """Tests that the value of an existing attribute is returned correctly - """ + """Tests that the value of an existing attribute is returned correctly""" assert metadata.attribute1 == "value1" def test_get_non_existing_attribute_value(self, metadata): - """Tests that an AttributeError is raised when getting the value of a non-existing attribute - """ + """Tests that an AttributeError is raised when getting the value of a non-existing attribute""" with pytest.raises(AttributeError): _ = metadata.non_existing_attribute def test_attribute_fields(self, metadata): - """Tests that the 'fields' method returns a list of all the attributes in the Metadata object - """ + """Tests that the 'fields' method returns a list of all the attributes in the Metadata object""" assert metadata.fields == ["attribute1", "attribute2"] def test_length(self, metadata): - """Tests that the number_of_attribute method returns the correct number of attributes in the Metadata object - """ + """Tests that the number_of_attribute method returns the correct number of attributes in the Metadata object""" assert isinstance(len(metadata), int) assert len(metadata) == 2 @@ -71,7 +62,11 @@ def metadata(self) -> Generator[Set[Metadata], None, None]: """ metadata = set() for i in range(randint(5, 10)): - metadata.add(Metadata(f"source_{i}", **{f"attr_{j}": j for j in range(randint(1, 5))})) + metadata.add( + Metadata( + f"source_{i}", **{f"attr_{j}": j for j in range(randint(1, 5))} + ) + ) yield metadata @pytest.fixture @@ -86,41 +81,45 @@ def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: yield metafeatures def test_add_metadata(self, metafeatures, metadata): - """Tests that metadata can be added to the metadata getter - """ - assert all(list(metafeatures._metadata_getter[meta.source].values()) == [meta] for meta in metadata) + """Tests that metadata can be added to the metadata getter""" + assert all( + list(metafeatures._metadata_getter[meta.source].values()) == [meta] + for meta in metadata + ) def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): - """Tests that all the metadata features corresponding to a source can be retrieved - """ - assert all(list(metafeatures.get_metadata_by_source(meta.source).values()) == [meta] for meta in metadata) + """Tests that all the metadata features corresponding to a source can be retrieved""" + assert all( + list(metafeatures.get_metadata_by_source(meta.source).values()) == [meta] + for meta in metadata + ) def test_remove_source_from_feature(self, metafeatures): - """Tests that a source can be removed from the feature - """ + """Tests that a source can be removed from the feature""" metadata = Metadata("source_del", attribute1="value") metafeatures.add_metadata(metadata) metafeatures.del_metadata_by_source("source_del") assert metafeatures.get_metadata_by_source("source_del") is None def test_generate_all_metadata_sources(self, metafeatures, metadata): - """Tests that all metadata sources can be generated - """ + """Tests that all metadata sources can be generated""" assert list(metafeatures.sources) == [meta.source for meta in metadata] def test_get_metadata_by_attribute_values(self, metafeatures): - """Tests that metadata can be retrieved based on attribute values - """ + """Tests that metadata can be retrieved based on attribute values""" meta = Metadata("source_test", attribute1="value_to_retrieve") # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) metafeatures.add_metadata(meta) # metafeatures[meta_list.source] = meta_list - assert list(metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve")) == [meta] + assert list( + metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve") + ) == [meta] # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] - def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metadata): - """Tests that the maximum number of metadata for one source can be retrieved - """ + def test_get_maximum_number_of_metadata_for_one_source( + self, metafeatures, metadata + ): + """Tests that the maximum number of metadata for one source can be retrieved""" metadata1 = Metadata("source_max", attribute1="value1") metadata2 = Metadata("source_max", attribute2="value2") metafeatures.add_metadata(metadata1) @@ -128,7 +127,6 @@ def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metad assert metafeatures.max_metadata_by_source() == ("source_max", 2) def test_metadata_is_not_with_type_metadata(self, metafeatures): - """Tests that an AssertionError is raised when metadata is not with type Metadata - """ + """Tests that an AssertionError is raised when metadata is not with type Metadata""" with pytest.raises(AssertionError): metafeatures.add_metadata("not_metadata") diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 3a7f958a..0f4e27a9 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -48,30 +48,23 @@ def test_cstr(self, pangenome): "_spot_getter": dict, "_module_getter": dict, "status": dict, - "parameters": dict + "parameters": dict, } status_keys = [ - 'genomesAnnotated', - 'geneSequences', - 'genesClustered', - 'defragmented', - 'geneFamilySequences', - 'neighborsGraph', - 'partitioned', - 'predictedRGP', - 'spots', - 'modules', - "metadata", - "metasources" - ] - metadata_keys = [ - "families", - "genes", - "genomes", - "RGPs", + "genomesAnnotated", + "geneSequences", + "genesClustered", + "defragmented", + "geneFamilySequences", + "neighborsGraph", + "partitioned", + "predictedRGP", "spots", - "modules" + "modules", + "metadata", + "metasources", ] + metadata_keys = ["families", "genes", "genomes", "RGPs", "spots", "modules"] for attr, attr_type in pangenome_attr_type.items(): assert hasattr(pangenome, attr) assert isinstance(pangenome.__getattribute__(attr), attr_type) @@ -113,13 +106,11 @@ def test_add_file_is_not_path(self, pangenome): class TestPangenomeOrganism(TestPangenome): - """This class tests methods in pangenome class associated to organisms. - """ + """This class tests methods in pangenome class associated to organisms.""" @pytest.fixture def organism(self) -> Generator[Organism, None, None]: - """Create a basic organism - """ + """Create a basic organism""" yield Organism(name="organism") def test_add_organism(self, pangenome, organism): @@ -166,7 +157,7 @@ def test_get_organism_not_in_pangenome(self, pangenome): :param pangenome: Pangenome object to test method """ with pytest.raises(KeyError): - pangenome.get_organism('org') + pangenome.get_organism("org") def test_get_organism_with_name_not_instance_string(self, pangenome): """Ensure that it raises an AssertionError when a non-string name is passed as organism name. @@ -210,8 +201,7 @@ def test_number_of_organisms(self, add_organisms, pangenome, organisms): class TestPangenomeGeneFamilies(TestPangenome): - """This class tests methods in pangenome class associated to gene families. - """ + """This class tests methods in pangenome class associated to gene families.""" @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: @@ -285,7 +275,7 @@ def families(self) -> Generator[Set[GeneFamily], None, None]: """ families = set() for i in range(randint(5, 20)): - family = GeneFamily(family_id=i, name=f'family{i}') + family = GeneFamily(family_id=i, name=f"family{i}") families.add(family) yield families @@ -311,8 +301,7 @@ def test_number_of_gene_families_empty(self, add_families, pangenome, families): class TestPangenomeGene(TestPangenome): - """This class tests methods in pangenome class associated to Gene. - """ + """This class tests methods in pangenome class associated to Gene.""" @pytest.fixture def genes(self) -> Generator[Set[Gene], None, None]: @@ -441,8 +430,7 @@ def test_get_multigenic(self, pangenome): class TestPangenomeEdge(TestPangenome): - """This class tests methods in pangenome class associated to Edge. - """ + """This class tests methods in pangenome class associated to Edge.""" @staticmethod def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: @@ -512,8 +500,7 @@ def test_number_of_edges(self, pangenome, gene_pair): class TestPangenomeBinary(TestPangenomeOrganism, TestPangenomeGeneFamilies): - """This class tests methods in pangenome class associated to binary methods. - """ + """This class tests methods in pangenome class associated to binary methods.""" # TODO Better test for this part def test_get_org_index(self, add_organisms, pangenome): @@ -531,7 +518,9 @@ def test_get_org_index(self, add_organisms, pangenome): assert index not in index_know index_know.add(index) - def test_compute_family_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + def test_compute_family_bitarrays_with_index_already_computed( + self, add_organisms, add_families, pangenome + ): """Tests the compute_family_bitarrays function in Pangenome class :param add_families: Add families to the pangenome object @@ -540,7 +529,9 @@ def test_compute_family_bitarrays_with_index_already_computed(self, add_organism org_idx = pangenome.get_org_index() assert pangenome.compute_family_bitarrays() == org_idx - def test_compute_family_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + def test_compute_family_bitarrays_without_index_already_computed( + self, add_organisms, add_families, pangenome + ): """Tests the compute_family_bitarrays function of the Pangenome class. :param add_families: Add families to the pangenome @@ -565,7 +556,9 @@ def test_get_fam_index(self, add_families, pangenome): assert index not in index_know index_know.add(index) - def test_compute_org_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + def test_compute_org_bitarrays_with_index_already_computed( + self, add_organisms, add_families, pangenome + ): """Tests the compute_family_bitarrays function in Pangenome class :param add_families: Add families to the pangenome object @@ -574,7 +567,9 @@ def test_compute_org_bitarrays_with_index_already_computed(self, add_organisms, fams_index = pangenome.get_fam_index() assert pangenome.compute_org_bitarrays() == fams_index - def test_compute_org_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + def test_compute_org_bitarrays_without_index_already_computed( + self, add_organisms, add_families, pangenome + ): """Tests the compute_family_bitarrays function of the Pangenome class. :param add_families: Add families to the pangenome @@ -586,8 +581,7 @@ def test_compute_org_bitarrays_without_index_already_computed(self, add_organism class TestPangenomeRGP(TestPangenome): - """This class tests methods in pangenome class associated to Region - """ + """This class tests methods in pangenome class associated to Region""" def test_add_region(self, pangenome): """Tests the add_region method in the Pangenome class. @@ -654,8 +648,7 @@ def test_number_of_rgp(self, pangenome): class TestPangenomeSpot(TestPangenome): - """This class tests methods in pangenome class associated to Spot. - """ + """This class tests methods in pangenome class associated to Spot.""" def test_add_spot(self, pangenome): """Tests the add_spot method in the Pangenome class. @@ -723,8 +716,7 @@ def test_number_of_spots(self, pangenome): class TestPangenomeModule(TestPangenome): - """This class tests methods in pangenome class associated to Modules. - """ + """This class tests methods in pangenome class associated to Modules.""" def test_add_module(self, pangenome): """Tests the add_module method in the Pangenome class. @@ -792,8 +784,7 @@ def test_number_of_modules(self, pangenome): class TestPangenomeMetadata(TestPangenome): - """This class tests methods in pangenome class associated to Metadata. - """ + """This class tests methods in pangenome class associated to Metadata.""" @pytest.fixture def add_element_to_pangenome(self, pangenome): @@ -810,7 +801,7 @@ def add_element_to_pangenome(self, pangenome): ctg = Contig(0, "Ctg") org.add(ctg) gene = Gene("Gene") - gene.fill_annotations(start=1, stop=100, position=0, strand='+') + gene.fill_annotations(start=1, stop=100, position=0, strand="+") gene.add_metadata(metadata=metadata) ctg.add(gene) pangenome.add_organism(org) @@ -830,12 +821,25 @@ def test_select_elem(self, add_element_to_pangenome, pangenome): :param add_element_to_pangenome: Add elements to the pangenome :param pangenome: Access the pangenome object """ - assert all(isinstance(elem, GeneFamily) for elem in set(pangenome.select_elem("families"))) - assert all(isinstance(elem, Organism) for elem in set(pangenome.select_elem("genomes"))) - assert all(isinstance(elem, Gene) for elem in set(pangenome.select_elem("genes"))) - assert all(isinstance(elem, Region) for elem in set(pangenome.select_elem("RGPs"))) - assert all(isinstance(elem, Spot) for elem in set(pangenome.select_elem("spots"))) - assert all(isinstance(elem, Module) for elem in set(pangenome.select_elem("modules"))) + assert all( + isinstance(elem, GeneFamily) + for elem in set(pangenome.select_elem("families")) + ) + assert all( + isinstance(elem, Organism) for elem in set(pangenome.select_elem("genomes")) + ) + assert all( + isinstance(elem, Gene) for elem in set(pangenome.select_elem("genes")) + ) + assert all( + isinstance(elem, Region) for elem in set(pangenome.select_elem("RGPs")) + ) + assert all( + isinstance(elem, Spot) for elem in set(pangenome.select_elem("spots")) + ) + assert all( + isinstance(elem, Module) for elem in set(pangenome.select_elem("modules")) + ) with pytest.raises(KeyError): pangenome.select_elem("error") @@ -847,7 +851,7 @@ def test_metadata_sources(self, add_element_to_pangenome, pangenome): """ for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: assert isinstance(pangenome.metadata_sources(metatype), set) - assert pangenome.metadata_sources(metatype) == {'source'} + assert pangenome.metadata_sources(metatype) == {"source"} def test_metadata(self, add_element_to_pangenome, pangenome): """Tests the metadata generator of the Pangenome class. @@ -859,7 +863,7 @@ def test_metadata(self, add_element_to_pangenome, pangenome): for metadata_gen in pangenome.metadata(metatype): for metadata in metadata_gen: assert isinstance(metadata, Metadata) - assert metadata.source == 'source' + assert metadata.source == "source" def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): """Tests the metadata generator filtered by metadata attribute of the Pangenome class. @@ -867,13 +871,19 @@ def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): :param add_element_to_pangenome: Add elements to the pangenome :param pangenome: Access the pangenome object """ - for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, - "spots": Spot, "modules": Module}.items(): + for metatype, expected_type in { + "families": GeneFamily, + "genomes": Organism, + "genes": Gene, + "RGPs": Region, + "spots": Spot, + "modules": Module, + }.items(): for elem in pangenome.get_elem_by_metadata(metatype, attribute="attr"): assert isinstance(elem, expected_type) for metadata in elem.metadata: assert isinstance(metadata, Metadata) - assert metadata.source == 'source' + assert metadata.source == "source" def test_get_elem_by_source(self, add_element_to_pangenome, pangenome): """Tests the metadata generator filtered by source of the Pangenome class. @@ -881,10 +891,18 @@ def test_get_elem_by_source(self, add_element_to_pangenome, pangenome): :param add_element_to_pangenome: Add elements to the pangenome :param pangenome: Access the pangenome object """ - for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, - "spots": Spot, "modules": Module}.items(): - for elem in pangenome.get_elem_by_source(source='source', metatype=metatype): + for metatype, expected_type in { + "families": GeneFamily, + "genomes": Organism, + "genes": Gene, + "RGPs": Region, + "spots": Spot, + "modules": Module, + }.items(): + for elem in pangenome.get_elem_by_source( + source="source", metatype=metatype + ): assert isinstance(elem, expected_type) for metadata in elem.metadata: assert isinstance(metadata, Metadata) - assert metadata.source == 'source' + assert metadata.source == "source" diff --git a/tests/test_region.py b/tests/test_region.py index dbc35bf9..b74c23e0 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -8,29 +8,33 @@ from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig, Organism + @pytest.fixture def contig() -> Contig: - contig = Contig(0, 'contig_name') + contig = Contig(0, "contig_name") contig.length = 200 return contig + @pytest.fixture def genes(contig) -> Generator[Set[Gene], None, None]: - """Create a set of genes to fill gene families - """ + """Create a set of genes to fill gene families""" genes = [] for i in range(0, 11): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, stop=10 * (i + 1), strand="+", position=i, genetic_code=4 + ) gene.contig = contig genes.append(gene) return genes + @pytest.fixture def gene(contig) -> Gene: - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) - contig = Contig(0, 'contig_name') + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) + contig = Contig(0, "contig_name") contig.length = 10 gene.contig = contig return gene @@ -38,8 +42,7 @@ def gene(contig) -> Gene: @pytest.fixture def families(genes) -> Generator[Set[GeneFamily], None, None]: - """Create a set of gene families fill with genes to test edges - """ + """Create a set of gene families fill with genes to test edges""" families = set() genes = list(genes) nb_families = randint(2, 10) @@ -99,27 +102,24 @@ def organisms(genes) -> Generator[Set[Organism], None, None]: class TestRegion: - """Tests for region class - """ - attr_val = {'score': 0, 'starter': None, 'stopper': None} + """Tests for region class""" + + attr_val = {"score": 0, "starter": None, "stopper": None} @pytest.fixture def region(self) -> Generator[Region, None, None]: - """Generate a region object to test class - """ + """Generate a region object to test class""" yield Region("RGP") def test_cstr(self, region: Region): - """Tests that region is constructed as expected - """ + """Tests that region is constructed as expected""" assert isinstance(region, Region) assert region.name == "RGP" assert isinstance(region._genes_getter, dict) def test_add_gene(self, region, gene): - """Tests that genes can be aadded to a region - """ - + """Tests that genes can be aadded to a region""" + region.add(gene) assert len(region._genes_getter) == 1 @@ -129,49 +129,44 @@ def test_add_gene(self, region, gene): assert gene.RGP == region def test_add_gene_not_is_instance_gene(self, region): - """Test that adding object with instance not Gene return a TypeError - """ + """Test that adding object with instance not Gene return a TypeError""" with pytest.raises(TypeError): region.add(0) def test_add_gene_not_fill_with_position(self, region): - """Test that adding gene not fill with position return an AttributeError - """ + """Test that adding gene not fill with position return an AttributeError""" with pytest.raises(AttributeError): - region.add(Gene('gene')) + region.add(Gene("gene")) def test_add_genes_at_position_already_taken(self, region, contig): - """Test that adding genes with same position return a ValueError - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + """Test that adding genes with same position return a ValueError""" + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) gene.contig = contig region.add(gene) with pytest.raises(KeyError): - another_gene = Gene('gene') - another_gene.fill_annotations(start=4, stop=12, strand='-', position=0) + another_gene = Gene("gene") + another_gene.fill_annotations(start=4, stop=12, strand="-", position=0) another_gene.contig = contig region.add(another_gene) def test_add_genes_from_different_contigs(self, region): - """Test that adding genes from different contigs return an Exception - """ - gene1, gene2 = Gene('gene_1'), Gene('gene_2') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0) - gene2.fill_annotations(start=11, stop=20, strand='+', position=1) - gene1.fill_parents(None, Contig(1, 'contig_1')) + """Test that adding genes from different contigs return an Exception""" + gene1, gene2 = Gene("gene_1"), Gene("gene_2") + gene1.fill_annotations(start=1, stop=10, strand="+", position=0) + gene2.fill_annotations(start=11, stop=20, strand="+", position=1) + gene1.fill_parents(None, Contig(1, "contig_1")) region.add(gene1) - gene2.fill_parents(None, Contig(2, 'contig_2')) + gene2.fill_parents(None, Contig(2, "contig_2")) with pytest.raises(Exception): region.add(gene2) def test_add_genes_from_different_organisms(self, region): - """Test that adding genes from different organisms return an Exception - """ - gene1, gene2 = Gene('gene_1'), Gene('gene_2') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0) - gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + """Test that adding genes from different organisms return an Exception""" + gene1, gene2 = Gene("gene_1"), Gene("gene_2") + gene1.fill_annotations(start=1, stop=10, strand="+", position=0) + gene2.fill_annotations(start=11, stop=20, strand="+", position=1) gene1.fill_parents(Organism("org_1")) region.add(gene1) gene2.fill_parents(Organism("org_2")) @@ -179,48 +174,42 @@ def test_add_genes_from_different_organisms(self, region): region.add(gene2) def test_get_genes(self, region): - """Tests that genes can be retrieved from the region - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + """Tests that genes can be retrieved from the region""" + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) region.add(gene) assert region.get(0) == gene def test_get_genes_with_position_not_integer(self, region): - """Tests that getting a gene with wrong type for position raise a TypeError - """ + """Tests that getting a gene with wrong type for position raise a TypeError""" with pytest.raises(TypeError): region.get("0") def test_get_genes_with_position_not_in_region(self, region): - """Tests that getting a gene at position not belonging in the region return a KeyError - """ + """Tests that getting a gene at position not belonging in the region return a KeyError""" with pytest.raises(KeyError): region.get(randint(0, 20)) def test_del_gene(self, region): - """Tests that genes can be deleted from the region - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + """Tests that genes can be deleted from the region""" + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) region.add(gene) assert region.get(0) == gene region.remove(0) assert 0 not in region._genes_getter def test_del_genes_with_position_not_integer(self, region): - """Tests that removing a gene with wrong type for position raise a TypeError - """ + """Tests that removing a gene with wrong type for position raise a TypeError""" with pytest.raises(TypeError): region.remove("0") def test_get_length(self, region, contig): - """Tests that the length of the region can be retrieved - """ - gene1, gene2 = Gene('gene_1'), Gene('gene_2') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0) + """Tests that the length of the region can be retrieved""" + gene1, gene2 = Gene("gene_1"), Gene("gene_2") + gene1.fill_annotations(start=1, stop=10, strand="+", position=0) gene1.contig = contig - gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + gene2.fill_annotations(start=11, stop=20, strand="+", position=1) gene2.contig = contig region.add(gene1) @@ -228,29 +217,26 @@ def test_get_length(self, region, contig): assert region.length == 20 def test_get_organism(self, region, contig): - """Tests that the organism linked to the region can be retrieved - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + """Tests that the organism linked to the region can be retrieved""" + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) gene.fill_parents(Organism("org"), contig) region.add(gene) - assert region.organism.name == 'org' + assert region.organism.name == "org" def test_get_contig(self, region): - """Tests that the contig linked to the region can be retrieved - """ - gene = Gene('gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + """Tests that the contig linked to the region can be retrieved""" + gene = Gene("gene") + gene.fill_annotations(start=1, stop=10, strand="+", position=0) gene.fill_parents(contig=Contig(0, "contig")) region.add(gene) - assert region.contig.name == 'contig' + assert region.contig.name == "contig" def test_is_whole_contig_true(self, region): - """Tests that the property is_whole_contig return True if the region has the same length as contig - """ - starter, stopper = Gene('starter'), Gene('stopper') - starter.fill_annotations(start=1, stop=10, strand='+', position=0) - stopper.fill_annotations(start=11, stop=20, strand='+', position=1) + """Tests that the property is_whole_contig return True if the region has the same length as contig""" + starter, stopper = Gene("starter"), Gene("stopper") + starter.fill_annotations(start=1, stop=10, strand="+", position=0) + stopper.fill_annotations(start=11, stop=20, strand="+", position=1) contig = Contig(0, "contig") contig[starter.start], contig[stopper.start] = starter, stopper starter.fill_parents(None, contig), stopper.fill_parents(None, contig) @@ -258,13 +244,17 @@ def test_is_whole_contig_true(self, region): assert region.is_whole_contig is True def test_is_whole_contig_false(self, region): - """Tests that the property is_whole_contig return False if the region has not the same length as contig - """ - before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=1, stop=10, strand='+', position=0) - starter.fill_annotations(start=11, stop=20, strand='+', position=1) - stopper.fill_annotations(start=21, stop=30, strand='+', position=2) - after.fill_annotations(start=31, stop=40, strand='+', position=3) + """Tests that the property is_whole_contig return False if the region has not the same length as contig""" + before, starter, stopper, after = ( + Gene("before"), + Gene("starter"), + Gene("stopper"), + Gene("after"), + ) + before.fill_annotations(start=1, stop=10, strand="+", position=0) + starter.fill_annotations(start=11, stop=20, strand="+", position=1) + stopper.fill_annotations(start=21, stop=30, strand="+", position=2) + after.fill_annotations(start=31, stop=40, strand="+", position=3) contig = Contig(0, "contig") contig[before.start], contig[after.start] = before, after contig[starter.start], contig[stopper.start] = starter, stopper @@ -274,18 +264,26 @@ def test_is_whole_contig_false(self, region): assert region.is_whole_contig is False def test_is_contig_border_true(self, region): - """Test that property is_contig_border return true if the region is bordering the contig - """ - before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=1, stop=10, strand='+', position=0) - starter.fill_annotations(start=11, stop=20, strand='+', position=1) - stopper.fill_annotations(start=21, stop=30, strand='+', position=2) - after.fill_annotations(start=31, stop=40, strand='+', position=3) + """Test that property is_contig_border return true if the region is bordering the contig""" + before, starter, stopper, after = ( + Gene("before"), + Gene("starter"), + Gene("stopper"), + Gene("after"), + ) + before.fill_annotations(start=1, stop=10, strand="+", position=0) + starter.fill_annotations(start=11, stop=20, strand="+", position=1) + stopper.fill_annotations(start=21, stop=30, strand="+", position=2) + after.fill_annotations(start=31, stop=40, strand="+", position=3) contig = Contig(0, "contig") before.fill_parents(None, contig), after.fill_parents(None, contig) starter.fill_parents(None, contig), stopper.fill_parents(None, contig) # Test bordering right - contig[before.start], contig[starter.start], contig[stopper.start] = before, starter, stopper + contig[before.start], contig[starter.start], contig[stopper.start] = ( + before, + starter, + stopper, + ) region.add(starter), region.add(stopper) assert region.is_contig_border is True # Test bordering left @@ -295,13 +293,17 @@ def test_is_contig_border_true(self, region): assert region.is_contig_border is True def test_is_contig_border_false(self, region): - """Tests that the property is_contig_border return False if the region is not bordering the contig - """ - before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=1, stop=10, strand='+', position=0) - starter.fill_annotations(start=11, stop=20, strand='+', position=1) - stopper.fill_annotations(start=21, stop=30, strand='+', position=2) - after.fill_annotations(start=31, stop=40, strand='+', position=3) + """Tests that the property is_contig_border return False if the region is not bordering the contig""" + before, starter, stopper, after = ( + Gene("before"), + Gene("starter"), + Gene("stopper"), + Gene("after"), + ) + before.fill_annotations(start=1, stop=10, strand="+", position=0) + starter.fill_annotations(start=11, stop=20, strand="+", position=1) + stopper.fill_annotations(start=21, stop=30, strand="+", position=2) + after.fill_annotations(start=31, stop=40, strand="+", position=3) contig = Contig(0, "contig") contig[before.start], contig[after.start] = before, after contig[starter.start], contig[stopper.start] = starter, stopper @@ -311,22 +313,19 @@ def test_is_contig_border_false(self, region): assert region.is_contig_border is False def test_is_contig_border_assertion_error_if_no_gene(self, region): - """Tests that an AssertionError is returned if there is no gene in the region - """ + """Tests that an AssertionError is returned if there is no gene in the region""" with pytest.raises(AssertionError): _ = region.is_contig_border def test_len(self, region, genes): - """Tests that the expected number of genes is retrieved in the region - """ + """Tests that the expected number of genes is retrieved in the region""" for gene in genes: region.add(gene) assert isinstance(len(region), int) assert len(region) == len(genes) def test_equality(self, genes): - """Test equality between two regions - """ + """Test equality between two regions""" region_1, region_2 = Region("RGP_1"), Region("RGP_2") for gene in genes: region_1.add(gene) @@ -339,31 +338,26 @@ def test_wrong_position(self, gene): with pytest.raises(ValueError): region[42] = gene - def test_not_equal(self, region, genes): - """Test difference between two regions - """ + """Test difference between two regions""" for gene in genes: region.add(gene) assert region != Region("other_RGP") def test_equality_with_not_instance_region(self, region): - """Test comparison between a region and another object raise a TypeError - """ + """Test comparison between a region and another object raise a TypeError""" with pytest.raises(TypeError): assert region == 4 def test_get_gene_families(self, region, genes, families): - """Tests that gene families can be retrieved from the region - """ + """Tests that gene families can be retrieved from the region""" for gene in genes: region.add(gene) assert all(isinstance(family, GeneFamily) for family in region.families) assert set(region.families) == families def test_get_number_of_gene_families(self, region, genes, families): - """Tests that gene families can be retrieved from the region - """ + """Tests that gene families can be retrieved from the region""" for gene in genes: region.add(gene) assert isinstance(region.number_of_families, int) @@ -375,55 +369,68 @@ def test_starter_stopper_simpler(self, region): check that the starter and stopper genes are correct. as well as the coordinates of the region """ - contig = Contig(0, 'contig_name') + contig = Contig(0, "contig_name") contig.length = 200 genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) contig.add(gene) genes.append(gene) - + region.add(genes[2]) assert region.starter == genes[2] assert region.stopper == genes[2] assert region.coordinates == genes[2].coordinates assert region.coordinates == [(genes[2].start, genes[2].stop)] - + region.add(genes[3]) region.add(genes[4]) assert region.starter == genes[2] assert region.stopper == genes[4] - assert region.coordinates == [(genes[2].start, genes[4].stop)] - + assert region.coordinates == [(genes[2].start, genes[4].stop)] def test_starter_stopper_with_contig_overlap(self, region): """ check that when region overlaps the contig, the starter and stopper gene are correct. as well as the coordinates of the region """ - contig = Contig(0, 'contig_name', is_circular=True) + contig = Contig(0, "contig_name", is_circular=True) contig.length = 400 genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) contig.add(gene) genes.append(gene) - + region.add(genes[9]) region.add(genes[0]) - assert region.starter == genes[9] assert region.stopper == genes[0] - assert region.coordinates == [(genes[9].start, contig.length), (1, genes[0].stop)] + assert region.coordinates == [ + (genes[9].start, contig.length), + (1, genes[0].stop), + ] def test_starter_stopper_with_contig_overlap_of_gene(self, region): """ @@ -431,32 +438,38 @@ def test_starter_stopper_with_contig_overlap_of_gene(self, region): """ - contig = Contig(0, 'contig_name', is_circular=True) + contig = Contig(0, "contig_name", is_circular=True) contig.length = 400 genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 5, stop=10 * (i + 1), strand='+', position=i) + gene.fill_annotations( + start=10 * i + 5, stop=10 * (i + 1), strand="+", position=i + ) gene.fill_parents(contig=contig) contig.add(gene) genes.append(gene) - + # add a gene that overlap the contig edge gene_that_overlap = Gene(f"gene_{str(10)}") - gene_that_overlap.fill_annotations(start=300, stop=5, strand='+', position=10, coordinates=[(300, 400), (1, 5)]) + gene_that_overlap.fill_annotations( + start=300, stop=5, strand="+", position=10, coordinates=[(300, 400), (1, 5)] + ) gene_that_overlap.fill_parents(contig=contig) contig.add(gene_that_overlap) genes.append(gene_that_overlap) - region.add(gene_that_overlap) assert region.starter == gene_that_overlap assert region.stopper == gene_that_overlap - assert region.coordinates == gene_that_overlap.coordinates - assert region.coordinates == [(gene_that_overlap.start, contig.length), (1, gene_that_overlap.stop)] - + assert region.coordinates == gene_that_overlap.coordinates + assert region.coordinates == [ + (gene_that_overlap.start, contig.length), + (1, gene_that_overlap.stop), + ] + # if we add more genes around the one that overlap region.add(genes[9]) @@ -465,17 +478,18 @@ def test_starter_stopper_with_contig_overlap_of_gene(self, region): region.add(genes[0]) assert region.starter == genes[7] assert region.stopper == genes[0] - assert region.coordinates == [(genes[7].start, contig.length), (1, genes[0].stop)] - - + assert region.coordinates == [ + (genes[7].start, contig.length), + (1, genes[0].stop), + ] - def test_get_bordering_genes(self,region): + def test_get_bordering_genes(self, region): """ Test simple border. for a contig with 10 genes. Add gene from 1 to 8 into the region. Gene at the border are 0 and 9 """ - contig = Contig(0, 'contig_name') + contig = Contig(0, "contig_name") contig.length = 200 family = GeneFamily(1, "test") @@ -484,28 +498,33 @@ def test_get_bordering_genes(self,region): genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) gene.family = family contig.add(gene) genes.append(gene) - + for gene in genes[1:-1]: region.add(gene) borders = region.get_bordering_genes(1, {}) assert borders == [[genes[0]], [genes[-1]]] - def test_get_bordering_genes_overlap_contigs(self, region): """ - Test border of a region that overlap contig edge. + Test border of a region that overlap contig edge. for a contig with 10 genes. Add gene from 0,1 and 9. left border is 8 and right is 2 """ - contig = Contig(0, 'contig_name', is_circular=True) + contig = Contig(0, "contig_name", is_circular=True) contig.length = 200 family = GeneFamily(1, "test") @@ -514,13 +533,19 @@ def test_get_bordering_genes_overlap_contigs(self, region): genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) gene.family = family contig.add(gene) genes.append(gene) - + region.add(genes[0]) region.add(genes[1]) region.add(genes[9]) @@ -533,7 +558,7 @@ def test_get_bordering_genes_whole_contig(self, region): Test border of a region that cover all the contig. Expect no border """ - contig = Contig(0, 'contig_name', is_circular=True) + contig = Contig(0, "contig_name", is_circular=True) contig.length = 200 family = GeneFamily(1, "test") @@ -542,7 +567,13 @@ def test_get_bordering_genes_whole_contig(self, region): genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) gene.family = family contig.add(gene) @@ -553,18 +584,17 @@ def test_get_bordering_genes_whole_contig(self, region): borders = region.get_bordering_genes(1, {}) - assert borders == [[], []] # no border - + assert borders == [[], []] # no border def test_get_bordering_genes_with_multigenic(self, region): """ - Test border with multigenic for a non circular contig with 10 genes. + Test border with multigenic for a non circular contig with 10 genes. Add gene from 3 to 7 into the region. gene 2 and 8 are mulitgenic Gene at the border are 1 on the left and 9 """ - contig = Contig(0, 'contig_name') + contig = Contig(0, "contig_name") contig.length = 200 family = GeneFamily(1, "test") @@ -576,7 +606,13 @@ def test_get_bordering_genes_with_multigenic(self, region): genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) if i == 2 or i == 8: gene.family = multigenic_family @@ -585,15 +621,13 @@ def test_get_bordering_genes_with_multigenic(self, region): contig.add(gene) genes.append(gene) - + for gene in genes[3:8]: region.add(gene) - borders = region.get_bordering_genes(1, {multigenic_family}) assert borders == [[genes[1]], [genes[9]]] - def test_get_bordering_genes_with_all_multigenic(self, region): """ @@ -601,7 +635,7 @@ def test_get_bordering_genes_with_all_multigenic(self, region): for a contig with 10 genes. Add gene from 1 to 8 into the region. no border as families are multigenic """ - contig = Contig(0, 'contig_name') + contig = Contig(0, "contig_name") contig.length = 200 family = GeneFamily(1, "test") @@ -610,76 +644,74 @@ def test_get_bordering_genes_with_all_multigenic(self, region): genes = [] for i in range(0, 10): gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_annotations( + start=10 * i + 1, + stop=10 * (i + 1), + strand="+", + position=i, + genetic_code=4, + ) gene.fill_parents(contig=contig) gene.family = family contig.add(gene) genes.append(gene) - + for gene in genes[1:-1]: region.add(gene) borders = region.get_bordering_genes(1, {family}) - assert borders == [[], []] # no border + assert borders == [[], []] # no border + class TestSpot: @pytest.fixture def spot(self) -> Generator[Spot, None, None]: - """Generate a spot for test - """ + """Generate a spot for test""" yield Spot(0) def test_cstr(self, spot): - """Tests that spot is constructed as expected - """ + """Tests that spot is constructed as expected""" assert spot.ID == 0 assert isinstance(spot._region_getter, dict) and len(spot._region_getter) == 0 assert isinstance(spot._uniqOrderedSet, dict) and len(spot._uniqOrderedSet) == 0 assert isinstance(spot._uniqContent, dict) and len(spot._uniqContent) == 0 def test_cstr_type_error(self): - """Tests that TypeError is returned if identifier is not an integer - """ + """Tests that TypeError is returned if identifier is not an integer""" with pytest.raises(TypeError): Spot("spot_0") def test_repr(self, spot): - """Test that the canonical string representing a spot does not change - """ + """Test that the canonical string representing a spot does not change""" assert repr(spot) == "Spot 0 - #RGP: 0" def test_str(self, spot): - """Test that the writing spot method does not change - """ + """Test that the writing spot method does not change""" assert str(spot) == "spot_0" @pytest.fixture def region(self) -> Generator[Region, None, None]: - """Create a region for test - """ + """Create a region for test""" yield Region("RGP_0") def test_add_region(self, spot, region): - """Tests that adding a Region object to the Spot object works as expected - """ + """Tests that adding a Region object to the Spot object works as expected""" spot.add(region) assert region == spot._region_getter[region.name] def test_add_not_instance_region(self, spot): - """Tests that a TypeError is returned if a non-region type is trying to be added - """ + """Tests that a TypeError is returned if a non-region type is trying to be added""" with pytest.raises(TypeError): spot.add("region") def test_add_different_region_with_same_name(self, spot): - """Test that adding a new Region same name than another in the spot return a KeyError - """ + """Test that adding a new Region same name than another in the spot return a KeyError""" region_1, region_2 = Region("RGP"), Region("RGP") gene_1, gene_2 = Gene("gene_1"), Gene("gene_2") - gene_1.fill_annotations(start=1, stop=10, strand='+', position=0) - gene_2.fill_annotations(start=1, stop=10, strand='+', position=0) + gene_1.fill_annotations(start=1, stop=10, strand="+", position=0) + gene_2.fill_annotations(start=1, stop=10, strand="+", position=0) gene_1.family, gene_2.family = GeneFamily(0, "Fam_0"), GeneFamily(1, "Fam_1") region_1[0], region_2[0] = gene_1, gene_2 spot[region_1.name] = region_1 @@ -687,10 +719,9 @@ def test_add_different_region_with_same_name(self, spot): spot[region_2.name] = region_2 def test_add_two_time_the_same_region(self, spot, region): - """Test that adding a two time the same region is working as expected - """ + """Test that adding a two time the same region is working as expected""" gene = Gene("gene") - gene.fill_annotations(start=1, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand="+", position=0) gene.family = GeneFamily(0, "Fam") region[0] = gene spot[region.name] = region @@ -699,27 +730,23 @@ def test_add_two_time_the_same_region(self, spot, region): assert region in spot._region_getter.values() def test_get_region(self, spot, region): - """Tests that getting the region in the Spot object works as expected - """ + """Tests that getting the region in the Spot object works as expected""" spot.add(region) assert spot.get(region.name) == region def test_get_region_not_in_spot(self, spot): - """Tests that a KeyError is raised when the name of the region does not exist in the spot - """ + """Tests that a KeyError is raised when the name of the region does not exist in the spot""" with pytest.raises(KeyError): _ = spot["rgp"] def test_delete_region_in_spot(self, spot, region): - """Tests that remove a region from the spot work as expected - """ + """Tests that remove a region from the spot work as expected""" spot[region.name] = region del spot[region.name] assert region.name not in spot._region_getter def test_len(self, spot, region): - """Tests that getting the number of regions work as expected - """ + """Tests that getting the number of regions work as expected""" assert isinstance(len(spot), int) assert len(spot) == 0 spot[region.name] = region @@ -727,8 +754,7 @@ def test_len(self, spot, region): @pytest.fixture def regions(self, genes): - """Create a random number of regions fill with genes - """ + """Create a random number of regions fill with genes""" regions = set() genes = sorted(list(genes), key=lambda x: x.position) nb_regions = randint(2, len(genes)) @@ -754,8 +780,7 @@ def regions(self, genes): yield regions def test_get_all_regions(self, spot, regions): - """Tests that getting all the region in the spot works as expected - """ + """Tests that getting all the region in the spot works as expected""" for region in regions: spot[region.name] = region assert len(spot) == len(regions) @@ -763,23 +788,20 @@ def test_get_all_regions(self, spot, regions): assert regions == set(spot.regions) def test_get_families(self, spot, regions, families): - """Tests that getting the gene families in the Spot object works as expected - """ + """Tests that getting the gene families in the Spot object works as expected""" for region in regions: spot[region.name] = region assert set(spot.families) == families def test_number_of_families(self, spot, regions, families): - """Tests that getting the number of families in the spot works as expected - """ + """Tests that getting the number of families in the spot works as expected""" for region in regions: spot[region.name] = region assert isinstance(spot.number_of_families, int) assert spot.number_of_families == len(families) def test_add_spot_to_families(self, spot, regions, families): - """Tests that adding spot to families works as expected - """ + """Tests that adding spot to families works as expected""" for region in regions: spot[region.name] = region spot.spot_2_families() @@ -787,8 +809,7 @@ def test_add_spot_to_families(self, spot, regions, families): @pytest.fixture def srgps(self, regions): - """Create a random number of same rgp for all regions - """ + """Create a random number of same rgp for all regions""" srgps = set() for region in regions: nb_sim_rgp = randint(1, 3) @@ -800,28 +821,33 @@ def srgps(self, regions): yield srgps def test_get_uniq_rgp_set(self, spot, regions, families, srgps): - """Tests that getting identical rgp in the Spot object works as expected - """ - for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + """Tests that getting identical rgp in the Spot object works as expected""" + for region in list(regions) + list( + srgps + ): # With lists provide sRGP to be key RGP in dict spot[region.name] = region assert len(spot) == len(regions) + len(srgps) uniq2rgp = spot.get_uniq_to_rgp() for region, sim_rgps in uniq2rgp.items(): assert region in regions - assert set(region.families) == set.union(*[set(srgp.families) for srgp in sim_rgps]) + assert set(region.families) == set.union( + *[set(srgp.families) for srgp in sim_rgps] + ) def test_get_uniq_ordered_set(self, spot, regions, families, srgps): - """Tests that getting the unique synteny in the Spot object works as expected - """ - for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + """Tests that getting the unique synteny in the Spot object works as expected""" + for region in list(regions) + list( + srgps + ): # With lists provide sRGP to be key RGP in dict spot[region.name] = region assert len(spot) == len(regions) + len(srgps) assert spot.get_uniq_ordered_set().issubset(regions) def test_get_uniq_content(self, spot, regions, families, srgps): - """Tests that getting the unique RGP in the Spot object works as expected - """ - for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + """Tests that getting the unique RGP in the Spot object works as expected""" + for region in list(regions) + list( + srgps + ): # With lists provide sRGP to be key RGP in dict spot[region.name] = region assert len(spot) == len(regions) + len(srgps) assert spot.get_uniq_ordered_set().issubset(regions) @@ -830,47 +856,41 @@ def test_get_uniq_content(self, spot, regions, families, srgps): class TestModule: @pytest.fixture def module(self): - """Create a basic module - """ + """Create a basic module""" yield Module(0) def test_cstr(self, module): - """Test that a module is construct as expected - """ + """Test that a module is construct as expected""" assert module.ID == 0 - assert isinstance(module._families_getter, dict) and module._families_getter == {} + assert ( + isinstance(module._families_getter, dict) and module._families_getter == {} + ) def test_cstr_type_error(self): - """Test that if the identifier is not an integer it raises a TypeError - """ + """Test that if the identifier is not an integer it raises a TypeError""" with pytest.raises(TypeError): Spot("mod_0") def test_repr(self, module): - """Test that the canonical string representing a module does not change - """ + """Test that the canonical string representing a module does not change""" assert repr(module) == "Module 0 - #Families: 0" def test_str(self, module): - """Test that the writing spot method does not change - """ + """Test that the writing spot method does not change""" assert str(module) == "module_0" def test_hash(self, module): - """Test that len method work as expected - """ + """Test that len method work as expected""" assert isinstance(hash(module), int) def test_len(self, module): - """Test that len method work as expected - """ - module._families_getter["fam"] = GeneFamily(randint(1,5), "fam") + """Test that len method work as expected""" + module._families_getter["fam"] = GeneFamily(randint(1, 5), "fam") assert isinstance(len(module), int) assert len(module) == 1 def test_eq(self, families): - """Test equality between modules - """ + """Test equality between modules""" module1, module2, module3 = Module(1), Module(2), Module(3) for family in families: module1[family.name] = family @@ -879,45 +899,39 @@ def test_eq(self, families): assert module1 != module3 def test_eq_with_is_not_instance_module(self, module): - """Test comparison between a module and another object raise a TypeError - """ + """Test comparison between a module and another object raise a TypeError""" with pytest.raises(TypeError): assert module == 4 @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: - """Create a basic gene family for test - """ - yield GeneFamily(0, 'family') + """Create a basic gene family for test""" + yield GeneFamily(0, "family") def test_add_family(self, module, family): - """Tests that a gene family can be added to the module - """ + """Tests that a gene family can be added to the module""" module[family.name] = family assert len(module._families_getter) == 1 - assert module._families_getter['family'] == family + assert module._families_getter["family"] == family def test_add_different_families_with_same_name(self, module): - """Test that adding a new family with the same name as another in the module return a KeyError - """ - family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + """Test that adding a new family with the same name as another in the module return a KeyError""" + family_1, family_2 = GeneFamily(1, "family_1"), GeneFamily(1, "family_1") module[family_1.name] = family_1 with pytest.raises(KeyError): module[family_2.name] = family_2 def test_add_two_time_the_same_family(self, module, family): - """Test that adding a two time the same family is working as expected - """ + """Test that adding a two time the same family is working as expected""" module[family.name] = family assert family in module._families_getter.values() module[family.name] = family assert family in module._families_getter.values() def test_get_family(self, module, family): - """Tests that a gene family can be retrieved from the module - """ + """Tests that a gene family can be retrieved from the module""" module[family.name] = family - assert module['family'] == family + assert module["family"] == family def test_get_family_which_does_not_exist(self, module): """Tests that if a gene family does not exist it raises a KeyError""" @@ -926,15 +940,13 @@ def test_get_family_which_does_not_exist(self, module): _ = module[fam.name] def test_delete_family(self, module, family): - """Tests that a gene family can be deleted from the module - """ + """Tests that a gene family can be deleted from the module""" module[family.name] = family del module[family.name] assert len(module) == 0 def test_delete_family_which_does_not_exist(self, module): - """Tests that if a gene family does not exist it raises a KeyError - """ + """Tests that if a gene family does not exist it raises a KeyError""" fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") with pytest.raises(KeyError): del module[fam.name] @@ -943,47 +955,42 @@ def test_delete_family_which_does_not_exist(self, module): class TestGeneContext: @pytest.fixture def context(self): - """Generate a basic context - """ + """Generate a basic context""" yield GeneContext(0) def test_cstr(self, context): - """Test that a gene context is construct as expected - """ + """Test that a gene context is construct as expected""" assert context.ID == 0 - assert isinstance(context._families_getter, dict) and context._families_getter == {} + assert ( + isinstance(context._families_getter, dict) + and context._families_getter == {} + ) def test_cstr_type_error(self): - """Test that if the identifier is not an integer it raises a TypeError - """ + """Test that if the identifier is not an integer it raises a TypeError""" with pytest.raises(TypeError): Spot("gc_0") def test_repr(self, context): - """Test that the canonical string representing a context does not change - """ + """Test that the canonical string representing a context does not change""" assert repr(context) == "Context 0 - #Families: 0" def test_str(self, context): - """Test that the writing spot method does not change - """ + """Test that the writing spot method does not change""" assert str(context) == "GC_0" def test_hash(self, context): - """Test that len method work as expected - """ + """Test that len method work as expected""" assert isinstance(hash(context), int) def test_len(self, context): - """Test that len method work as expected - """ + """Test that len method work as expected""" context._families_getter["fam"] = GeneFamily(randint(1, 5), "fam") assert isinstance(len(context), int) assert len(context) == 1 def test_eq(self, families): - """Test equality between two contexts - """ + """Test equality between two contexts""" context1, context2, context3 = GeneContext(1), GeneContext(2), GeneContext(3) for family in families: context1[family.name] = family @@ -992,45 +999,39 @@ def test_eq(self, families): assert context1 != context3 def test_eq_with_is_not_instance_context(self, context): - """Test comparison between a context and another object raise a TypeError - """ + """Test comparison between a context and another object raise a TypeError""" with pytest.raises(TypeError): assert context == 4 @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: - """Create a basic gene family for test - """ - yield GeneFamily(0, 'family') + """Create a basic gene family for test""" + yield GeneFamily(0, "family") def test_add_family(self, context, family): - """Tests that a gene family can be added to the context - """ + """Tests that a gene family can be added to the context""" context[family.name] = family assert len(context._families_getter) == 1 - assert context._families_getter['family'] == family + assert context._families_getter["family"] == family def test_add_different_families_with_same_name(self, context): - """Test that adding a new family with the same name as another in the context return a KeyError - """ - family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + """Test that adding a new family with the same name as another in the context return a KeyError""" + family_1, family_2 = GeneFamily(1, "family_1"), GeneFamily(1, "family_1") context[family_1.name] = family_1 with pytest.raises(KeyError): context[family_2.name] = family_2 def test_add_two_time_the_same_family(self, context, family): - """Test that adding a two time the same family is working as expected - """ + """Test that adding a two time the same family is working as expected""" context[family.name] = family assert family in context._families_getter.values() context[family.name] = family assert family in context._families_getter.values() def test_get_family(self, context, family): - """Tests that a gene family can be retrieved from the context - """ + """Tests that a gene family can be retrieved from the context""" context[family.name] = family - assert context['family'] == family + assert context["family"] == family def test_get_family_which_does_not_exist(self, context): """Tests that if a gene family does not exist it raises a KeyError""" @@ -1039,15 +1040,13 @@ def test_get_family_which_does_not_exist(self, context): _ = context[fam.name] def test_delete_family(self, context, family): - """Tests that a gene family can be deleted from the context - """ + """Tests that a gene family can be deleted from the context""" context[family.name] = family - del context['family'] + del context["family"] assert len(context) == 0 def test_delete_family_which_does_not_exist(self, context): - """Tests that if a gene family does not exist it raises a KeyError - """ + """Tests that if a gene family does not exist it raises a KeyError""" fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") with pytest.raises(KeyError): del context[fam.name] diff --git a/tests/utils/test_utilities.py b/tests/utils/test_utilities.py index edb48d55..68ce840a 100644 --- a/tests/utils/test_utilities.py +++ b/tests/utils/test_utilities.py @@ -7,7 +7,14 @@ import zipfile from typing import Generator -from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not, has_non_ascii, replace_non_ascii +from ppanggolin.utils import ( + is_compressed, + read_compressed_or_not, + write_compressed_or_not, + has_non_ascii, + replace_non_ascii, +) + class TestCompressed: """ @@ -20,7 +27,7 @@ def plain_file(self, tmp_path: Path) -> Generator[Path, None, None]: Creates a temporary plain text file for testing. """ file_path = tmp_path / "test.txt" - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: f.write(b"Test data") yield file_path @@ -30,7 +37,7 @@ def gzip_file(self, tmp_path: Path) -> Generator[Path, None, None]: Creates a temporary gzip file for testing. """ file_path = tmp_path / "test.gz" - with gzip.open(file_path, 'wb') as f: + with gzip.open(file_path, "wb") as f: f.write(b"Test data") yield file_path @@ -40,7 +47,7 @@ def bz2_file(self, tmp_path: Path) -> Generator[Path, None, None]: Creates a temporary bz2 file for testing. """ file_path = tmp_path / "test.bz2" - with bz2.open(file_path, 'wb') as f: + with bz2.open(file_path, "wb") as f: f.write(b"Test data") yield file_path @@ -50,15 +57,14 @@ def zip_file(self, tmp_path: Path) -> Generator[Path, None, None]: Creates a temporary zip file for testing. """ file_path = tmp_path / "test.zip" - with zipfile.ZipFile(file_path, 'w') as z: + with zipfile.ZipFile(file_path, "w") as z: z.writestr("test.txt", "Test data") yield file_path class TestIsCompressed(TestCompressed): def test_is_compressed_with_plain_file(self, plain_file: Path) -> None: - """Test is_compressed function with a plain text file. - """ + """Test is_compressed function with a plain text file.""" assert is_compressed(plain_file) == (False, None) def test_is_compressed_with_gzip_file(self, gzip_file: Path) -> None: @@ -91,6 +97,7 @@ class TestReadCompressedOrNot(TestCompressed): """ Test cases for the read_compressed_or_not function. """ + def test_read_compressed_gzip(self, gzip_file: Path) -> None: """ Test read_compressed_or_not function with a gzip file. @@ -145,7 +152,7 @@ def test_write_compressed(self, plain_file_path: Path) -> None: """ with write_compressed_or_not(plain_file_path, compress=True) as f: f.write("Test data") - with gzip.open(plain_file_path.with_suffix('.txt.gz'), 'rt') as f: + with gzip.open(plain_file_path.with_suffix(".txt.gz"), "rt") as f: assert f.read() == "Test data" def test_write_uncompressed(self, plain_file_path: Path) -> None: @@ -154,29 +161,39 @@ def test_write_uncompressed(self, plain_file_path: Path) -> None: """ with write_compressed_or_not(plain_file_path, compress=False) as f: f.write("Test data") - with open(plain_file_path, 'r') as f: + with open(plain_file_path, "r") as f: assert f.read() == "Test data" # Test cases for has_non_ascii -@pytest.mark.parametrize("input_string, expected", [ - ("Escherichia_coli", False), # All ASCII characters - ("Escherichia_colí", True), # Contains non-ASCII character 'í' - ("simple_string", False), # Simple ASCII string - ("Ωmega", True), # Contains non-ASCII character 'Ω' - ("", False), # Empty string should return False -]) +@pytest.mark.parametrize( + "input_string, expected", + [ + ("Escherichia_coli", False), # All ASCII characters + ("Escherichia_colí", True), # Contains non-ASCII character 'í' + ("simple_string", False), # Simple ASCII string + ("Ωmega", True), # Contains non-ASCII character 'Ω' + ("", False), # Empty string should return False + ], +) def test_has_non_ascii(input_string, expected): assert has_non_ascii(input_string) == expected + # Test cases for replace_non_ascii -@pytest.mark.parametrize("input_string, replacement, expected", [ - ("Escherichia_coli", "_", "Escherichia_coli"), # All ASCII characters, no replacement needed - ("Escherichia_colí", "_", "Escherichia_col_"), # Replace 'í' with '_' - ("Ωmega", "-", "-mega"), # Replace 'Ω' with '-' - ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"),# Replace 'Ω' with 'X' - ("", "_", ""), # Empty string, no replacement -]) +@pytest.mark.parametrize( + "input_string, replacement, expected", + [ + ( + "Escherichia_coli", + "_", + "Escherichia_coli", + ), # All ASCII characters, no replacement needed + ("Escherichia_colí", "_", "Escherichia_col_"), # Replace 'í' with '_' + ("Ωmega", "-", "-mega"), # Replace 'Ω' with '-' + ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"), # Replace 'Ω' with 'X' + ("", "_", ""), # Empty string, no replacement + ], +) def test_replace_non_ascii(input_string, replacement, expected): assert replace_non_ascii(input_string, replacement) == expected -