diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index cb6b54a..a9d0f19 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: Test Binette +name: CI on: pull_request: diff --git a/.github/workflows/build_draft_pdf.yml b/.github/workflows/build_draft_pdf.yml new file mode 100644 index 0000000..3383dd6 --- /dev/null +++ b/.github/workflows/build_draft_pdf.yml @@ -0,0 +1,24 @@ +name: build draft paper pdf +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..a9b0c1f --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,48 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +# on: [push] + +permissions: + contents: read + id-token: write + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + + - name: Publish package distributions to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + # repository-url: https://test.pypi.org/legacy/ + + + + diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..de6efe3 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,39 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 +python: + install: + - method: pip + path: . 
+ extra_requirements: + - doc + - main_deps + + + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + + + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index 8f9804c..dfc390b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/binette/README.html) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/downloads.svg)](https://anaconda.org/bioconda/binette) [![Test Coverage](https://genotoul-bioinfo.github.io/Binette/coverage-badge.svg)](https://genotoul-bioinfo.github.io/Binette/) +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/binette/README.html) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/downloads.svg)](https://anaconda.org/bioconda/binette) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/license.svg)](https://anaconda.org/bioconda/binette) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/version.svg)](https://anaconda.org/bioconda/binette) + +[![Test Coverage](https://genotoul-bioinfo.github.io/Binette/coverage-badge.svg)](https://genotoul-bioinfo.github.io/Binette/) [![CI 
Status](https://github.com/genotoul-bioinfo/Binette/actions/workflows/binette_ci.yml/badge.svg)](https://github.com/genotoul-bioinfo/Binette/actions/workflows) [![Documentation Status](https://readthedocs.org/projects/binette/badge/?version=latest)](https://binette.readthedocs.io/en/latest/?badge=latest) + # Binette @@ -9,10 +12,10 @@ From the input bin sets, Binette constructs new hybrid bins. A bin can be seen a - Difference bin: This bin contains the contigs that are exclusively found in one bin and not present in the others. - Union bin: The union bin includes all the contigs contained within the overlapping bins -It then uses checkm2 to assess bins quality to finally select the best bins possible. +It then uses CheckM2 to assess bins quality to finally select the best bins possible. Binette is inspired from the metaWRAP bin-refinement tool but it effectively solves all the problems from that very tool. -- Enhanced Speed: Binette significantly improves the speed of the refinement process. It achieves this by launching the initial steps of checkm2, such as prodigal and diamond runs, only once on all contigs. These intermediate results are then utilized to assess the quality of any given bin, eliminating redundant computations and accelerating the refinement process. +- Enhanced Speed: Binette significantly improves the speed of the refinement process. It achieves this by launching the initial steps of CheckM2, such as Prodigal and Diamond runs, only once on all contigs. These intermediate results are then utilized to assess the quality of any given bin, eliminating redundant computations and accelerating the refinement process. - No Limit on Input Bin Sets: Unlike its predecessor, Binette is not constrained by the number of input bin sets. It can handle and process multiple bin sets simultaneously. 
diff --git a/binette/__init__.py b/binette/__init__.py index e69de29..63eb0cb 100644 --- a/binette/__init__.py +++ b/binette/__init__.py @@ -0,0 +1 @@ +__version__ = '0.1.6' \ No newline at end of file diff --git a/binette/binette.py b/binette/main.py similarity index 88% rename from binette/binette.py rename to binette/main.py index 013bcf4..80d1e90 100755 --- a/binette/binette.py +++ b/binette/main.py @@ -13,8 +13,8 @@ import sys import logging import os -import pkg_resources +import binette from binette import contig_manager, cds, diamond, bin_quality, bin_manager, io_manager as io from typing import List, Dict, Set, Tuple @@ -40,83 +40,84 @@ def init_logging(verbose, debug): ) + def parse_arguments(args): """Parse script arguments.""" - program_version = pkg_resources.get_distribution("Binette").version parser = ArgumentParser( - description=f"Binette version={program_version}", + description=f"Binette version={binette.__version__}", formatter_class=ArgumentDefaultsHelpFormatter, ) - # TODO add catagory to better visualize the required and the optional args - input_arg = parser.add_mutually_exclusive_group(required=True) + + # Input arguments category + input_group = parser.add_argument_group('Input Arguments') + input_arg = input_group.add_mutually_exclusive_group(required=True) input_arg.add_argument( "-d", "--bin_dirs", nargs="+", - help="list of bin folders containing each bin in a fasta file.", + help="List of bin folders containing each bin in a fasta file.", ) input_arg.add_argument( "-b", "--contig2bin_tables", nargs="+", - help="list of contig2bin table with two columns separated\ + help="List of contig2bin table with two columns separated\ with a tabulation: contig, bin", ) - parser.add_argument("-c", "--contigs", required=True, help="Contigs in fasta format.") + input_group.add_argument("-c", "--contigs", required=True, help="Contigs in fasta format.") - parser.add_argument( + # Other parameters category + other_group = 
parser.add_argument_group('Other Arguments') + + other_group.add_argument( "-m", "--min_completeness", - default=10, + default=40, type=int, help="Minimum completeness required for final bin selections.", ) - parser.add_argument("-t", "--threads", default=1, type=int, help="Number of threads.") + other_group.add_argument("-t", "--threads", default=1, type=int, help="Number of threads to use.") - parser.add_argument("-o", "--outdir", default="results", help="Output directory.") + other_group.add_argument("-o", "--outdir", default="results", help="Output directory.") - parser.add_argument( + other_group.add_argument( "-w", "--contamination_weight", - default=5, + default=2, type=float, help="Bin are scored as follow: completeness - weight * contamination. " "A low contamination_weight favor complete bins over low contaminated bins.", ) - parser.add_argument( - "-e", - "--extension", - default="fasta", - help="Extension of fasta files in bin folders " - "(necessary when --bin_dirs is used).", - ) - - parser.add_argument( + other_group.add_argument( "--checkm2_db", help="Provide a path for the CheckM2 diamond database. " "By default the database set via is used.", ) - parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") + other_group.add_argument("--low_mem", help="Use low mem mode when running diamond", action="store_true") + + other_group.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") - parser.add_argument("--debug", help="active debug mode", action="store_true") + other_group.add_argument("--debug", help="Activate debug mode", action="store_true") - parser.add_argument("--resume", help="active resume mode", action="store_true") + other_group.add_argument("--resume", + action="store_true", + help="Activate resume mode. Binette will examine the 'temporary_files' directory " + "within the output directory and reuse any existing files if possible." 
+ ) - parser.add_argument("--low_mem", help="low mem mode", action="store_true") - parser.add_argument("--version", action="version", version=program_version) + other_group.add_argument("--version", action="version", version=binette.__version__) args = parser.parse_args(args) return args - def parse_input_files(bin_dirs: List[str], contig2bin_tables: List[str], contigs_fasta: str) -> Tuple[Dict[str, List], List, Dict[str, List], Dict[str, int]]: """ Parses input files to retrieve information related to bins and contigs. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api/api_ref.md b/docs/api/api_ref.md new file mode 100644 index 0000000..3de18e1 --- /dev/null +++ b/docs/api/api_ref.md @@ -0,0 +1,8 @@ +# API Reference + +```{toctree} +:maxdepth: 2 +binette +indice_and_table +``` + diff --git a/docs/api/binette.md b/docs/api/binette.md new file mode 100644 index 0000000..bc3c754 --- /dev/null +++ b/docs/api/binette.md @@ -0,0 +1,75 @@ +# binette package + +## Submodules + +## binette.bin_manager module + +```{eval-rst} +.. automodule:: binette.bin_manager + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.bin_quality module + +```{eval-rst} +.. 
automodule:: binette.bin_quality + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.binette module + +```{eval-rst} +.. automodule:: binette.binette + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.cds module + +```{eval-rst} +.. automodule:: binette.cds + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.contig_manager module + +```{eval-rst} +.. automodule:: binette.contig_manager + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.diamond module + +```{eval-rst} +.. automodule:: binette.diamond + :members: + :undoc-members: + :show-inheritance: +``` + +## binette.io_manager module + +```{eval-rst} +.. automodule:: binette.io_manager + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: binette + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/modules.md b/docs/api/modules.md new file mode 100644 index 0000000..b83d27c --- /dev/null +++ b/docs/api/modules.md @@ -0,0 +1,7 @@ +# binette + +```{toctree} +:maxdepth: 4 + +binette +``` diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..b8e839a --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,57 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +from binette import __version__ + +project = 'Binette' +copyright = '2024, Jean Mainguy' +author = 'Jean Mainguy' +release = __version__ + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "myst_parser", + # "sphinxcontrib.jquery", + "sphinx.ext.duration", + "sphinx.ext.autosectionlabel", + "sphinx.ext.autodoc", + 'sphinx_search.extension' +] + + +source_suffix = { + '.md': 'markdown' +} + + +templates_path = ['_templates'] + + +# Prefix document path to section labels, to use: +# `path/to/file:heading` instead of just `heading` +autosectionlabel_prefix_document = True + +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' #'alabaster' # + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + + diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..80a8ac9 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,73 @@ +# Contributing + +Thank you for your interest in contributing to Binette! This is an open-source project and everyone is welcome to contribute to it. + +## Reporting a Bug + +If you have any question, if you found a bug. Please open an issue. 
+ +You can check the [Issues](https://github.com/genotoul-bioinfo/Binette/issues) page to see if the bug or question has been already reported. + +If it's not reported, create a new [issue](https://github.com/genotoul-bioinfo/Binette/issues). + +## Adding a New Feature to Binette + +### Starting with an Issue + +If you have ideas for new features or improvements, initiate a discussion in an issue. This allows us to evaluate and discuss your suggestions together. + +For minor changes like fixing typos or making small edits, create a new Pull Request (PR) directly with your proposed changes. + +### Setting Up the Development Environment + +1. **Fork and Clone the Repository:** + - Fork the repository to your GitHub account. + - Clone your forked repository to your local machine. + +2. **Get an Environment:** + Create an environment with all Binette prerequisites installed by following the installation instructions [here](./installation.md#installing-from-source-code-within-a-conda-environnement). + +3. **Install in Editable Mode:** + To enable seamless code editing and testing of new functionality, install Binette in editable mode using the following command: + + ```bash + pip install -e . + ``` + + This allows you to modify the code and experiment with new features directly. + + +```{note} +Currently, we are not utilizing any auto formatters (like autopep8 or black). Kindly refrain from using them, as it could introduce extensive changes across the project, making code review challenging for us. +``` + + +### Making Your Changes + +Maintain consistency in code formatting. When adding new code, closely follow the existing structure. Functions should include descriptive docstrings explaining their purpose and detailing the parameters. Ensure that argument types are specified in the function definitions. + +### Update Documentation + +If your changes alter the tool's behavior, update the documentation to reflect them. 
Provide clear descriptions and, if necessary, examples of commands and their respective outputs. + + +### Tests + +#### Continuous Integration (CI) Workflow + +We've configured a CI workflow in the Actions tab, executing Binette on a small dataset and testing its results. If you've introduced a new feature, consider updating the CI YAML file to test it and ensure seamless integration. + +#### Unit Tests + +It is recommended to add unit test to any additions to the code. The test suite is located in the 'tests' directory at the root of the project. + +### Creating a Pull Request 🚀 + +Once you've made your changes: + +1. **Create a Pull Request:** Submit a pull request from your forked repository to the 'dev' branch on GitHub. + +2. **Describe Your Changes:** Clearly describe the modifications you've made and link any associated issue(s) in the PR description. + +3. **Collaborative Review:** We will review your changes, offer feedback, and engage in discussions until we collectively agree on the implementation. + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..0ecd6ae --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +% Binette documentation master file, created by +% sphinx-quickstart on Thu Jan 11 21:13:20 2024. +% You can adapt this file completely to your liking, but it should at least +% contain the root `toctree` directive. 
+ + +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/binette/README.html) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/downloads.svg)](https://anaconda.org/bioconda/binette) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/license.svg)](https://anaconda.org/bioconda/binette) [![Anaconda-Server Badge](https://anaconda.org/bioconda/binette/badges/version.svg)](https://anaconda.org/bioconda/binette) + +[![Test Coverage](https://genotoul-bioinfo.github.io/Binette/coverage-badge.svg)](https://genotoul-bioinfo.github.io/Binette/) [![CI Status](https://github.com/genotoul-bioinfo/Binette/actions/workflows/binette_ci.yml/badge.svg)](https://github.com/genotoul-bioinfo/Binette/actions/workflows) [![Documentation Status](https://readthedocs.org/projects/binette/badge/?version=latest)](https://binette.readthedocs.io/en/latest/?badge=latest) + + +# Binette + + +Binette is a fast and accurate binning refinement tool to construct high quality MAGs from the output of multiple binning tools. + +From the input bin sets, Binette constructs new hybrid bins. A bin can be seen as a set of contigs. When at least two bins overlap, meaning they share at least one contig, Binette utilizes basic set operations to create new bins. +- Intersection bin: This bin consists of the contigs that are shared by the overlapping bins. +- Difference bin: This bin contains the contigs that are exclusively found in one bin and not present in the others. +- Union bin: The union bin includes all the contigs contained within the overlapping bins + +It then uses CheckM2 to assess bins quality to finally select the best bins possible. + +Binette is inspired from the metaWRAP bin-refinement tool but it effectively solves all the problems from that very tool. +- Enhanced Speed: Binette significantly improves the speed of the refinement process. 
It achieves this by launching the initial steps of CheckM2, such as Prodigal and Diamond runs, only once on all contigs. These intermediate results are then utilized to assess the quality of any given bin, eliminating redundant computations and accelerating the refinement process. +- No Limit on Input Bin Sets: Unlike its predecessor, Binette is not constrained by the number of input bin sets. It can handle and process multiple bin sets simultaneously. + +```{toctree} +:caption: 'Documentation' +:maxdepth: 2 + +installation +usage +contributing +api/api_ref +``` + diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..e0743f0 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,69 @@ + +# Installation + +## With Bioconda + +Binette can be easily installed with conda + +```bash +conda create -c bioconda -c defaults -c conda-forge -n binette binette +conda activate binette +``` + +Binette should be able to run: + +``` +binette -h +``` + + +```{tip} +For quicker installation and potential resolution of conflicting dependencies, consider using [Mamba](https://github.com/mamba-org/mamba), an efficient alternative to conda. + +``` + + +## Installing from Source Code within a conda environnement + +A straightforward method to install Binette from the source code is by utilizing a conda environment that includes all the necessary dependencies. + +**1. Clone the Binette Repository** + +```bash +git clone https://github.com/genotoul-bioinfo/Binette +cd Binette +``` + +**2. Installing Dependencies with a Conda Environment File** + +Install Binette dependencies listed in the [binette.yaml](https://github.com/genotoul-bioinfo/Binette/blob/main/binette.yaml) file located at the root of the repository, using conda: + +```bash +conda env create -n binette -f binette.yaml +conda activate binette +``` + +**3. Installing Binette** + +Finally, install Binette using **pip**: + +```bash +pip install . 
+``` + +Binette should be able to run : + +```bash +binette -h +``` + + +## Downloading the CheckM2 database + +Before using Binette, it is necessary to download the CheckM2 database: + +```bash +checkm2 database --download --path +``` + +Make sure to replace `` with the desired path where you want to store the CheckM2 database. diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..2108a50 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,82 @@ + +# Usage + +## Input Formats + +Binette supports two input formats for bin sets: + +1. **Contig2bin Tables:** You can provide bin sets using contig2bin tables, which establish the relationship between each contig and its corresponding bin. In this format, you need to specify the `--contig2bin_tables` argument. 
+ +For example, consider the following two `contig2bin_tables`: + +- `bin_set1.tsv`: + + ```tsv + contig_1 binA + contig_8 binA + contig_15 binB + contig_9 binC + ``` + +- `bin_set2.tsv`: + + ```tsv + contig_1 bin.0 + contig_8 bin.0 + contig_15 bin.1 + contig_9 bin.2 + contig_10 bin.0 + ``` + + The `binette` command to process this input would be: + + ```bash + binette --contig2bin_tables bin_set1.tsv bin_set2.tsv --contigs assembly.fasta + ``` + +2. **Bin Directories:** Alternatively, you can use bin directories, where each bin is represented by a separate FASTA file. For this format, you need to provide the `--bin_dirs` argument. Here's an example of two bin directories: + + ``` + bin_set1/ + ├── binA.fa: contains sequences of contig_1, contig_8 + ├── binB.fa: contains sequences of contig_15 + └── binC.fa: contains sequences of contig_9 + ``` + + ``` + bin_set2/ + ├── binA.fa: contains sequences of contig_1, contig_8, contig_10 + ├── binB.fa: contains sequences of contig_15 + └── binC.fa: contains sequences of contig_9 + ``` + + The `binette` command to process this input would be: + + ```bash + binette --bin_dirs bin_set1 bin_set2 --contigs assembly.fasta + ``` + +In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these examples, the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`. + +## Outputs + +Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option. + +In this directory you will find: +- `final_bins_quality_reports.tsv`: This is a TSV (tab-separated values) file containing quality information about the final selected bins. +- `final_bins/`: This directory stores all the selected bins in fasta format. 
+- `temporary_files/`: This directory contains intermediate files. If you choose to use the `--resume` option, Binette will utilize files in this directory to prevent the recomputation of time-consuming steps. + + +The `final_bins_quality_reports.tsv` file contains the following columns: +| Column Name | Description | +|---------------------|--------------------------------------------------------------------------------------------------------------| +| **bin_id** | This column displays the unique ID of the bin. | +| **origin** | Indicates the source or origin of the bin, specifying from which bin set it originates or the intermediate set operation that created it. | +| **name** | The name of the bin. | +| **completeness** | The completeness of the bin, determined by CheckM2. | +| **contamination** | The contamination of the bin, determined by CheckM2. | +| **score** | This column displays the computed score, which is calculated as: `completeness - contamination * weight`. You can customize the contamination weight using the `--contamination_weight` option. | +| **size** | Represents the size of the bin in nucleotides. | +| **N50** | Displays the N50 of the bin. | +| **contig_count** | The number of contigs contained within the bin. \ No newline at end of file diff --git a/paper/binette_overview.pdf b/paper/binette_overview.pdf new file mode 100644 index 0000000..bc9978f Binary files /dev/null and b/paper/binette_overview.pdf differ diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..33e696b --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,182 @@ +@article{Nayfach2019global_human_gut_microbiome, +title={New insights from uncultivated genomes of the global human gut microbiome}, +volume={568}, +DOI={10.1038/s41586-019-1058-x}, +number={7753}, +journal={Nature}, +author={Nayfach, +Stephen and Shi, +Zhou Jason and Seshadri, +Rekha and Pollard, +Katherine S. 
and Kyrpides, +Nikos C.}, +year={2019}, +month={Mar}, +pages={505–510} } + + +@article{Acinas_Sánchez_et_al_2021, +title={Deep ocean metagenomes provide insight into the metabolic architecture of bathypelagic microbial communities}, +volume={4}, +DOI={10.1038/s42003-021-02112-2}, +number={1}, +journal={Communications Biology}, +author={Acinas, Silvia G. and Sánchez, Pablo and Salazar, Guillem and Cornejo-Castillo, Francisco M. and Sebastián, Marta and Logares, Ramiro and Royo-Llonch, Marta and Paoli, Lucas and Sunagawa, Shinichi and Hingamp, Pascal and Ogata, Hiroyuki and Lima-Mendez, Gipsi and Roux, Simon and González, José M. and Arrieta, Jesús M. and Alam, Intikhab S. and Kamau, Allan and Bowler, Chris and Raes, Jeroen and Pesant, Stéphane and Bork, Peer and Agustí, Susana and Gojobori, Takashi and Vaqué, Dolors and Sullivan, Matthew B. and Pedrós-Alió, Carlos and Massana, Ramon and Duarte, Carlos M. and Gasol, Josep M.}, +year={2021}, +month={May}, +pages={1–15} } + +@article{kang2019metabat, + title={MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies}, + author={Kang, Dongwan D and Li, Feng and Kirton, Edward and Thomas, Ashleigh and Egan, Rob and An, Hong and Wang, Zhong}, + journal={PeerJ}, + volume={7}, + DOI={10.7717/peerj.7359}, + pages={e7359}, + year={2019}, + publisher={PeerJ Inc.} +} + +@article{alneberg2014concoct, + title={Binning metagenomic contigs by coverage and composition}, + author={Alneberg, Johannes and Bjarnason, Brynjar Sm{\'a}ri and De Bruijn, Ino and Schirmer, Melanie and Quick, Joshua and Ijaz, Umer Z and Lahti, Leo and Loman, Nicholas J and Andersson, Anders F and Quince, Christopher}, + journal={Nature methods}, + volume={11}, + DOI={10.1038/nmeth.3103}, + number={11}, + pages={1144--1146}, + year={2014}, + publisher={Nature Publishing Group US New York} +} + + +@article{nissen2021improved, + title={Improved metagenome binning and assembly using deep variational 
autoencoders}, + author={Nissen, Jakob Nybo and Johansen, Joachim and Alles{\o}e, Rosa Lundbye and S{\o}nderby, Casper Kaae and Armenteros, Jose Juan Almagro and Gr{\o}nbech, Christopher Heje and Jensen, Lars Juhl and Nielsen, Henrik Bj{\o}rn and Petersen, Thomas Nordahl and Winther, Ole and others}, + journal={Nature biotechnology}, + volume={39}, + DOI={10.1038/s41587-020-00777-4}, + number={5}, + pages={555--560}, + year={2021}, + publisher={Nature Publishing Group US New York} +} + + +@article{sieber2018dastool, + title={Recovery of genomes from metagenomes via a dereplication, aggregation and scoring strategy}, + author={Sieber, Christian MK and Probst, Alexander J and Sharrar, Allison and Thomas, Brian C and Hess, Matthias and Tringe, Susannah G and Banfield, Jillian F}, + journal={Nature microbiology}, + volume={3}, + DOI={10.1038/s41564-018-0171-1}, + number={7}, + pages={836--843}, + year={2018}, + publisher={Nature Publishing Group UK London} +} + + +@article{ruhlemann2022magscot, + title={MAGScoT: a fast, lightweight and accurate bin-refinement tool}, + author={R{\"u}hlemann, Malte Christoph and Wacker, Eike Matthias and Ellinghaus, David and Franke, Andre}, + journal={Bioinformatics}, + volume={38}, + DOI={10.1093/bioinformatics/btac694}, + number={24}, + pages={5430--5433}, + year={2022}, + publisher={Oxford University Press} +} + +@article{uritskiy2018metawrap, + title={MetaWRAP—a flexible pipeline for genome-resolved metagenomic data analysis}, + author={Uritskiy, Gherman V and DiRuggiero, Jocelyne and Taylor, James}, + journal={Microbiome}, + volume={6}, + DOI={10.1186/s40168-018-0541-1}, + number={1}, + pages={1--13}, + year={2018}, + publisher={BioMed Central} +} + +@article{meyer2022critical, + title={Critical assessment of metagenome interpretation: the second round of challenges}, + author={Meyer, Fernando and Fritz, Adrian and Deng, Zhi-Luo and Koslicki, David and Lesker, Till Robin and Gurevich, Alexey and Robertson, Gary and Alser, Mohammed 
and Antipov, Dmitry and Beghini, Francesco and others}, + journal={Nature methods}, + volume={19}, + DOI={10.1038/s41592-022-01431-4}, + number={4}, + pages={429--440}, + year={2022}, + publisher={Nature Publishing Group US New York} +} + +@article{parks2015checkm, + title={CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes}, + author={Parks, Donovan H and Imelfort, Michael and Skennerton, Connor T and Hugenholtz, Philip and Tyson, Gene W}, + journal={Genome research}, + volume={25}, + DOI={10.1101/gr.186072.114}, + number={7}, + pages={1043--1055}, + year={2015}, + publisher={Cold Spring Harbor Lab} +} + +@article{chklovski2023checkm2, + title={CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning}, + author={Chklovski, Alex and Parks, Donovan H and Woodcroft, Ben J and Tyson, Gene W}, + journal={Nature Methods}, + volume={20}, + DOI={10.1038/s41592-023-01940-w}, + number={8}, + pages={1203--1212}, + year={2023}, + publisher={Nature Publishing Group US New York} +} + +@article{buchfink2015diamond, + title={Fast and sensitive protein alignment using DIAMOND}, + author={Buchfink, Benjamin and Xie, Chao and Huson, Daniel H}, + journal={Nature methods}, + volume={12}, + DOI={10.1038/nmeth.3176}, + number={1}, + pages={59--60}, + year={2015}, + publisher={Nature Publishing Group US New York} +} + +@article{larralde2022pyrodigal, + title={Pyrodigal: Python bindings and interface to Prodigal, an efficient method for gene prediction in prokaryotes}, + author={Larralde, Martin}, + journal={Journal of Open Source Software}, + volume={7}, + DOI={10.21105/joss.04296}, + number={72}, + pages={4296}, + year={2022} +} + +@article{hyatt2010prodigal, + title={Prodigal: prokaryotic gene recognition and translation initiation site identification}, + author={Hyatt, Doug and Chen, Gwo-Liang and LoCascio, Philip F and Land, Miriam L and Larimer, Frank W and Hauser, Loren J}, + 
journal={BMC bioinformatics}, + volume={11}, + DOI={10.1186/1471-2105-11-119}, + pages={1--11}, + year={2010}, + publisher={Springer} +} + + + +@article{metagWGS_inprep, + title={MetagWGS, a complete workflow to analyse metagenomic data (from Illumina reads or PacBio HiFi reads)}, + author={Mainguy, Jean and Vienne, Maïna and Fourquet, Joanna and Darbot, Vincent and Noirot, Céline and Castinel, Adrien and Combes, Sylvie and Gaspin, Christine and Milan, Denis and Donnadieu, Cécile and Iampietro, Carole and Bouchez, Olivier and Pascal, Géraldine and Hoede, Claire}, + journal={Journal}, + year={in preparation} + +} + diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..8b8bdf2 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,49 @@ +--- +title: 'Binette: a fast and accurate bin refinement tool to construct high quality Metagenome Assembled Genomes.' +tags: + - Python + - Metagenomics + - Binning + - Bin refinement + - MAGs + +authors: + - name: Jean Mainguy + orcid: 0009-0006-9160-9744 + affiliation: "1, 2" + - name: Claire Hoede + orcid: 0000-0001-5054-7731 + affiliation: "1, 2" + corresponding: true +affiliations: + - name: Université de Toulouse, INRAE, BioinfOmics, GenoToul Bioinformatics facility, 31326, Castanet-Tolosan, France + index: 1 + - name: Université de Toulouse, INRAE, UR 875 MIAT, 31326, Castanet-Tolosan, France + index: 2 +date: 30 november 2023 +bibliography: paper.bib +--- + + +# Statement of need +Metagenomics enables the study of microbial communities and their individual members through shotgun sequencing. An essential phase of metagenomic analysis is the recovery of metagenome-assembled genomes (MAGs). MAGs serve as a gateway to additional analyses, including the exploration of organism-specific metabolic pathways, and form the basis for comprehensive large-scale metagenomic surveys [@Nayfach2019global_human_gut_microbiome;@Acinas_Sánchez_et_al_2021]. 
+
+In a metagenomic analysis, sequence reads are first assembled into longer sequences called contigs. These contigs are then grouped into bins based on common characteristics in a process called binning to obtain MAGs. There are several tools that can be used to bin contigs into MAGs. These tools are based on various statistical and machine learning methods and use contig characteristics such as tetranucleotide frequencies, GC content and similar abundances across samples [@kang2019metabat;@alneberg2014concoct;@nissen2021improved].
+
+The approach of applying multiple binning methods and combining them has proven useful to obtain more and better quality MAGs from metagenomic datasets. This combination process is called bin-refinement and several tools exist to perform such tasks, such as DASTool [@sieber2018dastool], MAGScoT [@ruhlemann2022magscot] and the bin-refinement module of the metaWRAP pipeline [@uritskiy2018metawrap]. Of these, metaWRAP's bin-refinement tool has demonstrated remarkable efficiency in benchmark analysis [@meyer2022critical]. However, it has certain limitations, most notably its inability to integrate more than three binning results. In addition, it repeatedly uses CheckM [@parks2015checkm] to assess bin quality throughout its execution, which contributes to its slower performance. Furthermore, since it is embedded in a larger framework, it may present challenges when attempting to integrate it into an independent analysis pipeline.
+
+We present Binette, a bin refinement tool inspired by metaWRAP's bin refinement module, which addresses the limitations of the latter and ensures better results.
+
+# Summary
+Binette is a Python reimplementation and enhanced version of the bin refinement module used in metaWRAP. It takes as input sets of bins generated by various binning tools. Using these input bin sets, Binette constructs new hybrid bins using basic set operations.
Specifically, a bin can be defined as a set of contigs, and when two or more bins share at least one contig, Binette generates new bins based on their intersection, difference, and union (\autoref{fig:overview}.A). This approach differs from metaWRAP, which exclusively generates hybrid bins based on bin intersections and allows Binette to expand the range of possible bins. + + +![**Overview of Binette Steps**. **(A) Intermediate Bin Creation Example**: Bins are represented as square shapes, each containing colored lines representing the contigs they contain. Creation of intermediate bins involves the initial bins sharing at least one contig. Set operations are applied to the contigs within the bins to generate these intermediate bins. **(B) Binette Workflow Overview**: Input bins serve as the basis for generating intermediate bins. Each bin undergoes a scoring process utilizing quality metrics provided by CheckM2. Subsequently, the bins are sorted based on their scores, and a selection process is executed to retain non-redundant bins.\label{fig:overview}](./binette_overview.pdf) + + +Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1 as in the metaWRAP pipeline. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. 
Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that provides bindings and an interface to Prodigal [@hyatt2010prodigal]. The intermediate CheckM2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process.
+
+Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS_inprep], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools.
+
+
+# References
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8ff3ff8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,68 @@
+[build-system]
+requires = ["setuptools>=61.0.0", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "Binette"
+description = "Binette: accurate binning refinement tool to construct high quality MAGs."
+dynamic = ["version"] +authors = [ + {name = "Jean Mainguy"}, +] +maintainers = [ + {name = "Jean Mainguy"}, +] + +readme = "README.md" +keywords = ["Bioinformatics", "Prokaryote", "Binning", "Refinement", "Metagenomics"] +classifiers=[ + "Environment :: Console", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics"] +requires-python = ">=3.8" +license = {file="LICENCE"} + +[project.optional-dependencies] +main_deps = [ + "checkm2==1.*", + "networkx==3.*", + "numpy==1.19.2", + "packaging==23.*", + "pandas==1.4.0", + "pyfastx==2.*", + "pyrodigal==2.*", + "requests==2.*", + "tqdm==4.*", +] + +doc = [ + "sphinx==6.2.1", + "sphinx_rtd_theme==1.2.2", + "readthedocs-sphinx-search==0.3.1", + "sphinx-autobuild==2021.3.14", + "myst-parser==1.0.0", + "docutils==0.18.1" +] +dev = [ + "pytest>=7.0.0", + "pytest-cov" +] +# +[project.urls] +Repository = "https://github.com/genotoul-bioinfo/Binette" +#Changelog = "https://github.com/me/spam/blob/master/CHANGELOG.md" +Documentation = "https://binette.readthedocs.io" +# +# +[project.scripts] +binette = "binette.main:main" + +[tool.setuptools] +packages = ["binette"] + + +[tool.setuptools.dynamic] +version = {attr = "binette.__version__"} diff --git a/setup.py b/setup.py deleted file mode 100644 index 36029dd..0000000 --- a/setup.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -from os import path -from setuptools import setup, find_packages - -if __name__ == "__main__": - # Get the long description from the README file - setup_dir = path.abspath(path.dirname(__file__)) - with open(path.join(setup_dir, "README.md"), encoding="utf-8") as f: - long_description = f.read() - - setup( - name="binette", - version="0.1.6", - author="Jean Mainguy", - packages=find_packages(), - entry_points={"console_scripts": ["binette = binette.binette:main"]}, - 
url="https://github.com/genotoul-bioinfo/Binette", - license="MIT", - description="Binette: accurate binning refinement tool to constructs high quality MAGs.", - long_description=(long_description), - long_description_content_type="text/markdown", - install_requires=[],#"pyrodigal", "pyfastx", "networkx", "checkm2"], - ) diff --git a/tests/bin_manager_test.py b/tests/bin_manager_test.py index 73e1d7f..4daa0bf 100644 --- a/tests/bin_manager_test.py +++ b/tests/bin_manager_test.py @@ -5,7 +5,7 @@ import pytest -from binette import bin_manager, binette +from binette import bin_manager import networkx as nx def test_get_all_possible_combinations(): diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index f4ebb13..e93dcea 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -1,7 +1,7 @@ import pytest import logging -from binette.binette import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main +from binette.main import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main from binette.bin_manager import Bin from binette import diamond import os