From 397a8ccb4cf6282a0f2b258ccef848dae7230afa Mon Sep 17 00:00:00 2001
From: unknown
Date: Fri, 1 Sep 2023 21:39:35 +0800
Subject: [PATCH] Disable button while computation is ongoing, enable it when
 not, and revise code for clearing dcc store data

---
 .dockerignore | 404 ++--
 .github/ISSUE_TEMPLATE/bug_report.md | 64 +-
 .github/ISSUE_TEMPLATE/feature_request.md | 40 +-
 .github/workflows/check-for-syntax-errors.yml | 16 +-
 .github/workflows/delete-package-versions.yml | 60 +-
 .github/workflows/dockerize-and-publish.yml | 134 +-
 .github/workflows/mirror-main-to-demo.yml | 40 +-
 .gitignore | 400 ++--
 .vscode/settings.json | 46 +-
 CONTRIBUTING.md | 50 +-
 Dockerfile-app | 50 +-
 Dockerfile-workflow | 68 +-
 LICENSE | 42 +-
 README.md | 22 +-
 app.py | 564 +++----
 assets/1-global.css | 140 +-
 assets/2-homepage.css | 210 +--
 assets/3-table.css | 114 +-
 assets/4-input-elem.css | 58 +-
 assets/5-graph.css | 10 +-
 assets/6-igv.css | 4 +-
 assets/7-loading.css | 36 +-
 assets/8-tooltips.css | 22 +-
 assets/9-analysis.css | 54 +-
 callbacks/branch.py | 50 +-
 callbacks/browse_loci/callbacks.py | 436 +++---
 callbacks/browse_loci/util.py | 118 +-
 callbacks/coexpression/callbacks.py | 1030 ++++++------
 callbacks/coexpression/util.py | 1288 +++++++--------
 callbacks/constants.py | 128 +-
 callbacks/file_util.py | 238 +--
 callbacks/general_util.py | 112 +-
 callbacks/homepage/callbacks.py | 368 ++---
 callbacks/homepage/util.py | 163 +-
 callbacks/lift_over/callbacks.py | 850 +++++-----
 callbacks/lift_over/util.py | 1384 ++++++++---------
 callbacks/links_util.py | 116 +-
 callbacks/style_util.py | 22 +-
 callbacks/text_mining/callbacks.py | 286 ++--
 callbacks/text_mining/util.py | 188 +--
 callbacks/tf_enrich/callbacks.py | 306 ++--
 callbacks/tf_enrich/util.py | 360 ++---
 dependencies/install-libraries-workflow.r | 22 +-
 dependencies/requirements-app.txt | 24 +-
 dependencies/requirements-workflow.txt | 28 +-
 pages/analysis/browse_loci.py | 122 +-
 pages/analysis/co_expr.py | 802 +++++-----
 pages/analysis/lift_over.py | 254 +--
 pages/analysis/text_mining.py | 230 +--
 pages/analysis/tf_enrich.py | 268 ++--
 pages/analysis_layout.py | 62 +-
 pages/homepage.py | 348 ++---
 pages/navigation/analysis_nav.py | 56 +-
 pages/navigation/main_nav.py | 100 +-
 prepare_data/workflow/Snakefile | 40 +-
 prepare_data/workflow/configfile.yaml | 74 +-
 .../rules/last_whole_genome_alignment.smk | 158 +-
 .../workflow/rules/prepare_annotation.smk | 38 +-
 .../rules/prepare_gene_descriptions.smk | 2 +-
 .../transcription_factor_binding_sites.smk | 58 +-
 .../ontology_enrichment/go-enrichment.r | 152 +-
 .../ontology_enrichment/po-enrichment.r | 158 +-
 .../ontology_enrichment/to-enrichment.r | 158 +-
 .../pathway_enrichment/ora-enrichment.r | 140 +-
 .../pathway_enrichment/pe-enrichment.r | 122 +-
 .../pathway_enrichment/spia-enrichment.r | 198 +--
 .../util/aggregate-go-annotations.py | 278 ++--
 .../util/aggregate-po-annotations.py | 182 +--
 .../util/aggregate-to-annotations.py | 176 +--
 .../util/file-convert-msu.py | 106 +-
 .../util/get-genes-in-pathway-dict.py | 92 +-
 .../util/get-genes-in-pathway.r | 48 +-
 .../util/msu-to-entrez-id.py | 90 +-
 .../util/msu-to-transcript-id.py | 204 +--
 .../util/ricegeneid-msu-to-transcript-id.r | 100 +-
 .../util/transcript-to-msu-id.py | 82 +-
 .../prepare_df_rgi_gene_description.py | 56 +-
 .../scripts/get_promoter_sequences.py | 94 +-
 prepare_data/workflow/scripts/gff_db.py | 24 +-
 .../detect-modules-via-coach.py | 86 +-
 .../detect-modules-via-demon.py | 78 +-
 ...te-mapping-from-networkx-int-edge-graph.py | 100 +-
.../get-modules-from-clusterone-results.py | 100 +- .../restore-node-labels-in-modules.py | 82 +- .../network_util/convert-to-int-edge-list.py | 110 +- .../scripts/ogi_mapping/generate-ogi-dicts.py | 220 +-- .../workflow/scripts/qtaro/prepare_qtaro.py | 106 +- .../text_mining/get_pubmed_per_gene.py | 988 ++++++------ .../workflow/scripts/tfbs/get_chr_sizes.py | 36 +- prepare_data/workflow/scripts/tfbs/get_fam.py | 96 +- .../scripts/tfbs/get_promoter_sequences.py | 94 +- .../scripts/tfbs/get_promoter_sizes.py | 50 +- .../scripts/tfbs/get_tfbs_intervals.py | 118 +- uwsgi.ini | 10 +- 94 files changed, 8772 insertions(+), 8739 deletions(-) diff --git a/.dockerignore b/.dockerignore index 7ce61c4f..db02b76e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,203 +1,203 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# Data folder -data/ -data.zip -static/ -enrichment_analysis/data/ - -# History files -.Rhistory -.Rapp.history - -# Session Data files -.RData -.RDataTmp - -# User-specific files -.Ruserdata - -# Example code in package build process -*-Ex.R - -# Output files from R CMD build -/*.tar.gz - -# Output files from R CMD check -/*.Rcheck/ - -# RStudio files -.Rproj.user/ - -# produced vignettes -vignettes/*.html -vignettes/*.pdf - -# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 -.httr-oauth - -# knitr and R markdown default cache directories -*_cache/ -/cache/ - -# Temporary files created by R markdown -*.utf8.md -*.knit.md - -# R Environment Variables -.Renviron - -# pkgdown site -docs/ - -# translation temp files -po/*~ - -# RStudio Connect folder -rsconnect/ - -# Scratch -scratch/ -scratch.ipynb -scratch.py -*dummy* - -# Scripts for bulk processing -*-bulk.* -*-bulk*.* - -# Binary files -prepare_data/**/LazyFox -*.jar - -.github/ -.vscode/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Data folder +data/ +data.zip +static/ +enrichment_analysis/data/ + +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + +# Scratch +scratch/ +scratch.ipynb +scratch.py +*dummy* + +# Scripts for bulk processing +*-bulk.* +*-bulk*.* + +# Binary files +prepare_data/**/LazyFox +*.jar + +.github/ +.vscode/ Dockerfile* \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 651c92c5..5c9795c0 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,32 +1,32 @@ ---- -name: Bug report -about: Create a report to help us improve -title: '' -labels: 'bug' -assignees: '' - ---- - -**Describe the bug** -A clear and concise description of what the bug is. - -**To reproduce** -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '...' -3. Scroll down to '...' -4. See error - -**Expected behavior** -A clear and concise description of what you expected to happen. - -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Platform** - - OS (Please indicate the version as well): [e.g., Windows 10, Ubuntu Jammy Jellyfish] - - Browser: [e.g., Chrome, Safari] - - Browser version: [e.g., 114.0.5735.199. If unspecified, the latest browser version will be assumed] - -**Additional context** -Add any other context about the problem here. +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: 'bug' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '...' +3. Scroll down to '...' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Platform** + - OS (Please indicate the version as well): [e.g., Windows 10, Ubuntu Jammy Jellyfish] + - Browser: [e.g., Chrome, Safari] + - Browser version: [e.g., 114.0.5735.199. If unspecified, the latest browser version will be assumed] + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 36014cde..a1e43a6c 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,20 +1,20 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: '' -labels: 'enhancement' -assignees: '' - ---- - -**Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] - -**Describe the solution you'd like** -A clear and concise description of what you want to happen. - -**Describe alternatives you've considered** -A clear and concise description of any alternative solutions or features you've considered. - -**Additional context** -Add any other context or screenshots about the feature request here. +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: 'enhancement' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/workflows/check-for-syntax-errors.yml b/.github/workflows/check-for-syntax-errors.yml index 300b7b17..7e39332b 100644 --- a/.github/workflows/check-for-syntax-errors.yml +++ b/.github/workflows/check-for-syntax-errors.yml @@ -1,8 +1,8 @@ -on: [push, pull_request] -name: Check for syntax errors -jobs: - check-for-syntax-errors: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: cclauss/Find-Python-syntax-errors-action@master +on: [push, pull_request] +name: Check for syntax errors +jobs: + check-for-syntax-errors: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: cclauss/Find-Python-syntax-errors-action@master diff --git a/.github/workflows/delete-package-versions.yml b/.github/workflows/delete-package-versions.yml index 9762a1a3..280c72d5 100644 --- a/.github/workflows/delete-package-versions.yml +++ b/.github/workflows/delete-package-versions.yml @@ -1,30 +1,30 @@ -name: Delete package versions - -on: - workflow_run: - workflows: [Create and publish a Docker image] - types: [completed] - branches: [main] - -jobs: - delete-versions: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} - - steps: - - uses: actions/delete-package-versions@v4 - with: - package-name: 'rice-pilaf/app' - package-type: 'container' - min-versions-to-keep: 1 - - uses: actions/delete-package-versions@v4 - with: - package-name: 'rice-pilaf/workflow' - package-type: 'container' - min-versions-to-keep: 1 - - on-failure: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'failure' }} - steps: - - run: echo 'The triggering workflow failed' +name: Delete package versions + +on: + workflow_run: + workflows: [Create and publish a Docker image] + types: [completed] + branches: [main] + +jobs: + delete-versions: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + + steps: + - uses: actions/delete-package-versions@v4 + with: + package-name: 'rice-pilaf/app' + package-type: 'container' + 
min-versions-to-keep: 1 + - uses: actions/delete-package-versions@v4 + with: + package-name: 'rice-pilaf/workflow' + package-type: 'container' + min-versions-to-keep: 1 + + on-failure: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + steps: + - run: echo 'The triggering workflow failed' diff --git a/.github/workflows/dockerize-and-publish.yml b/.github/workflows/dockerize-and-publish.yml index b8d2cba8..24b356f1 100644 --- a/.github/workflows/dockerize-and-publish.yml +++ b/.github/workflows/dockerize-and-publish.yml @@ -1,67 +1,67 @@ -name: Create and publish a Docker image - -on: - workflow_run: - workflows: [Check for syntax errors] - types: [completed] - branches: [main] - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - build-and-push-image: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} - strategy: - fail-fast: false - matrix: # Cannot use env variables inside matrix - include: - - dockerfile: Dockerfile-app # Change to Dockerfile of app - image: ghcr.io/bioinfodlsu/rice-pilaf/app - - dockerfile: Dockerfile-workflow # Change to Dockerfile of workflow - image: ghcr.io/bioinfodlsu/rice-pilaf/workflow - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ matrix.image }} - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Log in to the Container registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push Docker image - uses: docker/build-push-action@v3 - with: - context: . 
- cache-from: type=gha - cache-to: type=gha, mode=max - file: ${{ matrix.dockerfile }} - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - - on-failure: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'failure' }} - steps: - - run: echo 'The triggering workflow failed' +name: Create and publish a Docker image + +on: + workflow_run: + workflows: [Check for syntax errors] + types: [completed] + branches: [main] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + strategy: + fail-fast: false + matrix: # Cannot use env variables inside matrix + include: + - dockerfile: Dockerfile-app # Change to Dockerfile of app + image: ghcr.io/bioinfodlsu/rice-pilaf/app + - dockerfile: Dockerfile-workflow # Change to Dockerfile of workflow + image: ghcr.io/bioinfodlsu/rice-pilaf/workflow + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ matrix.image }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Log in to the Container registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . + cache-from: type=gha + cache-to: type=gha, mode=max + file: ${{ matrix.dockerfile }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + on-failure: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + steps: + - run: echo 'The triggering workflow failed' diff --git a/.github/workflows/mirror-main-to-demo.yml b/.github/workflows/mirror-main-to-demo.yml index d9613f9c..ec0d0586 100644 --- a/.github/workflows/mirror-main-to-demo.yml +++ b/.github/workflows/mirror-main-to-demo.yml @@ -1,20 +1,20 @@ -name: Mirror main to demo branch - -on: - workflow_run: - workflows: [Check for syntax errors] - types: [completed] - branches: [main] - -jobs: - mirror_job: - runs-on: ubuntu-latest - name: Mirror main branch to demo branch - steps: - - name: Mirror action step - id: mirror - uses: google/mirror-branch-action@v2.0 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - source: 'main' - dest: 'demo' +name: Mirror main to demo branch + +on: + workflow_run: + workflows: [Check for syntax errors] + types: [completed] + branches: [main] + +jobs: + mirror_job: + runs-on: ubuntu-latest + name: Mirror main branch to demo branch + steps: + - name: Mirror action step + id: mirror + uses: google/mirror-branch-action@v2.0 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + source: 'main' + dest: 'demo' diff --git a/.gitignore b/.gitignore index 33546701..810f6ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,201 +1,201 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are 
written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# Data folder -data/ -data.zip -static/ -enrichment_analysis/data/ - -# History files -.Rhistory -.Rapp.history - -# Session Data files -.RData -.RDataTmp - -# User-specific files -.Ruserdata - -# Example code in package build process -*-Ex.R - -# Output files from R CMD build -/*.tar.gz - -# Output files from R CMD check -/*.Rcheck/ - -# RStudio files -.Rproj.user/ - -# produced vignettes -vignettes/*.html -vignettes/*.pdf - -# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 -.httr-oauth - -# knitr and R markdown default cache directories -*_cache/ -/cache/ - -# Temporary files created by R markdown -*.utf8.md -*.knit.md - -# R Environment Variables -.Renviron - -# pkgdown site -docs/ - -# translation temp files -po/*~ - -# RStudio Connect folder -rsconnect/ - -# Scratch -scratch/ -scratch.ipynb -scratch.py -*dummy* -generic-enrichment.r -app-1.py - -# Scripts for bulk processing -*-bulk.* -*-bulk*.* - -# Binary files -prepare_data/**/LazyFox +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Data folder +data/ +data.zip +static/ +enrichment_analysis/data/ + +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + +# Scratch +scratch/ +scratch.ipynb +scratch.py +*dummy* +generic-enrichment.r +app-1.py + +# Scripts for bulk processing +*-bulk.* +*-bulk*.* + +# Binary files +prepare_data/**/LazyFox *.jar \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index cfaa0770..a8054ea2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,23 +1,23 @@ -{ - "[python]": { - "editor.defaultFormatter": "ms-python.autopep8", - "editor.formatOnSave": true - }, - "[r]": { - "editor.defaultFormatter": "REditorSupport.r", - "editor.formatOnSave": true - }, - "python.formatting.provider": "none", - "editor.codeActionsOnSave": { - "source.organizeImports": true - }, - "saveAndRunExt": { - "commands": [ - { - "match": ".*\\.py", - "isShellCommand": true, - "cmd": "autoflake -i --remove-all-unused-imports --remove-unused-variables ${file}" - } - ] - } -} +{ + "[python]": { + "editor.defaultFormatter": "ms-python.autopep8", + "editor.formatOnSave": true + }, + "[r]": { + "editor.defaultFormatter": "REditorSupport.r", + "editor.formatOnSave": true + }, + "python.formatting.provider": "none", + "editor.codeActionsOnSave": { + "source.organizeImports": true + }, + "saveAndRunExt": { + "commands": [ + { + "match": ".*\\.py", + "isShellCommand": true, + "cmd": "autoflake -i --remove-all-unused-imports 
--remove-unused-variables ${file}" + } + ] + } +} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 27b57816..3e39ebc0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,25 +1,25 @@ -Welcome! -## Table of Contents -## Reporting Bugs -## Requesting Features -## Contributing Workflow -1. Read wiki -2. Fork repository and clone -3. Set up the environment - - OS - - Manual Setup - - Docker: https://learn.microsoft.com/en-us/training/modules/use-docker-container-dev-env-vs-code/ -5. Implement the feature - - Refer to the wiki - - Note on code formatting and linting -6. Dockerize - - Refer to the wiki -7. Submit pull request - - Pull requests with merge conflicts - - Have to pass the check for syntax errors -9. Update documentation - - Wiki documentation - -## License -## Code of Conduct -## Contact +Welcome! +## Table of Contents +## Reporting Bugs +## Requesting Features +## Contributing Workflow +1. Read wiki +2. Fork repository and clone +3. Set up the environment + - OS + - Manual Setup + - Docker: https://learn.microsoft.com/en-us/training/modules/use-docker-container-dev-env-vs-code/ +5. Implement the feature + - Refer to the wiki + - Note on code formatting and linting +6. Dockerize + - Refer to the wiki +7. Submit pull request + - Pull requests with merge conflicts + - Have to pass the check for syntax errors +9. Update documentation + - Wiki documentation + +## License +## Code of Conduct +## Contact diff --git a/Dockerfile-app b/Dockerfile-app index d079ab8d..b26e43a3 100644 --- a/Dockerfile-app +++ b/Dockerfile-app @@ -1,25 +1,25 @@ -FROM tiangolo/uwsgi-nginx-flask:python3.10 - -COPY . /app -WORKDIR /app - -RUN set -ex - -RUN echo 'deb [trusted=yes] http://cloud.r-project.org/bin/linux/debian bullseye-cran40/' >> /etc/apt/sources.list - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3-dev \ - python3-pip \ - && apt-get clean - -RUN pip3 install --no-cache-dir -r dependencies/requirements-app.txt - -# Install mcdp2 -RUN cd ../ \ - && git clone https://github.com/fmfi-compbio/mcdp2 \ - && cd mcdp2 \ - && git reset --hard fd7c69f5e97db8c1052df859cb02d86533287e64 \ - && pip3 install . \ - && cd ../app +FROM tiangolo/uwsgi-nginx-flask:python3.10 + +COPY . /app +WORKDIR /app + +RUN set -ex + +RUN echo 'deb [trusted=yes] http://cloud.r-project.org/bin/linux/debian bullseye-cran40/' >> /etc/apt/sources.list + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3-dev \ + python3-pip \ + && apt-get clean + +RUN pip3 install --no-cache-dir -r dependencies/requirements-app.txt + +# Install mcdp2 +RUN cd ../ \ + && git clone https://github.com/fmfi-compbio/mcdp2 \ + && cd mcdp2 \ + && git reset --hard fd7c69f5e97db8c1052df859cb02d86533287e64 \ + && pip3 install . \ + && cd ../app diff --git a/Dockerfile-workflow b/Dockerfile-workflow index ef6e21b2..88f3ed99 100644 --- a/Dockerfile-workflow +++ b/Dockerfile-workflow @@ -1,34 +1,34 @@ -FROM tiangolo/uwsgi-nginx-flask:python3.10 - -COPY . 
/app -WORKDIR /app - -RUN set -ex - -RUN echo 'deb [trusted=yes] http://cloud.r-project.org/bin/linux/debian bullseye-cran40/' >> /etc/apt/sources.list - -RUN apt-get clean \ - && apt-get update \ - && apt-get install -y \ - build-essential \ - git \ - libcurl4-openssl-dev \ - libffi-dev \ - libfontconfig1-dev \ - libssl-dev \ - libxml2-dev \ - python3-dev \ - python3-pip \ - r-base - -RUN pip3 install --no-cache-dir -r dependencies/requirements-workflow.txt - -# Install mcdp2 -RUN cd ../ \ - && git clone https://github.com/fmfi-compbio/mcdp2 \ - && cd mcdp2 \ - && git reset --hard fd7c69f5e97db8c1052df859cb02d86533287e64 \ - && pip3 install . \ - && cd ../app - -RUN Rscript --vanilla dependencies/install-libraries-workflow.r +FROM tiangolo/uwsgi-nginx-flask:python3.10 + +COPY . /app +WORKDIR /app + +RUN set -ex + +RUN echo 'deb [trusted=yes] http://cloud.r-project.org/bin/linux/debian bullseye-cran40/' >> /etc/apt/sources.list + +RUN apt-get clean \ + && apt-get update \ + && apt-get install -y \ + build-essential \ + git \ + libcurl4-openssl-dev \ + libffi-dev \ + libfontconfig1-dev \ + libssl-dev \ + libxml2-dev \ + python3-dev \ + python3-pip \ + r-base + +RUN pip3 install --no-cache-dir -r dependencies/requirements-workflow.txt + +# Install mcdp2 +RUN cd ../ \ + && git clone https://github.com/fmfi-compbio/mcdp2 \ + && cd mcdp2 \ + && git reset --hard fd7c69f5e97db8c1052df859cb02d86533287e64 \ + && pip3 install . \ + && cd ../app + +RUN Rscript --vanilla dependencies/install-libraries-workflow.r diff --git a/LICENSE b/LICENSE index 0afae2e4..dd58ad3c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2023 Bioinformatics Lab, De La Salle University Manila - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2023 Bioinformatics Lab, De La Salle University Manila + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index ac652051..a76b7575 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# RicePilaf -Welcome to RicePilaf, a post-GWAS/QTL analysis dashboard for rice genomes. - -## Installation and usage -Please visit the RicePilaf [wiki](https://github.com/bioinfodlsu/rice-pilaf/wiki). - -## Demo -A demo version can be seen [here](http://ricepilaf.bioinfodlsu.com/). - -## Contact -If you have issues, concerns, questions, please contact: Anish Shrestha (anish.shrestha --atmark-- dlsu.edu.ph) +# RicePilaf +Welcome to RicePilaf, a post-GWAS/QTL analysis dashboard for rice genomes. + +## Installation and usage +Please visit the RicePilaf [wiki](https://github.com/bioinfodlsu/rice-pilaf/wiki). + +## Demo +A demo version can be seen [here](http://ricepilaf.bioinfodlsu.com/). + +## Contact +If you have issues, concerns, questions, please contact: Anish Shrestha (anish.shrestha --atmark-- dlsu.edu.ph) diff --git a/app.py b/app.py index 93564ec3..c1842bc4 100644 --- a/app.py +++ b/app.py @@ -1,282 +1,282 @@ -import dash -import dash_bootstrap_components as dbc -from dash import dcc, html -import sqlite3 - -import pages.navigation.main_nav as main_nav - -import callbacks.homepage.callbacks -import callbacks.homepage.util -import callbacks.lift_over.callbacks -import callbacks.browse_loci.callbacks -import callbacks.coexpression.callbacks -import callbacks.tf_enrich.callbacks -import callbacks.text_mining.callbacks - -from callbacks.branch import * -from callbacks.constants import * -from callbacks.file_util import * - -from flask import Flask - -server = Flask(__name__, static_folder='static') -app = dash.Dash(__name__, use_pages=True, - external_stylesheets=[dbc.themes.BOOTSTRAP, - dbc.icons.BOOTSTRAP, dbc.icons.FONT_AWESOME], - server=server, - title='RicePilaf', - update_title='Loading...') - -welcome = dcc.Markdown( - ''' - Welcome ! Rice Pilaf is short for Rice Post-GWAS/QTL Dashboard. - Ok, we are not good at abbreviations, but like a good pilaf, this dashboard combines many ingredients. - With this tool, you can do amazing things like ... (write me) - ''' -) - - -# ============ -# Main Layout -# ============ - - -app.layout = lambda: dbc.Container([ - dbc.Row( - html.Div( - children=[ - html.P([ - 'This is a demo version. 
Click ', - dcc.Link( - ['here ', html.I( - id='demo-link', - className='fa-solid fa-up-right-from-square fa-2xs' - )], - href='https://github.com/bioinfodlsu/rice-pilaf/wiki/1.-Installation', - target='_blank', - className='top-navbar-item' - ), - ' to install.'], className='my-auto' - ) - ], - className='banner d-flex justify-content-center py-1 text-white', - id='demo-banner' - ), - style=show_if_in_demo_branch() - ), - - dbc.Row(main_nav.navbar()), - - dash.page_container, - - # Session storage - html.Div( - id='session-container', - children=[ - # ========= - # Homepage - # ========= - dcc.Store( - id='homepage-is-submitted', - storage_type='session' - ), - - dcc.Store( - id='homepage-genomic-intervals-saved-input', - storage_type='session' - ), - - dcc.Store( - id='homepage-genomic-intervals-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='current-analysis-page-nav', - storage_type='session' - ), - - - - # ========== - # Lift-over - # ========== - dcc.Store( - id='lift-over-is-submitted', - storage_type='session' - ), - - dcc.Store( - id='lift-over-active-tab', - storage_type='session' - ), - - dcc.Store( - id='lift-over-other-refs-saved-input', - storage_type='session' - ), - - dcc.Store( - id='lift-over-other-refs-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='lift-over-active-filter', - storage_type='session' - ), - - dcc.Store( - id='lift-over-nb-table', - storage_type='session' - ), - - dcc.Store( - id='lift-over-nb-entire-table', - storage_type='session' - ), - - # ============ - # IGV Browser - # ============ - dcc.Store( - id='igv-selected-genomic-intervals-saved-input', - storage_type='session' - ), - - dcc.Store( - id='igv-selected-genomic-intervals-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='igv-selected-tracks-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='igv-is-submitted', - storage_type='session' - ), - - # ============== - # Co-expression - # ============== - dcc.Store( - id='coexpression-addl-genes-saved-input', - storage_type='session' - ), - - dcc.Store( - id='coexpression-submitted-addl-genes', - storage_type='session' - ), - - dcc.Store( - id='coexpression-combined-genes', - storage_type='session' - ), - - dcc.Store( - id='coexpression-network-saved-input', - storage_type='session' - ), - - dcc.Store( - id='coexpression-submitted-network', - storage_type='session' - ), - - dcc.Store( - id='coexpression-clustering-algo-saved-input', - storage_type='session' - ), - - dcc.Store( - id='coexpression-submitted-clustering-algo', - storage_type='session' - ), - - dcc.Store( - id='coexpression-submitted-parameter-module', - storage_type='session' - ), - - dcc.Store( - id='coexpression-parameter-module-saved-input', - storage_type='session' - ), - - dcc.Store( - id='coexpression-is-submitted', - storage_type='session' - ), - - # ============================== - # Regulatory Feature Enrichment - # ============================== - - dcc.Store( - id='tfbs-saved-input', - storage_type='session' - ), - - dcc.Store( - id='tfbs-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='tfbs-is-submitted', - storage_type='session' - ), - - # ============ - # Text Mining - # ============ - - dcc.Store( - id='text-mining-query-saved-input', - storage_type='session' - ), - - dcc.Store( - id='text-mining-query-submitted-input', - storage_type='session' - ), - - dcc.Store( - id='text-mining-is-submitted', - storage_type='session' - ), - ]) -], fluid=True, className='pb-4') - 
-callbacks.homepage.callbacks.init_callback(app) - -callbacks.lift_over.callbacks.init_callback(app) -callbacks.browse_loci.callbacks.init_callback(app) -callbacks.coexpression.callbacks.init_callback(app) -callbacks.tf_enrich.callbacks.init_callback(app) -callbacks.text_mining.callbacks.init_callback(app) - -# Create database table -const = Constants() -make_dir(const.TEMP) - -try: - connection = sqlite3.connect(const.FILE_STATUS_DB) - cursor = connection.cursor() - - query = f'CREATE TABLE IF NOT EXISTS {const.FILE_STATUS_TABLE} (name TEXT, UNIQUE(name));' - - cursor.execute(query) - connection.commit() - - cursor.close() - connection.close() -except sqlite3.Error as error: - pass - -if __name__ == '__main__': - app.run_server(port='8050', debug=True) +import dash +import dash_bootstrap_components as dbc +from dash import dcc, html +import sqlite3 + +import pages.navigation.main_nav as main_nav + +import callbacks.homepage.callbacks +import callbacks.homepage.util +import callbacks.lift_over.callbacks +import callbacks.browse_loci.callbacks +import callbacks.coexpression.callbacks +import callbacks.tf_enrich.callbacks +import callbacks.text_mining.callbacks + +from callbacks.branch import * +from callbacks.constants import * +from callbacks.file_util import * + +from flask import Flask + +server = Flask(__name__, static_folder='static') +app = dash.Dash(__name__, use_pages=True, + external_stylesheets=[dbc.themes.BOOTSTRAP, + dbc.icons.BOOTSTRAP, dbc.icons.FONT_AWESOME], + server=server, + title='RicePilaf', + update_title='Loading...') + +welcome = dcc.Markdown( + ''' + Welcome ! Rice Pilaf is short for Rice Post-GWAS/QTL Dashboard. + Ok, we are not good at abbreviations, but like a good pilaf, this dashboard combines many ingredients. + With this tool, you can do amazing things like ... (write me) + ''' +) + + +# ============ +# Main Layout +# ============ + + +app.layout = lambda: dbc.Container([ + dbc.Row( + html.Div( + children=[ + html.P([ + 'This is a demo version. 
Click ', + dcc.Link( + ['here ', html.I( + id='demo-link', + className='fa-solid fa-up-right-from-square fa-2xs' + )], + href='https://github.com/bioinfodlsu/rice-pilaf/wiki/1.-Installation', + target='_blank', + className='top-navbar-item' + ), + ' to install.'], className='my-auto' + ) + ], + className='banner d-flex justify-content-center py-1 text-white', + id='demo-banner' + ), + style=show_if_in_demo_branch() + ), + + dbc.Row(main_nav.navbar()), + + dash.page_container, + + # Session storage + html.Div( + id='session-container', + children=[ + # ========= + # Homepage + # ========= + dcc.Store( + id='homepage-is-submitted', + storage_type='session' + ), + + dcc.Store( + id='homepage-genomic-intervals-saved-input', + storage_type='session' + ), + + dcc.Store( + id='homepage-genomic-intervals-submitted-input', + storage_type='session' + ), + + dcc.Store( + id='current-analysis-page-nav', + storage_type='session' + ), + + + + # ========== + # Lift-over + # ========== + dcc.Store( + id='lift-over-is-submitted', + storage_type='session' + ), + + dcc.Store( + id='lift-over-active-tab', + storage_type='session' + ), + + dcc.Store( + id='lift-over-other-refs-saved-input', + storage_type='session' + ), + + dcc.Store( + id='lift-over-other-refs-submitted-input', + storage_type='session' + ), + + dcc.Store( + id='lift-over-active-filter', + storage_type='session' + ), + + dcc.Store( + id='lift-over-nb-table', + storage_type='session' + ), + + dcc.Store( + id='lift-over-nb-entire-table', + storage_type='session' + ), + + # ============ + # IGV Browser + # ============ + dcc.Store( + id='igv-selected-genomic-intervals-saved-input', + storage_type='session' + ), + + dcc.Store( + id='igv-selected-genomic-intervals-submitted-input', + storage_type='session' + ), + + dcc.Store( + id='igv-selected-tracks-submitted-input', + storage_type='session' + ), + + dcc.Store( + id='igv-is-submitted', + storage_type='session' + ), + + # ============== + # Co-expression + # ============== + dcc.Store( + id='coexpression-addl-genes-saved-input', + storage_type='session' + ), + + dcc.Store( + id='coexpression-submitted-addl-genes', + storage_type='session' + ), + + dcc.Store( + id='coexpression-combined-genes', + storage_type='session' + ), + + dcc.Store( + id='coexpression-network-saved-input', + storage_type='session' + ), + + dcc.Store( + id='coexpression-submitted-network', + storage_type='session' + ), + + dcc.Store( + id='coexpression-clustering-algo-saved-input', + storage_type='session' + ), + + dcc.Store( + id='coexpression-submitted-clustering-algo', + storage_type='session' + ), + + dcc.Store( + id='coexpression-submitted-parameter-module', + storage_type='session' + ), + + dcc.Store( + id='coexpression-parameter-module-saved-input', + storage_type='session' + ), + + dcc.Store( + id='coexpression-is-submitted', + storage_type='session' + ), + + # ============================== + # Regulatory Feature Enrichment + # ============================== + + dcc.Store( + id='tfbs-saved-input', + storage_type='session' + ), + + dcc.Store( + id='tfbs-submitted-input', + storage_type='session' + ), + + dcc.Store( + id='tfbs-is-submitted', + storage_type='session' + ), + + # ============ + # Text Mining + # ============ + + dcc.Store( + id='text-mining-query-saved-input', + storage_type='session' + ), + + dcc.Store( + id='text-mining-query-submitted-input', + storage_type='session', + ), + + dcc.Store( + id='text-mining-is-submitted', + storage_type='session' + ), + ]) +], fluid=True, className='pb-4') + 
+callbacks.homepage.callbacks.init_callback(app) + +callbacks.lift_over.callbacks.init_callback(app) +callbacks.browse_loci.callbacks.init_callback(app) +callbacks.coexpression.callbacks.init_callback(app) +callbacks.tf_enrich.callbacks.init_callback(app) +callbacks.text_mining.callbacks.init_callback(app) + +# Create database table +const = Constants() +make_dir(const.TEMP) + +try: + connection = sqlite3.connect(const.FILE_STATUS_DB) + cursor = connection.cursor() + + query = f'CREATE TABLE IF NOT EXISTS {const.FILE_STATUS_TABLE} (name TEXT, UNIQUE(name));' + + cursor.execute(query) + connection.commit() + + cursor.close() + connection.close() +except sqlite3.Error as error: + pass + +if __name__ == '__main__': + app.run_server(port='8050', debug=True) diff --git a/assets/1-global.css b/assets/1-global.css index 440aa34c..6ef584a9 100644 --- a/assets/1-global.css +++ b/assets/1-global.css @@ -1,67 +1,75 @@ -:root { - --green-top: #254b5d; - --green-top-hover: #2d6078; - --green-lighter: #254b5d; - --green-lightest: #4aa59f; - - --light-gray: #cfddd7; - --bg-gray: #fdfdfd; - --bg-gray-darker: #f2f7f7; - --bg-gray-darkest: #e8f0f0; - - --th-gray: #e1eeed; - --td-gray: #eff2f2; - --table-button-gray: #d3d3d3; - - --link-blue: #6b6bb9; - --button-red: #dc3545; -} - -html, body { - background-color: var(--bg-gray); -} - -.page-button { - width: 11em; - background-color: var(--green-lighter); - border-color: var(--green-lighter); -} - -.page-button:hover { - width: 11em; - background-color: var(--green-top-hover); - border-color: var(--green-top-hover); -} - -i { - cursor: pointer; -} - -a { - color: var(--link-blue); - text-decoration: none; -} - -a:hover { - color: var(--bs-body-color); -} - -.non-clickable { - cursor: default; -} - -.bi-chevron-bar-right { - -webkit-text-stroke: 1px; -} - -hr { - margin: 0; -} - -.left-align { - text-align: left !important; -} - -.link-muted:hover { - color: #6c757d !important; +:root { + --green-top: #254b5d; + --green-top-hover: #2d6078; + --green-lighter: #254b5d; + --green-lightest: #4aa59f; + + --light-gray: #cfddd7; + --bg-gray: #fdfdfd; + --bg-gray-darker: #f2f7f7; + --bg-gray-darkest: #e8f0f0; + + --th-gray: #e1eeed; + --td-gray: #eff2f2; + --table-button-gray: #d3d3d3; + + --link-blue: #6b6bb9; + --button-red: #dc3545; + + --button-gray-disabled: #cccccc +} + +html, body { + background-color: var(--bg-gray); +} + +.page-button { + width: 11em; + background-color: var(--green-lighter); + border-color: var(--green-lighter); +} + +.page-button:disabled { + width: 11em; + background-color: var(--button-gray-disabled); + border-color: var(--button-gray-disabled); +} + +.page-button:hover { + width: 11em; + background-color: var(--green-top-hover); + border-color: var(--green-top-hover); +} + +i { + cursor: pointer; +} + +a { + color: var(--link-blue); + text-decoration: none; +} + +a:hover { + color: var(--bs-body-color); +} + +.non-clickable { + cursor: default; +} + +.bi-chevron-bar-right { + -webkit-text-stroke: 1px; +} + +hr { + margin: 0; +} + +.left-align { + text-align: left !important; +} + +.link-muted:hover { + color: #6c757d !important; } \ No newline at end of file diff --git a/assets/2-homepage.css b/assets/2-homepage.css index bc4e02d5..4d8d88bb 100644 --- a/assets/2-homepage.css +++ b/assets/2-homepage.css @@ -1,106 +1,106 @@ -#logo { - height: 30px; -} - -#genome-ref-input-container { - background-color: var(--bg-gray-darker); -} - -#homepage-reset, #homepage-clear-cache { - background-color: white; -} - -#homepage-reset:hover, 
#homepage-clear-cache:hover { - background-color: var(--button-red); -} - -#homepage-submit { - background-color: var(--green-top); - border-color: var(--green-top); -} - -#homepage-submit:hover { - background-color: var(--green-top-hover); - border-color: var(--green-top-hover); -} - -#top-navbar { - background-color: var(--green-top) !important; -} - -.home-button { - width: 100%; -} - -#homepage-dash-nav { - border-radius: 10px; -} - -#homepage-dash-nav .nav-item:hover { - background-color: var(--bg-gray-darkest); - border-radius: 10px; -} - -.nav-link { - color: var(--bs-body-color) !important; - cursor: pointer; -} - -.nav-pills .nav-link.active { - color: white !important; - background-color: var(--green-lightest); -} - -.top-navbar-item { - color: var(--bs-nav-link-color) !important; -} - -.top-navbar-item.active { - color: white !important; -} - -#post-gwas-hdr { - margin-bottom: 1em; -} - -#genomic-interval-container { - margin-bottom: 0.6em; -} - -#genomic-interval-hdr { - display: inline; -} - -.sample-genomic-interval { - color: var(--link-blue); -} - -.sample-genomic-interval:hover { - text-decoration: underline; - cursor: pointer; -} - -#reset-analyses-container { - margin-left: 3.5em; - margin-right: 3.5em; -} - -.banner { - background-color: #214151; -} - -#demo-banner { - border-bottom: 1px solid white; -} - -#demo-banner i { - color: white !important; -} - -#demo-banner i.active { - color: white !important; -} - -#page { - margin-left: 3em; +#logo { + height: 30px; +} + +#genome-ref-input-container { + background-color: var(--bg-gray-darker); +} + +#homepage-reset, #homepage-clear-cache { + background-color: white; +} + +#homepage-reset:hover, #homepage-clear-cache:hover { + background-color: var(--button-red); +} + +#homepage-submit { + background-color: var(--green-top); + border-color: var(--green-top); +} + +#homepage-submit:hover { + background-color: var(--green-top-hover); + border-color: var(--green-top-hover); +} + +#top-navbar { + background-color: var(--green-top) !important; +} + +.home-button { + width: 100%; +} + +#homepage-dash-nav { + border-radius: 10px; +} + +#homepage-dash-nav .nav-item:hover { + background-color: var(--bg-gray-darkest); + border-radius: 10px; +} + +.nav-link { + color: var(--bs-body-color) !important; + cursor: pointer; +} + +.nav-pills .nav-link.active { + color: white !important; + background-color: var(--green-lightest); +} + +.top-navbar-item { + color: var(--bs-nav-link-color) !important; +} + +.top-navbar-item.active { + color: white !important; +} + +#post-gwas-hdr { + margin-bottom: 1em; +} + +#genomic-interval-container { + margin-bottom: 0.6em; +} + +#genomic-interval-hdr { + display: inline; +} + +.sample-genomic-interval { + color: var(--link-blue); +} + +.sample-genomic-interval:hover { + text-decoration: underline; + cursor: pointer; +} + +#reset-analyses-container { + margin-left: 3.5em; + margin-right: 3.5em; +} + +.banner { + background-color: #214151; +} + +#demo-banner { + border-bottom: 1px solid white; +} + +#demo-banner i { + color: white !important; +} + +#demo-banner i.active { + color: white !important; +} + +#page { + margin-left: 3em; } \ No newline at end of file diff --git a/assets/3-table.css b/assets/3-table.css index 2736dd36..1109eaf5 100644 --- a/assets/3-table.css +++ b/assets/3-table.css @@ -1,58 +1,58 @@ -table tr td { - vertical-align: top; -} - -table tr td p, table tr td div p { - margin-bottom: 0; - padding-bottom: 0; - display: inline; -} - -table tr td div { - text-align: right !important; -} 
- -th:not(.dash-filter) { - font-weight: bold !important; - background-color: var(--th-gray) !important; -} - -tr:nth-of-type(odd) td { - background-color: var(--td-gray) !important; -} - -th, td { - padding: 0.5em !important; - font-family: sans-serif; -} - -.page-number, .current-page { - font-family: var(--bs-body-font-family) !important; - font-size: var(--font-size-base) !important; -} - -.table-button { - border: 1px solid var(--table-button-gray); - background-color: transparent; -} - -.table-button:hover { - border: 1px solid var(--table-button-gray); - background-color: var(--td-gray); -} - -#text-mining-result-table th { - text-align: center; -} - -#text-mining-result-table table tr td div { - text-align: left !important; -} - -.dash-cell-value { - overflow-y: hidden !important; -} - -table tr td ul { - text-align: left !important; +table tr td { + vertical-align: top; +} + +table tr td p, table tr td div p { + margin-bottom: 0; + padding-bottom: 0; + display: inline; +} + +table tr td div { + text-align: right !important; +} + +th:not(.dash-filter) { + font-weight: bold !important; + background-color: var(--th-gray) !important; +} + +tr:nth-of-type(odd) td { + background-color: var(--td-gray) !important; +} + +th, td { + padding: 0.5em !important; + font-family: sans-serif; +} + +.page-number, .current-page { + font-family: var(--bs-body-font-family) !important; + font-size: var(--font-size-base) !important; +} + +.table-button { + border: 1px solid var(--table-button-gray); + background-color: transparent; +} + +.table-button:hover { + border: 1px solid var(--table-button-gray); + background-color: var(--td-gray); +} + +#text-mining-result-table th { + text-align: center; +} + +#text-mining-result-table table tr td div { + text-align: left !important; +} + +.dash-cell-value { + overflow-y: hidden !important; +} + +table tr td ul { + text-align: left !important; } \ No newline at end of file diff --git a/assets/4-input-elem.css b/assets/4-input-elem.css index 3cb90557..db7033b5 100644 --- a/assets/4-input-elem.css +++ b/assets/4-input-elem.css @@ -1,30 +1,30 @@ -.form-check-input:checked { - background-color: var(--green-lighter); - border-color: var(--green-lighter); -} - -.form-check-input:focus { - border-color: var(--light-gray); - box-shadow: 0 0 0 0.25rem var(--light-gray); -} - -.rc-slider-dot-active, .rc-slider-handle { - border-color: var(--green-lighter) !important; -} - -.rc-slider-track { - background-color: var(--green-lighter) !important; -} - -#coexpression-parameter-slider-container { - display: flex; - justify-content: center; -} - -#coexpression-parameter-slider { - width: 95%; -} - -#lift-over-overlap-table-filter { - margin-bottom: -0.5em; +.form-check-input:checked { + background-color: var(--green-lighter); + border-color: var(--green-lighter); +} + +.form-check-input:focus { + border-color: var(--light-gray); + box-shadow: 0 0 0 0.25rem var(--light-gray); +} + +.rc-slider-dot-active, .rc-slider-handle { + border-color: var(--green-lighter) !important; +} + +.rc-slider-track { + background-color: var(--green-lighter) !important; +} + +#coexpression-parameter-slider-container { + display: flex; + justify-content: center; +} + +#coexpression-parameter-slider { + width: 95%; +} + +#lift-over-overlap-table-filter { + margin-bottom: -0.5em; } \ No newline at end of file diff --git a/assets/5-graph.css b/assets/5-graph.css index 176b9da3..877a384c 100644 --- a/assets/5-graph.css +++ b/assets/5-graph.css @@ -1,6 +1,6 @@ -#coexpression-module-graph { - visibility: 
hidden; - width: 100%; - height: 100vh; - border: 1px solid var(--table-button-gray); +#coexpression-module-graph { + visibility: hidden; + width: 100%; + height: 100vh; + border: 1px solid var(--table-button-gray); } \ No newline at end of file diff --git a/assets/6-igv.css b/assets/6-igv.css index 3f2ac2dd..e5ef926d 100644 --- a/assets/6-igv.css +++ b/assets/6-igv.css @@ -1,3 +1,3 @@ -.igv-root-div, #igv-Nipponbare-local { - margin: 0 !important; +.igv-root-div, #igv-Nipponbare-local { + margin: 0 !important; } \ No newline at end of file diff --git a/assets/7-loading.css b/assets/7-loading.css index 2d897d1e..782bc77b 100644 --- a/assets/7-loading.css +++ b/assets/7-loading.css @@ -1,19 +1,19 @@ -/* This ensures that the loading affordance is displayed at the top of the container div. - Otherwise, it will be displayed at the center of the container div. */ - -#coexpression-results-module-tabs-container + div, -#coexpression-pathways + div, -#lift-over-results-table + div, -#text-mining-result-table + div, -#tf-enrichment-result-table + div, -#coexpression-module-graph + div, -#coexpression-graph-container { - display: block !important; - height: 0 !important; - margin-top: 0; - padding-top: 0; -} - -.dash-default-spinner > div { - background-color: var(--green-lighter) !important; +/* This ensures that the loading affordance is displayed at the top of the container div. + Otherwise, it will be displayed at the center of the container div. */ + +#coexpression-results-module-tabs-container + div, +#coexpression-pathways + div, +#lift-over-results-table + div, +#text-mining-result-table + div, +#tf-enrichment-result-table + div, +#coexpression-module-graph + div, +#coexpression-graph-container { + display: block !important; + height: 0 !important; + margin-top: 0; + padding-top: 0; +} + +.dash-default-spinner > div { + background-color: var(--green-lighter) !important; } \ No newline at end of file diff --git a/assets/8-tooltips.css b/assets/8-tooltips.css index eee3d505..f6e365ab 100644 --- a/assets/8-tooltips.css +++ b/assets/8-tooltips.css @@ -1,12 +1,12 @@ -.algo-desc { - display: block; - margin-bottom: 1em; -} - -.reference { - font-size: small; -} - -i[id$="-tooltip"] { - margin-left: 1em; +.algo-desc { + display: block; + margin-bottom: 1em; +} + +.reference { + font-size: small; +} + +i[id$="-tooltip"] { + margin-left: 1em; } \ No newline at end of file diff --git a/assets/9-analysis.css b/assets/9-analysis.css index ba32f9b7..1cbad7df 100644 --- a/assets/9-analysis.css +++ b/assets/9-analysis.css @@ -1,28 +1,28 @@ -.analysis-intro { - background-color: var(--bg-gray-darker); - border-radius: 10px; -} - -.analysis-intro p:last-child { - margin-bottom: 0; - padding-bottom: 0; -} - -#lift-over-results-statistics { - margin-bottom: 0; - padding-bottom: 0; -} - -#coexpression-table-stats { - text-align: left; -} - -.stats-icon { - color: #254b5d; -} - -.stats { - margin-left: 0; - padding-left: 0; - text-align: left; +.analysis-intro { + background-color: var(--bg-gray-darker); + border-radius: 10px; +} + +.analysis-intro p:last-child { + margin-bottom: 0; + padding-bottom: 0; +} + +#lift-over-results-statistics { + margin-bottom: 0; + padding-bottom: 0; +} + +#coexpression-table-stats { + text-align: left; +} + +.stats-icon { + color: #254b5d; +} + +.stats { + margin-left: 0; + padding-left: 0; + text-align: left; } \ No newline at end of file diff --git a/callbacks/branch.py b/callbacks/branch.py index e3a414ab..26a9cb80 100644 --- a/callbacks/branch.py +++ b/callbacks/branch.py @@ 
-1,25 +1,25 @@ -from pathlib import Path - - -def get_active_branch_name(): - """ - Lifted from https://stackoverflow.com/questions/26134026/how-to-get-the-current-checked-out-git-branch-name-through-pygit2 - """ - head_dir = Path(".") / ".git" / "HEAD" - with head_dir.open("r") as f: - content = f.read().splitlines() - - for line in content: - if line[0:4] == "ref:": - return line.partition("refs/heads/")[2] - - -def is_in_demo_branch(): - return get_active_branch_name() == 'demo' - - -def show_if_in_demo_branch(): - if is_in_demo_branch(): - return {'display': 'block'} - - return {'display': 'none'} +from pathlib import Path + + +def get_active_branch_name(): + """ + Lifted from https://stackoverflow.com/questions/26134026/how-to-get-the-current-checked-out-git-branch-name-through-pygit2 + """ + head_dir = Path(".") / ".git" / "HEAD" + with head_dir.open("r") as f: + content = f.read().splitlines() + + for line in content: + if line[0:4] == "ref:": + return line.partition("refs/heads/")[2] + + +def is_in_demo_branch(): + return get_active_branch_name() == 'demo' + + +def show_if_in_demo_branch(): + if is_in_demo_branch(): + return {'display': 'block'} + + return {'display': 'none'} diff --git a/callbacks/browse_loci/callbacks.py b/callbacks/browse_loci/callbacks.py index b1190e0a..c72d3f64 100644 --- a/callbacks/browse_loci/callbacks.py +++ b/callbacks/browse_loci/callbacks.py @@ -1,218 +1,218 @@ -import json -import dash_bio as dashbio - -from dash import Input, Output, State, html -from dash.exceptions import PreventUpdate -from flask import json, send_from_directory, abort -from werkzeug.exceptions import HTTPException - -from .util import * -from ..lift_over import util as lift_over_util -from ..file_util import * - -from ..constants import Constants -const = Constants() - - -def init_callback(app): - @app.callback( - Output('igv-genomic-intervals-input', 'children'), - State('homepage-genomic-intervals-submitted-input', 'data'), - Input('homepage-is-submitted', 'data'), - Input('igv-submit', 'n_clicks') - ) - def display_input(nb_intervals_str, homepage_is_submitted, *_): - if homepage_is_submitted: - if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): - return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] - else: - return None - - raise PreventUpdate - - @app.callback( - Output('igv-is-submitted', 'data', allow_duplicate=True), - Output('igv-selected-genomic-intervals-submitted-input', 'data'), - Input('igv-submit', 'n_clicks'), - State('igv-genomic-intervals', 'value'), - State('igv-track-filter', 'value'), - State('homepage-is-submitted', 'data'), - prevent_initial_call=True - ) - def submit_igv_input(igv_submit_n_clicks, selected_nb_interval, selected_tracks, homepage_is_submitted): - if homepage_is_submitted and igv_submit_n_clicks >= 1: - return True, selected_nb_interval - - raise PreventUpdate - - @app.callback( - Output('igv-results-container', 'style'), - Input('igv-is-submitted', 'data') - ) - def display_igv_output(igv_is_submitted): - if igv_is_submitted: - return {'display': 'block'} - else: - return {'display': 'none'} - - # Lifted from https://flask.palletsprojects.com/en/2.2.x/errorhandling/#:~:text=When%20an%20error%20occurs%20in,user%20when%20an%20error%20occurs. 
- @app.server.errorhandler(HTTPException) - def handle_exception(e): - """Return JSON instead of HTML for HTTP errors.""" - # start with the correct headers and status code from the error - response = e.get_response() - # replace the body with JSON - response.data = json.dumps({ - "code": e.code, - "name": e.name, - "description": e.description, - }) - response.content_type = "application/json" - return response - - @app.server.route('/genomes_nipponbare/') - def send_genomes_nipponbare_url(filename): - try: - return send_from_directory(const.GENOMES_NIPPONBARE, filename) - except FileNotFoundError: - abort(404) - - @app.server.route('/annotations_nb////') - def send_annotations_nb_url(nb_intervals_str, foldername, selected_interval_str, file_format): - try: - temp_output_folder_dir = get_path_to_temp( - nb_intervals_str, const.TEMP_IGV, foldername) - - selected_interval_str_filename = convert_text_to_path( - selected_interval_str) - - selected_interval_str_file = f'{selected_interval_str_filename}.{file_format}' - - return send_from_directory(temp_output_folder_dir, selected_interval_str_file) - - except FileNotFoundError: - abort(404) - - @app.server.route('/open_chromatin_panicle/') - def send_open_chromatin_panicle_url(filename): - try: - return send_from_directory(const.OPEN_CHROMATIN_PANICLE, filename) - - except FileNotFoundError: - abort(404) - - @app.callback( - Output('igv-genomic-intervals', 'options'), - Output('igv-genomic-intervals', 'value'), - Input('homepage-genomic-intervals-submitted-input', 'data'), - - State('homepage-is-submitted', 'data'), - State('igv-selected-genomic-intervals-saved-input', 'data') - ) - def display_selected_genomic_intervals(nb_intervals_str, homepage_is_submitted, selected_nb_interval): - if homepage_is_submitted: - igv_options = nb_intervals_str.split(';') - - if not selected_nb_interval: - selected_nb_interval = igv_options[0] - - return igv_options, selected_nb_interval - - raise PreventUpdate - - @app.callback( - Output('igv-track-intro', 'children'), - Output('igv-track-filter', 'options'), - Output('igv-track-filter', 'value'), - Input('igv-selected-genomic-intervals-submitted-input', 'data'), - State('homepage-is-submitted', 'data'), - Input('igv-selected-tracks-submitted-input', 'data'), - State('igv-is-submitted', 'data') - ) - def display_igv_tracks_filter(nb_intervals_str, homepage_is_submitted, selected_tracks, igv_is_submitted): - if homepage_is_submitted and igv_is_submitted: - tracks = ['MSU V7 genes', 'chromatin open'] - - if not selected_tracks: - selected_tracks = [tracks[0]] - - return 'Select the tracks to be displayed', \ - tracks, selected_tracks - raise PreventUpdate - - @app.callback( - Output('igv-display', 'children'), - State('igv-selected-genomic-intervals-submitted-input', 'data'), - Input('igv-selected-tracks-submitted-input', 'data'), - State('homepage-is-submitted', 'data'), - State('igv-is-submitted', 'data'), - State('homepage-genomic-intervals-submitted-input', 'data') - ) - def display_igv(selected_nb_intervals_str, selected_tracks, homepage_is_submitted, igv_is_submitted, nb_intervals_str): - if homepage_is_submitted and igv_is_submitted: - track_info = [ - { - "name": "MSU V7 genes", - "format": "gff3", - "description": " Rice Genome Annotation Project", - "url": f"annotations_nb/{nb_intervals_str}/IRGSPMSU.gff.db/{selected_nb_intervals_str}/gff", - "displayMode": "EXPANDED", - "height": 200 - }, - { - "name": "chromatin open", - "format": "bed", - "description": " Rice Genome Annotation Project", - "url": 
f"open_chromatin_panicle/SRR7126116_ATAC-Seq_Panicles.bed", - "displayMode": "EXPANDED", - "height": 200 - } - ] - - display_tracks = [ - track for track in track_info if selected_tracks and track['name'] in selected_tracks] - - return html.Div([ - dashbio.Igv( - id='igv-Nipponbare-local', - reference={ - "id": "GCF_001433935.1", - "name": "O. sativa IRGSP-1.0 (GCF_001433935.1)", - "fastaURL": "genomes_nipponbare/Npb.fasta", - "indexURL": "genomes_nipponbare/Npb.fasta.fai", - "tracks": display_tracks - }, - locus=[selected_nb_intervals_str] - ) - ]) - - raise PreventUpdate - - @app.callback( - Output('igv-selected-genomic-intervals-saved-input', - 'data', allow_duplicate=True), - Input('igv-genomic-intervals', 'value'), - State('homepage-is-submitted', 'data'), - Input('igv-track-filter', 'value'), - - prevent_initial_call=True - ) - def set_input_igv_session_state(selected_nb_intervals_str, homepage_is_submitted, *_): - if homepage_is_submitted: - return selected_nb_intervals_str - - raise PreventUpdate - - @app.callback( - Output('igv-selected-tracks-submitted-input', - 'data', allow_duplicate=True), - Input('igv-track-filter', 'value'), - State('homepage-is-submitted', 'data'), - State('igv-is-submitted', 'data'), - prevent_initial_call=True - ) - def set_submitted_igv_session_state(selected_tracks, homepage_is_submitted, igv_is_submitted): - if homepage_is_submitted and igv_is_submitted: - return selected_tracks - - raise PreventUpdate +import json +import dash_bio as dashbio + +from dash import Input, Output, State, html +from dash.exceptions import PreventUpdate +from flask import json, send_from_directory, abort +from werkzeug.exceptions import HTTPException + +from .util import * +from ..lift_over import util as lift_over_util +from ..file_util import * + +from ..constants import Constants +const = Constants() + + +def init_callback(app): + @app.callback( + Output('igv-genomic-intervals-input', 'children'), + State('homepage-genomic-intervals-submitted-input', 'data'), + Input('homepage-is-submitted', 'data'), + Input('igv-submit', 'n_clicks') + ) + def display_input(nb_intervals_str, homepage_is_submitted, *_): + if homepage_is_submitted: + if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): + return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] + else: + return None + + raise PreventUpdate + + @app.callback( + Output('igv-is-submitted', 'data', allow_duplicate=True), + Output('igv-selected-genomic-intervals-submitted-input', 'data'), + Input('igv-submit', 'n_clicks'), + State('igv-genomic-intervals', 'value'), + State('igv-track-filter', 'value'), + State('homepage-is-submitted', 'data'), + prevent_initial_call=True + ) + def submit_igv_input(igv_submit_n_clicks, selected_nb_interval, selected_tracks, homepage_is_submitted): + if homepage_is_submitted and igv_submit_n_clicks >= 1: + return True, selected_nb_interval + + raise PreventUpdate + + @app.callback( + Output('igv-results-container', 'style'), + Input('igv-is-submitted', 'data') + ) + def display_igv_output(igv_is_submitted): + if igv_is_submitted: + return {'display': 'block'} + else: + return {'display': 'none'} + + # Lifted from https://flask.palletsprojects.com/en/2.2.x/errorhandling/#:~:text=When%20an%20error%20occurs%20in,user%20when%20an%20error%20occurs. 
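# ---------------------------------------------------------------------------
# [Editor's illustration, not part of the patch] The handler registered below
# converts any Werkzeug HTTPException raised by these Flask routes (e.g., the
# abort(404) calls further down) into a JSON body instead of Flask's default
# HTML error page. Assuming the app is served locally (the host and port here
# are hypothetical), a client requesting a missing file would see roughly:
#
#     >>> import requests
#     >>> r = requests.get('http://localhost:8050/genomes_nipponbare/missing.fasta')
#     >>> r.status_code
#     404
#     >>> r.json()
#     {'code': 404, 'name': 'Not Found', 'description': '...'}
# ---------------------------------------------------------------------------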
+ @app.server.errorhandler(HTTPException) + def handle_exception(e): + """Return JSON instead of HTML for HTTP errors.""" + # start with the correct headers and status code from the error + response = e.get_response() + # replace the body with JSON + response.data = json.dumps({ + "code": e.code, + "name": e.name, + "description": e.description, + }) + response.content_type = "application/json" + return response + + @app.server.route('/genomes_nipponbare/') + def send_genomes_nipponbare_url(filename): + try: + return send_from_directory(const.GENOMES_NIPPONBARE, filename) + except FileNotFoundError: + abort(404) + + @app.server.route('/annotations_nb////') + def send_annotations_nb_url(nb_intervals_str, foldername, selected_interval_str, file_format): + try: + temp_output_folder_dir = get_path_to_temp( + nb_intervals_str, const.TEMP_IGV, foldername) + + selected_interval_str_filename = convert_text_to_path( + selected_interval_str) + + selected_interval_str_file = f'{selected_interval_str_filename}.{file_format}' + + return send_from_directory(temp_output_folder_dir, selected_interval_str_file) + + except FileNotFoundError: + abort(404) + + @app.server.route('/open_chromatin_panicle/') + def send_open_chromatin_panicle_url(filename): + try: + return send_from_directory(const.OPEN_CHROMATIN_PANICLE, filename) + + except FileNotFoundError: + abort(404) + + @app.callback( + Output('igv-genomic-intervals', 'options'), + Output('igv-genomic-intervals', 'value'), + Input('homepage-genomic-intervals-submitted-input', 'data'), + + State('homepage-is-submitted', 'data'), + State('igv-selected-genomic-intervals-saved-input', 'data') + ) + def display_selected_genomic_intervals(nb_intervals_str, homepage_is_submitted, selected_nb_interval): + if homepage_is_submitted: + igv_options = nb_intervals_str.split(';') + + if not selected_nb_interval: + selected_nb_interval = igv_options[0] + + return igv_options, selected_nb_interval + + raise PreventUpdate + + @app.callback( + Output('igv-track-intro', 'children'), + Output('igv-track-filter', 'options'), + Output('igv-track-filter', 'value'), + Input('igv-selected-genomic-intervals-submitted-input', 'data'), + State('homepage-is-submitted', 'data'), + Input('igv-selected-tracks-submitted-input', 'data'), + State('igv-is-submitted', 'data') + ) + def display_igv_tracks_filter(nb_intervals_str, homepage_is_submitted, selected_tracks, igv_is_submitted): + if homepage_is_submitted and igv_is_submitted: + tracks = ['MSU V7 genes', 'chromatin open'] + + if not selected_tracks: + selected_tracks = [tracks[0]] + + return 'Select the tracks to be displayed', \ + tracks, selected_tracks + raise PreventUpdate + + @app.callback( + Output('igv-display', 'children'), + State('igv-selected-genomic-intervals-submitted-input', 'data'), + Input('igv-selected-tracks-submitted-input', 'data'), + State('homepage-is-submitted', 'data'), + State('igv-is-submitted', 'data'), + State('homepage-genomic-intervals-submitted-input', 'data') + ) + def display_igv(selected_nb_intervals_str, selected_tracks, homepage_is_submitted, igv_is_submitted, nb_intervals_str): + if homepage_is_submitted and igv_is_submitted: + track_info = [ + { + "name": "MSU V7 genes", + "format": "gff3", + "description": " Rice Genome Annotation Project", + "url": f"annotations_nb/{nb_intervals_str}/IRGSPMSU.gff.db/{selected_nb_intervals_str}/gff", + "displayMode": "EXPANDED", + "height": 200 + }, + { + "name": "chromatin open", + "format": "bed", + "description": " Rice Genome Annotation Project", + "url": 
f"open_chromatin_panicle/SRR7126116_ATAC-Seq_Panicles.bed", + "displayMode": "EXPANDED", + "height": 200 + } + ] + + display_tracks = [ + track for track in track_info if selected_tracks and track['name'] in selected_tracks] + + return html.Div([ + dashbio.Igv( + id='igv-Nipponbare-local', + reference={ + "id": "GCF_001433935.1", + "name": "O. sativa IRGSP-1.0 (GCF_001433935.1)", + "fastaURL": "genomes_nipponbare/Npb.fasta", + "indexURL": "genomes_nipponbare/Npb.fasta.fai", + "tracks": display_tracks + }, + locus=[selected_nb_intervals_str] + ) + ]) + + raise PreventUpdate + + @app.callback( + Output('igv-selected-genomic-intervals-saved-input', + 'data', allow_duplicate=True), + Input('igv-genomic-intervals', 'value'), + State('homepage-is-submitted', 'data'), + Input('igv-track-filter', 'value'), + + prevent_initial_call=True + ) + def set_input_igv_session_state(selected_nb_intervals_str, homepage_is_submitted, *_): + if homepage_is_submitted: + return selected_nb_intervals_str + + raise PreventUpdate + + @app.callback( + Output('igv-selected-tracks-submitted-input', + 'data', allow_duplicate=True), + Input('igv-track-filter', 'value'), + State('homepage-is-submitted', 'data'), + State('igv-is-submitted', 'data'), + prevent_initial_call=True + ) + def set_submitted_igv_session_state(selected_tracks, homepage_is_submitted, igv_is_submitted): + if homepage_is_submitted and igv_is_submitted: + return selected_tracks + + raise PreventUpdate diff --git a/callbacks/browse_loci/util.py b/callbacks/browse_loci/util.py index 10c1a661..729d5742 100644 --- a/callbacks/browse_loci/util.py +++ b/callbacks/browse_loci/util.py @@ -1,59 +1,59 @@ -from ..lift_over import util -import gffutils -import pandas as pd -import os -from ..file_util import * -from ..constants import Constants - -const = Constants() - - -def write_igv_tracks_to_file(nb_intervals_str): - # tracks found in igv - track_db = [[const.ANNOTATIONS_NB, 'IRGSPMSU.gff.db', 'gff'], - [const.OPEN_CHROMATIN_PANICLE, 'SRR7126116_ATAC-Seq_Panicles.bed', 'bed']] - - # write to file the data for igv - for db in track_db: - file_ext = db[2] - - if file_ext == 'gff': - source_dir = f'{db[0]}/{db[1]}' - source_file = db[1] - - write_gff_igv_track_to_file( - source_dir, source_file, nb_intervals_str) - - -def write_gff_igv_track_to_file(source_dir, source_file, nb_intervals_str): - if path_exists(source_dir): - loci_list = nb_intervals_str.split(';') - genomic_interval_list = util.get_genomic_intervals_from_input( - nb_intervals_str) - - temp_folder = get_path_to_temp( - nb_intervals_str, const.TEMP_IGV, source_file) - make_dir(temp_folder) - - for i in range(len(loci_list)): - cur_loci = loci_list[i] - - dest_file = f'{convert_text_to_path(cur_loci)}.gff' - dest_dir = f'{temp_folder}/{dest_file}' - - if not path_exists(dest_dir): - genes_in_interval = get_loci_data_in_gff_file( - source_dir, genomic_interval_list[i]) - - with open(dest_dir, 'w') as fp: - for line in genes_in_interval: - fp.write('%s\n' % line) - - -def get_loci_data_in_gff_file(source_dir, nb_interval): - db = gffutils.FeatureDB(f'{source_dir}', keep_order=True) - - genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop), - completely_within=False, featuretype='gene')) - - return genes_in_interval +from ..lift_over import util +import gffutils +import pandas as pd +import os +from ..file_util import * +from ..constants import Constants + +const = Constants() + + +def write_igv_tracks_to_file(nb_intervals_str): + # tracks found in igv + 
track_db = [[const.ANNOTATIONS_NB, 'IRGSPMSU.gff.db', 'gff'], + [const.OPEN_CHROMATIN_PANICLE, 'SRR7126116_ATAC-Seq_Panicles.bed', 'bed']] + + # write to file the data for igv + for db in track_db: + file_ext = db[2] + + if file_ext == 'gff': + source_dir = f'{db[0]}/{db[1]}' + source_file = db[1] + + write_gff_igv_track_to_file( + source_dir, source_file, nb_intervals_str) + + +def write_gff_igv_track_to_file(source_dir, source_file, nb_intervals_str): + if path_exists(source_dir): + loci_list = nb_intervals_str.split(';') + genomic_interval_list = util.get_genomic_intervals_from_input( + nb_intervals_str) + + temp_folder = get_path_to_temp( + nb_intervals_str, const.TEMP_IGV, source_file) + make_dir(temp_folder) + + for i in range(len(loci_list)): + cur_loci = loci_list[i] + + dest_file = f'{convert_text_to_path(cur_loci)}.gff' + dest_dir = f'{temp_folder}/{dest_file}' + + if not path_exists(dest_dir): + genes_in_interval = get_loci_data_in_gff_file( + source_dir, genomic_interval_list[i]) + + with open(dest_dir, 'w') as fp: + for line in genes_in_interval: + fp.write('%s\n' % line) + + +def get_loci_data_in_gff_file(source_dir, nb_interval): + db = gffutils.FeatureDB(f'{source_dir}', keep_order=True) + + genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop), + completely_within=False, featuretype='gene')) + + return genes_in_interval diff --git a/callbacks/coexpression/callbacks.py b/callbacks/coexpression/callbacks.py index d4e84740..44770564 100644 --- a/callbacks/coexpression/callbacks.py +++ b/callbacks/coexpression/callbacks.py @@ -1,515 +1,515 @@ -from dash import Input, Output, State, html, dcc -from dash.exceptions import PreventUpdate -from collections import namedtuple - -from .util import * -from ..lift_over import util as lift_over_util -from ..branch import * - -Input_parameter_module = namedtuple('Input_parameter_module', [ - 'param_slider_marks', 'param_slider_value']) - -Submitted_parameter_module = namedtuple('Submitted_parameter_module', [ - 'param_slider_marks', 'param_slider_value', 'param_module', 'layout', 'pathway_active_tab']) - - -def init_callback(app): - @app.callback( - Output('coexpression-genomic-intervals-input', 'children'), - State('homepage-genomic-intervals-submitted-input', 'data'), - Input('homepage-is-submitted', 'data'), - Input('coexpression-submit', 'n_clicks') - ) - def display_input(nb_intervals_str, homepage_is_submitted, *_): - if homepage_is_submitted: - if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): - return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] - - return None - - raise PreventUpdate - - @app.callback( - Output('coexpression-is-submitted', 'data', allow_duplicate=True), - Output('coexpression-submitted-addl-genes', - 'data', allow_duplicate=True), - Output('coexpression-combined-genes', - 'data', allow_duplicate=True), - - Output('coexpression-submitted-network', - 'data', allow_duplicate=True), - Output('coexpression-submitted-clustering-algo', - 'data', allow_duplicate=True), - Output('coexpression-submitted-parameter-module', - 'data', allow_duplicate=True), - - Input('coexpression-submit', 'n_clicks'), - State('homepage-is-submitted', 'data'), - - State('lift-over-nb-table', 'data'), - - State('coexpression-addl-genes', 'value'), - State('coexpression-network', 'value'), - State('coexpression-clustering-algo', 'value'), - State('coexpression-parameter-slider', 'marks'), - 
State('coexpression-parameter-slider', 'value'), - prevent_initial_call=True - ) - def submit_coexpression_input(coexpression_submit_n_clicks, homepage_is_submitted, - implicated_gene_ids, submitted_addl_genes, - submitted_network, submitted_algo, submitted_slider_marks, submitted_slider_value): - if homepage_is_submitted and coexpression_submit_n_clicks >= 1: - paramater_module_value = Submitted_parameter_module( - submitted_slider_marks, submitted_slider_value, '', 'circle', 'tab-0')._asdict() - - submitted_parameter_module = { - submitted_algo: paramater_module_value} - - if submitted_addl_genes: - submitted_addl_genes = submitted_addl_genes.strip() - else: - submitted_addl_genes = '' - - list_addl_genes = list( - filter(None, [gene.strip() for gene in submitted_addl_genes.split(';')])) - - gene_ids = list(set.union( - set(implicated_gene_ids), set(list_addl_genes))) - - return True, submitted_addl_genes, gene_ids, submitted_network, submitted_algo, submitted_parameter_module - - raise PreventUpdate - - @app.callback( - Output('coexpression-results-container', 'style'), - Input('coexpression-is-submitted', 'data'), - ) - def display_coexpression_output(coexpression_is_submitted): - if coexpression_is_submitted: - return {'display': 'block'} - - else: - return {'display': 'none'} - - @app.callback( - Output('coexpression-parameter-slider', 'marks'), - Output('coexpression-parameter-slider', 'value'), - Input('coexpression-clustering-algo', 'value'), - State('coexpression-parameter-module-saved-input', 'data') - ) - def set_parameter_slider(algo, parameter_module): - if parameter_module and algo in parameter_module: - return parameter_module[algo]['param_slider_marks'], parameter_module[algo]['param_slider_value'] - - return get_parameters_for_algo(algo), module_detection_algos[algo].default_param * module_detection_algos[algo].multiplier - - @app.callback( - Output('coexpression-module-graph', 'elements'), - Output('coexpression-module-graph', 'layout'), - Output('coexpression-module-graph', 'style', allow_duplicate=True), - Output('coexpression-graph-container', 'style'), - - Input('coexpression-combined-genes', 'data'), - - Input('coexpression-submitted-network', 'data'), - Input('coexpression-submitted-clustering-algo', 'data'), - State('coexpression-is-submitted', 'data'), - State('coexpression-submitted-parameter-module', 'data'), - - prevent_initial_call=True - ) - def hide_table_graph(combined_gene_ids, submitted_network, submitted_algo, coexpression_is_submitted, submitted_parameter_module): - if coexpression_is_submitted: - if submitted_algo and submitted_algo in submitted_parameter_module: - parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] - layout = submitted_parameter_module[submitted_algo]['layout'] - - return load_module_graph( - combined_gene_ids, None, submitted_network, submitted_algo, parameters, layout) + ({'visibility': 'hidden'}, ) - - raise PreventUpdate - - @app.callback( - Output('coexpression-table-container', 'style', allow_duplicate=True), - Input('coexpression-submit', 'n_clicks'), - - prevent_initial_call=True - ) - def hide_table(*_): - return {'visibility': 'hidden'} - - @app.callback( - Output('coexpression-module-graph', 'style', allow_duplicate=True), - Input('coexpression-modules', 'value'), - - prevent_initial_call=True - ) - def hide_graph(*_): - return {'visibility': 'hidden'} - - @app.callback( - Output('coexpression-modules', 'options'), - Output('coexpression-modules', 'value'), - 
Output('coexpression-results-module-tabs-container', 'style'), - Output('coexpression-module-stats', 'children'), - - State('homepage-genomic-intervals-submitted-input', 'data'), - - Input('coexpression-combined-genes', 'data'), - Input('coexpression-submitted-addl-genes', 'data'), - - Input('coexpression-submitted-network', 'data'), - Input('coexpression-submitted-clustering-algo', 'data'), - State('homepage-is-submitted', 'data'), - State('coexpression-submitted-parameter-module', 'data'), - State('coexpression-is-submitted', 'data') - ) - def perform_module_enrichment(genomic_intervals, combined_gene_ids, submitted_addl_genes, - submitted_network, submitted_algo, homepage_is_submitted, submitted_parameter_module, coexpression_is_submitted): - if homepage_is_submitted: - if coexpression_is_submitted: - if submitted_algo and submitted_algo in submitted_parameter_module: - parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] - - enriched_modules = do_module_enrichment_analysis( - combined_gene_ids, genomic_intervals, submitted_addl_genes, submitted_network, submitted_algo, parameters) - - # Display statistics - num_enriched_modules = len(enriched_modules) - total_num_modules = count_modules( - submitted_network, submitted_algo, parameters) - stats = f'{num_enriched_modules} out of {total_num_modules} ' - if total_num_modules == 1: - stats += 'module ' - else: - stats += 'modules ' - - if num_enriched_modules == 1: - stats += 'was found to be enriched (adjusted p-value < 0.05).' - else: - stats += 'were found to be enriched (adjusted p-value < 0.05).' - - first_module = None - if enriched_modules: - first_module = enriched_modules[0] - else: - return enriched_modules, first_module, {'display': 'none'}, stats - - if submitted_parameter_module and submitted_algo in submitted_parameter_module: - if submitted_parameter_module[submitted_algo]['param_module']: - first_module = submitted_parameter_module[submitted_algo]['param_module'] - - return enriched_modules, first_module, {'display': 'block'}, stats - - raise PreventUpdate - - @app.callback( - Output('coexpression-pathways', 'data'), - Output('coexpression-pathways', 'columns'), - Output('coexpression-graph-stats', 'children'), - Output('coexpression-table-stats', 'children'), - - Output('coexpression-table-container', 'style'), - - Input('coexpression-combined-genes', 'data'), - Input('coexpression-submitted-network', 'data'), - Input('coexpression-submitted-clustering-algo', 'data'), - Input('coexpression-modules-pathway', 'active_tab'), - Input('coexpression-modules', 'value'), - State('coexpression-submitted-parameter-module', 'data'), - State('coexpression-is-submitted', 'data') - ) - def display_pathways(combined_gene_ids, - submitted_network, submitted_algo, active_tab, module, submitted_parameter_module, coexpression_is_submitted): - if coexpression_is_submitted: - if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: - parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] - - try: - module_idx = module.split(' ')[1] - table, _ = convert_to_df( - active_tab, module_idx, submitted_network, submitted_algo, parameters) - except Exception: - table, _ = convert_to_df( - active_tab, None, submitted_network, submitted_algo, parameters) - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in table.columns] - - num_enriched = get_num_unique_entries(table, 'ID') - if num_enriched == 1: - stats = f'This module is enriched in {num_enriched} 
{get_noun_for_active_tab(active_tab).singular}.' - else: - stats = f'This module is enriched in {num_enriched} {get_noun_for_active_tab(active_tab).plural}.' - - graph_stats = 'The selected module has ' - try: - total_num_genes, num_combined_gene_ids = count_genes_in_module( - combined_gene_ids, int(module_idx), submitted_network, submitted_algo, parameters) - except UnboundLocalError: - total_num_genes, num_combined_gene_ids = 0, 0 - - if total_num_genes == 1: - graph_stats += f'{total_num_genes} gene, of which {num_combined_gene_ids} ' - else: - graph_stats += f'{total_num_genes} genes, of which {num_combined_gene_ids} ' - - if num_combined_gene_ids == 1: - graph_stats += 'is implicated by your GWAS/QTL or part of the gene list you manually entered.' - else: - graph_stats += 'are implicated by your GWAS/QTL or part of the gene list you manually entered.' - - if total_num_genes == 0: - return table.to_dict('records'), columns, graph_stats, stats, {'display': 'none'} - else: - return table.to_dict('records'), columns, graph_stats, stats, {'visibility': 'visible'} - - raise PreventUpdate - - @app.callback( - Output('coexpression-module-graph', 'elements', allow_duplicate=True), - Output('coexpression-module-graph', 'layout', allow_duplicate=True), - Output('coexpression-module-graph', 'style', allow_duplicate=True), - Output('coexpression-graph-container', 'style', allow_duplicate=True), - Output('coexpression-extra-bottom-div', 'style', allow_duplicate=True), - - Input('coexpression-combined-genes', 'data'), - Input('coexpression-modules', 'value'), - - State('coexpression-submitted-network', 'data'), - State('coexpression-submitted-clustering-algo', 'data'), - State('coexpression-submitted-parameter-module', 'data'), - - Input('coexpression-graph-layout', 'value'), - State('coexpression-is-submitted', 'data'), - - State('coexpression-modules', 'options'), - - Input('coexpression-reset-graph', 'n_clicks'), - - prevent_initial_call=True - ) - def display_table_graph(combined_gene_ids, module, submitted_network, submitted_algo, submitted_parameter_module, - layout, coexpression_is_submitted, modules, *_): - if coexpression_is_submitted: - if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: - parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] - - if not modules: - module_graph = load_module_graph( - combined_gene_ids, None, submitted_network, submitted_algo, parameters, layout) - else: - module_graph = load_module_graph( - combined_gene_ids, module, submitted_network, submitted_algo, parameters, layout) - - # No enriched modules - if not modules: - return module_graph + ({'display': 'none'}, {'height': '0em'}) - - return module_graph + ({'visibility': 'visible', 'width': '100%', - 'height': '100vh'}, {'height': '1.5em'}) - - raise PreventUpdate - - @app.callback( - Output('coexpression-addl-genes-saved-input', - 'data', allow_duplicate=True), - Output('coexpression-network-saved-input', - 'data', allow_duplicate=True), - Output('coexpression-clustering-algo-saved-input', - 'data', allow_duplicate=True), - Output('coexpression-parameter-module-saved-input', - 'data', allow_duplicate=True), - - State('coexpression-addl-genes', 'value'), - Input('coexpression-network', 'value'), - Input('coexpression-clustering-algo', 'value'), - Input('coexpression-parameter-slider', 'value'), - State('coexpression-parameter-slider', 'marks'), - State('homepage-is-submitted', 'data'), - State('coexpression-parameter-module-saved-input', 'data'), - 
prevent_initial_call='True' - ) - def set_input_coexpression_session_state(addl_genes, network, algo, parameter_value, parameter_mark, homepage_is_submitted, input_parameter_module): - if homepage_is_submitted: - input_paramater_module_value = Input_parameter_module( - parameter_mark, parameter_value)._asdict() - - if input_parameter_module: - input_parameter_module[algo] = input_paramater_module_value - - else: - input_parameter_module = {algo: input_paramater_module_value} - - return addl_genes, network, algo, input_parameter_module - - raise PreventUpdate - - @app.callback( - Output('coexpression-submitted-parameter-module', - 'data', allow_duplicate=True), - - Input('coexpression-modules', 'value'), - Input('coexpression-graph-layout', 'value'), - Input('coexpression-modules-pathway', 'active_tab'), - - State('coexpression-submitted-network', 'data'), - State('coexpression-submitted-clustering-algo', 'data'), - State('homepage-is-submitted', 'data'), - State('coexpression-submitted-parameter-module', 'data'), - prevent_initial_call=True - ) - def set_submitted_coexpression_session_state(module, layout, active_tab, submitted_network, submitted_algo, homepage_is_submitted, submitted_parameter_module): - if homepage_is_submitted: - if submitted_network and submitted_parameter_module and submitted_algo in submitted_parameter_module: - submitted_parameter_module[submitted_algo]['param_module'] = module - submitted_parameter_module[submitted_algo]['layout'] = layout - submitted_parameter_module[submitted_algo]['pathway_active_tab'] = active_tab - - return submitted_parameter_module - - raise PreventUpdate - - @app.callback( - Output('coexpression-addl-genes', 'value'), - - State('homepage-is-submitted', 'data'), - State('coexpression-addl-genes-saved-input', 'data'), - - Input('homepage-genomic-intervals-submitted-input', 'data') - ) - def display_submitted_addl_genes(homepage_is_submitted, addl_genes, *_): - if homepage_is_submitted: - if not addl_genes: - return '' - - return addl_genes - - raise PreventUpdate - - @app.callback( - Output('coexpression-network', 'value'), - - State('homepage-is-submitted', 'data'), - State('coexpression-network-saved-input', 'data'), - - Input('homepage-genomic-intervals-submitted-input', 'data') - ) - def display_selected_coexpression_network(homepage_is_submitted, network, *_): - if homepage_is_submitted: - if not network: - return 'OS-CX' - - return network - - raise PreventUpdate - - @app.callback( - Output('coexpression-clustering-algo', 'value'), - - State('homepage-is-submitted', 'data'), - State('coexpression-clustering-algo-saved-input', 'data'), - - Input('homepage-genomic-intervals-submitted-input', 'data') - ) - def get_selected_clustering_algo(homepage_is_submitted, algo, *_): - if homepage_is_submitted: - if not algo: - return 'clusterone' - - return algo - - raise PreventUpdate - - @app.callback( - Output('coexpression-graph-layout', 'value'), - Output('coexpression-modules-pathway', 'active_tab'), - - Input('coexpression-submitted-network', 'data'), - Input('coexpression-submitted-clustering-algo', 'data'), - State('coexpression-is-submitted', 'data'), - State('coexpression-submitted-parameter-module', 'data') - ) - def display_selected_graph_layout(submitted_network, submitted_algo, coexpression_is_submitted, submitted_parameter_module): - if coexpression_is_submitted: - if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: - layout = 'circle' - if submitted_parameter_module[submitted_algo]['layout']: - 
layout = submitted_parameter_module[submitted_algo]['layout'] - - active_tab = 'tab-0' - if submitted_parameter_module[submitted_algo]['pathway_active_tab']: - active_tab = submitted_parameter_module[submitted_algo]['pathway_active_tab'] - - return layout, active_tab - - raise PreventUpdate - - @app.callback( - Output('coexpression-input', 'children'), - Input('coexpression-is-submitted', 'data'), - State('coexpression-addl-genes', 'value'), - State('coexpression-network', 'value'), - State('coexpression-clustering-algo', 'value'), - State('coexpression-parameter-slider', 'value') - ) - def display_coexpression_submitted_input(coexpression_is_submitted, genes, network, algo, parameters): - if coexpression_is_submitted: - if not genes: - genes = 'None' - else: - genes = '; '.join( - list(filter(None, [gene.strip() for gene in genes.split(';')]))) - - return [html.B('Additional Genes: '), genes, - html.Br(), - html.B('Selected Co-Expression Network: '), get_user_facing_network( - network), - html.Br(), - html.B('Selected Module Detection Algorithm: '), get_user_facing_algo( - algo), - html.Br(), - html.B('Selected Algorithm Parameter: '), get_user_facing_parameter(algo, parameters)] - - raise PreventUpdate - - @app.callback( - Output('coexpression-clustering-algo-modal', 'is_open'), - Input('coexpression-clustering-algo-tooltip', 'n_clicks') - ) - def open_modals(tooltip_n_clicks): - if tooltip_n_clicks > 0: - return True - - @app.callback( - Output('coexpression-pathways', 'filter_query'), - Input('coexpression-modules-pathway', 'active_tab'), - Input('coexpression-reset-table', 'n_clicks') - ) - def reset_table_filters(*_): - return '' - - @app.callback( - Output('coexpression-download-df-to-csv', 'data'), - Input('coexpression-export-table', 'n_clicks'), - State('coexpression-pathways', 'data'), - State('homepage-genomic-intervals-submitted-input', 'data') - ) - def download_coexpression_table_to_csv(download_n_clicks, coexpression_df, genomic_intervals): - if download_n_clicks >= 1: - df = pd.DataFrame(coexpression_df) - return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Co-Expression Network Analysis Table.csv', index=False) - - raise PreventUpdate - - @app.callback( - Output('coexpression-download-graph-to-json', 'data'), - Input('coexpression-export-graph', 'n_clicks'), - State('coexpression-module-graph', 'elements'), - State('homepage-genomic-intervals-submitted-input', 'data') - ) - def download_coexpression_table_to_csv(download_n_clicks, coexpression_dict, genomic_intervals): - if download_n_clicks >= 1: - return dict(content='Hello world!', filename=f'[{genomic_intervals}] Co-Expression Network Analysis Graph.txt') - - raise PreventUpdate +from dash import Input, Output, State, html, dcc +from dash.exceptions import PreventUpdate +from collections import namedtuple + +from .util import * +from ..lift_over import util as lift_over_util +from ..branch import * + +Input_parameter_module = namedtuple('Input_parameter_module', [ + 'param_slider_marks', 'param_slider_value']) + +Submitted_parameter_module = namedtuple('Submitted_parameter_module', [ + 'param_slider_marks', 'param_slider_value', 'param_module', 'layout', 'pathway_active_tab']) + + +def init_callback(app): + @app.callback( + Output('coexpression-genomic-intervals-input', 'children'), + State('homepage-genomic-intervals-submitted-input', 'data'), + Input('homepage-is-submitted', 'data'), + Input('coexpression-submit', 'n_clicks') + ) + def display_input(nb_intervals_str, homepage_is_submitted, *_): + if 
homepage_is_submitted: + if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): + return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] + + return None + + raise PreventUpdate + + @app.callback( + Output('coexpression-is-submitted', 'data', allow_duplicate=True), + Output('coexpression-submitted-addl-genes', + 'data', allow_duplicate=True), + Output('coexpression-combined-genes', + 'data', allow_duplicate=True), + + Output('coexpression-submitted-network', + 'data', allow_duplicate=True), + Output('coexpression-submitted-clustering-algo', + 'data', allow_duplicate=True), + Output('coexpression-submitted-parameter-module', + 'data', allow_duplicate=True), + + Input('coexpression-submit', 'n_clicks'), + State('homepage-is-submitted', 'data'), + + State('lift-over-nb-table', 'data'), + + State('coexpression-addl-genes', 'value'), + State('coexpression-network', 'value'), + State('coexpression-clustering-algo', 'value'), + State('coexpression-parameter-slider', 'marks'), + State('coexpression-parameter-slider', 'value'), + prevent_initial_call=True + ) + def submit_coexpression_input(coexpression_submit_n_clicks, homepage_is_submitted, + implicated_gene_ids, submitted_addl_genes, + submitted_network, submitted_algo, submitted_slider_marks, submitted_slider_value): + if homepage_is_submitted and coexpression_submit_n_clicks >= 1: + paramater_module_value = Submitted_parameter_module( + submitted_slider_marks, submitted_slider_value, '', 'circle', 'tab-0')._asdict() + + submitted_parameter_module = { + submitted_algo: paramater_module_value} + + if submitted_addl_genes: + submitted_addl_genes = submitted_addl_genes.strip() + else: + submitted_addl_genes = '' + + list_addl_genes = list( + filter(None, [gene.strip() for gene in submitted_addl_genes.split(';')])) + + gene_ids = list(set.union( + set(implicated_gene_ids), set(list_addl_genes))) + + return True, submitted_addl_genes, gene_ids, submitted_network, submitted_algo, submitted_parameter_module + + raise PreventUpdate + + @app.callback( + Output('coexpression-results-container', 'style'), + Input('coexpression-is-submitted', 'data'), + ) + def display_coexpression_output(coexpression_is_submitted): + if coexpression_is_submitted: + return {'display': 'block'} + + else: + return {'display': 'none'} + + @app.callback( + Output('coexpression-parameter-slider', 'marks'), + Output('coexpression-parameter-slider', 'value'), + Input('coexpression-clustering-algo', 'value'), + State('coexpression-parameter-module-saved-input', 'data') + ) + def set_parameter_slider(algo, parameter_module): + if parameter_module and algo in parameter_module: + return parameter_module[algo]['param_slider_marks'], parameter_module[algo]['param_slider_value'] + + return get_parameters_for_algo(algo), module_detection_algos[algo].default_param * module_detection_algos[algo].multiplier + + @app.callback( + Output('coexpression-module-graph', 'elements'), + Output('coexpression-module-graph', 'layout'), + Output('coexpression-module-graph', 'style', allow_duplicate=True), + Output('coexpression-graph-container', 'style'), + + Input('coexpression-combined-genes', 'data'), + + Input('coexpression-submitted-network', 'data'), + Input('coexpression-submitted-clustering-algo', 'data'), + State('coexpression-is-submitted', 'data'), + State('coexpression-submitted-parameter-module', 'data'), + + prevent_initial_call=True + ) + def hide_table_graph(combined_gene_ids, submitted_network, 
submitted_algo, coexpression_is_submitted, submitted_parameter_module): + if coexpression_is_submitted: + if submitted_algo and submitted_algo in submitted_parameter_module: + parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] + layout = submitted_parameter_module[submitted_algo]['layout'] + + return load_module_graph( + combined_gene_ids, None, submitted_network, submitted_algo, parameters, layout) + ({'visibility': 'hidden'}, ) + + raise PreventUpdate + + @app.callback( + Output('coexpression-table-container', 'style', allow_duplicate=True), + Input('coexpression-submit', 'n_clicks'), + + prevent_initial_call=True + ) + def hide_table(*_): + return {'visibility': 'hidden'} + + @app.callback( + Output('coexpression-module-graph', 'style', allow_duplicate=True), + Input('coexpression-modules', 'value'), + + prevent_initial_call=True + ) + def hide_graph(*_): + return {'visibility': 'hidden'} + + @app.callback( + Output('coexpression-modules', 'options'), + Output('coexpression-modules', 'value'), + Output('coexpression-results-module-tabs-container', 'style'), + Output('coexpression-module-stats', 'children'), + + State('homepage-genomic-intervals-submitted-input', 'data'), + + Input('coexpression-combined-genes', 'data'), + Input('coexpression-submitted-addl-genes', 'data'), + + Input('coexpression-submitted-network', 'data'), + Input('coexpression-submitted-clustering-algo', 'data'), + State('homepage-is-submitted', 'data'), + State('coexpression-submitted-parameter-module', 'data'), + State('coexpression-is-submitted', 'data') + ) + def perform_module_enrichment(genomic_intervals, combined_gene_ids, submitted_addl_genes, + submitted_network, submitted_algo, homepage_is_submitted, submitted_parameter_module, coexpression_is_submitted): + if homepage_is_submitted: + if coexpression_is_submitted: + if submitted_algo and submitted_algo in submitted_parameter_module: + parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] + + enriched_modules = do_module_enrichment_analysis( + combined_gene_ids, genomic_intervals, submitted_addl_genes, submitted_network, submitted_algo, parameters) + + # Display statistics + num_enriched_modules = len(enriched_modules) + total_num_modules = count_modules( + submitted_network, submitted_algo, parameters) + stats = f'{num_enriched_modules} out of {total_num_modules} ' + if total_num_modules == 1: + stats += 'module ' + else: + stats += 'modules ' + + if num_enriched_modules == 1: + stats += 'was found to be enriched (adjusted p-value < 0.05).' + else: + stats += 'were found to be enriched (adjusted p-value < 0.05).' 
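# ---------------------------------------------------------------------------
# [Editor's illustration, not part of the patch] The 0.05 in the message above
# is hardcoded, while the enrichment analysis itself filters modules on
# Constants.P_VALUE_CUTOFF (see do_module_enrichment_analysis in
# callbacks/coexpression/util.py). A minimal sketch that keeps the displayed
# threshold in sync with the actual cutoff, assuming `const = Constants()` is
# in scope here via the wildcard import from .util:
#
#     stats += 'was ' if num_enriched_modules == 1 else 'were '
#     stats += f'found to be enriched (adjusted p-value < {const.P_VALUE_CUTOFF}).'
# ---------------------------------------------------------------------------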
+ + first_module = None + if enriched_modules: + first_module = enriched_modules[0] + else: + return enriched_modules, first_module, {'display': 'none'}, stats + + if submitted_parameter_module and submitted_algo in submitted_parameter_module: + if submitted_parameter_module[submitted_algo]['param_module']: + first_module = submitted_parameter_module[submitted_algo]['param_module'] + + return enriched_modules, first_module, {'display': 'block'}, stats + + raise PreventUpdate + + @app.callback( + Output('coexpression-pathways', 'data'), + Output('coexpression-pathways', 'columns'), + Output('coexpression-graph-stats', 'children'), + Output('coexpression-table-stats', 'children'), + + Output('coexpression-table-container', 'style'), + + Input('coexpression-combined-genes', 'data'), + Input('coexpression-submitted-network', 'data'), + Input('coexpression-submitted-clustering-algo', 'data'), + Input('coexpression-modules-pathway', 'active_tab'), + Input('coexpression-modules', 'value'), + State('coexpression-submitted-parameter-module', 'data'), + State('coexpression-is-submitted', 'data') + ) + def display_pathways(combined_gene_ids, + submitted_network, submitted_algo, active_tab, module, submitted_parameter_module, coexpression_is_submitted): + if coexpression_is_submitted: + if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: + parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] + + try: + module_idx = module.split(' ')[1] + table, _ = convert_to_df( + active_tab, module_idx, submitted_network, submitted_algo, parameters) + except Exception: + table, _ = convert_to_df( + active_tab, None, submitted_network, submitted_algo, parameters) + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in table.columns] + + num_enriched = get_num_unique_entries(table, 'ID') + if num_enriched == 1: + stats = f'This module is enriched in {num_enriched} {get_noun_for_active_tab(active_tab).singular}.' + else: + stats = f'This module is enriched in {num_enriched} {get_noun_for_active_tab(active_tab).plural}.' + + graph_stats = 'The selected module has ' + try: + total_num_genes, num_combined_gene_ids = count_genes_in_module( + combined_gene_ids, int(module_idx), submitted_network, submitted_algo, parameters) + except UnboundLocalError: + total_num_genes, num_combined_gene_ids = 0, 0 + + if total_num_genes == 1: + graph_stats += f'{total_num_genes} gene, of which {num_combined_gene_ids} ' + else: + graph_stats += f'{total_num_genes} genes, of which {num_combined_gene_ids} ' + + if num_combined_gene_ids == 1: + graph_stats += 'is implicated by your GWAS/QTL or part of the gene list you manually entered.' + else: + graph_stats += 'are implicated by your GWAS/QTL or part of the gene list you manually entered.' 
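# ---------------------------------------------------------------------------
# [Editor's suggestion, not part of the patch] The singular/plural branching
# above (and in the module statistics earlier) repeats the same pattern. A
# small helper -- the name `pluralize` is hypothetical, not an existing
# utility in this repo -- could express it once:
#
#     def pluralize(count, singular, plural):
#         return singular if count == 1 else plural
#
#     graph_stats += (f'{total_num_genes} '
#                     f'{pluralize(total_num_genes, "gene", "genes")}, of which '
#                     f'{num_combined_gene_ids} '
#                     f'{pluralize(num_combined_gene_ids, "is", "are")} ...')
# ---------------------------------------------------------------------------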
+ + if total_num_genes == 0: + return table.to_dict('records'), columns, graph_stats, stats, {'display': 'none'} + else: + return table.to_dict('records'), columns, graph_stats, stats, {'visibility': 'visible'} + + raise PreventUpdate + + @app.callback( + Output('coexpression-module-graph', 'elements', allow_duplicate=True), + Output('coexpression-module-graph', 'layout', allow_duplicate=True), + Output('coexpression-module-graph', 'style', allow_duplicate=True), + Output('coexpression-graph-container', 'style', allow_duplicate=True), + Output('coexpression-extra-bottom-div', 'style', allow_duplicate=True), + + Input('coexpression-combined-genes', 'data'), + Input('coexpression-modules', 'value'), + + State('coexpression-submitted-network', 'data'), + State('coexpression-submitted-clustering-algo', 'data'), + State('coexpression-submitted-parameter-module', 'data'), + + Input('coexpression-graph-layout', 'value'), + State('coexpression-is-submitted', 'data'), + + State('coexpression-modules', 'options'), + + Input('coexpression-reset-graph', 'n_clicks'), + + prevent_initial_call=True + ) + def display_table_graph(combined_gene_ids, module, submitted_network, submitted_algo, submitted_parameter_module, + layout, coexpression_is_submitted, modules, *_): + if coexpression_is_submitted: + if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: + parameters = submitted_parameter_module[submitted_algo]['param_slider_value'] + + if not modules: + module_graph = load_module_graph( + combined_gene_ids, None, submitted_network, submitted_algo, parameters, layout) + else: + module_graph = load_module_graph( + combined_gene_ids, module, submitted_network, submitted_algo, parameters, layout) + + # No enriched modules + if not modules: + return module_graph + ({'display': 'none'}, {'height': '0em'}) + + return module_graph + ({'visibility': 'visible', 'width': '100%', + 'height': '100vh'}, {'height': '1.5em'}) + + raise PreventUpdate + + @app.callback( + Output('coexpression-addl-genes-saved-input', + 'data', allow_duplicate=True), + Output('coexpression-network-saved-input', + 'data', allow_duplicate=True), + Output('coexpression-clustering-algo-saved-input', + 'data', allow_duplicate=True), + Output('coexpression-parameter-module-saved-input', + 'data', allow_duplicate=True), + + State('coexpression-addl-genes', 'value'), + Input('coexpression-network', 'value'), + Input('coexpression-clustering-algo', 'value'), + Input('coexpression-parameter-slider', 'value'), + State('coexpression-parameter-slider', 'marks'), + State('homepage-is-submitted', 'data'), + State('coexpression-parameter-module-saved-input', 'data'), + prevent_initial_call='True' + ) + def set_input_coexpression_session_state(addl_genes, network, algo, parameter_value, parameter_mark, homepage_is_submitted, input_parameter_module): + if homepage_is_submitted: + input_paramater_module_value = Input_parameter_module( + parameter_mark, parameter_value)._asdict() + + if input_parameter_module: + input_parameter_module[algo] = input_paramater_module_value + + else: + input_parameter_module = {algo: input_paramater_module_value} + + return addl_genes, network, algo, input_parameter_module + + raise PreventUpdate + + @app.callback( + Output('coexpression-submitted-parameter-module', + 'data', allow_duplicate=True), + + Input('coexpression-modules', 'value'), + Input('coexpression-graph-layout', 'value'), + Input('coexpression-modules-pathway', 'active_tab'), + + State('coexpression-submitted-network', 'data'), 
+ State('coexpression-submitted-clustering-algo', 'data'), + State('homepage-is-submitted', 'data'), + State('coexpression-submitted-parameter-module', 'data'), + prevent_initial_call=True + ) + def set_submitted_coexpression_session_state(module, layout, active_tab, submitted_network, submitted_algo, homepage_is_submitted, submitted_parameter_module): + if homepage_is_submitted: + if submitted_network and submitted_parameter_module and submitted_algo in submitted_parameter_module: + submitted_parameter_module[submitted_algo]['param_module'] = module + submitted_parameter_module[submitted_algo]['layout'] = layout + submitted_parameter_module[submitted_algo]['pathway_active_tab'] = active_tab + + return submitted_parameter_module + + raise PreventUpdate + + @app.callback( + Output('coexpression-addl-genes', 'value'), + + State('homepage-is-submitted', 'data'), + State('coexpression-addl-genes-saved-input', 'data'), + + Input('homepage-genomic-intervals-submitted-input', 'data') + ) + def display_submitted_addl_genes(homepage_is_submitted, addl_genes, *_): + if homepage_is_submitted: + if not addl_genes: + return '' + + return addl_genes + + raise PreventUpdate + + @app.callback( + Output('coexpression-network', 'value'), + + State('homepage-is-submitted', 'data'), + State('coexpression-network-saved-input', 'data'), + + Input('homepage-genomic-intervals-submitted-input', 'data') + ) + def display_selected_coexpression_network(homepage_is_submitted, network, *_): + if homepage_is_submitted: + if not network: + return 'OS-CX' + + return network + + raise PreventUpdate + + @app.callback( + Output('coexpression-clustering-algo', 'value'), + + State('homepage-is-submitted', 'data'), + State('coexpression-clustering-algo-saved-input', 'data'), + + Input('homepage-genomic-intervals-submitted-input', 'data') + ) + def get_selected_clustering_algo(homepage_is_submitted, algo, *_): + if homepage_is_submitted: + if not algo: + return 'clusterone' + + return algo + + raise PreventUpdate + + @app.callback( + Output('coexpression-graph-layout', 'value'), + Output('coexpression-modules-pathway', 'active_tab'), + + Input('coexpression-submitted-network', 'data'), + Input('coexpression-submitted-clustering-algo', 'data'), + State('coexpression-is-submitted', 'data'), + State('coexpression-submitted-parameter-module', 'data') + ) + def display_selected_graph_layout(submitted_network, submitted_algo, coexpression_is_submitted, submitted_parameter_module): + if coexpression_is_submitted: + if submitted_network and submitted_algo and submitted_algo in submitted_parameter_module: + layout = 'circle' + if submitted_parameter_module[submitted_algo]['layout']: + layout = submitted_parameter_module[submitted_algo]['layout'] + + active_tab = 'tab-0' + if submitted_parameter_module[submitted_algo]['pathway_active_tab']: + active_tab = submitted_parameter_module[submitted_algo]['pathway_active_tab'] + + return layout, active_tab + + raise PreventUpdate + + @app.callback( + Output('coexpression-input', 'children'), + Input('coexpression-is-submitted', 'data'), + State('coexpression-addl-genes', 'value'), + State('coexpression-network', 'value'), + State('coexpression-clustering-algo', 'value'), + State('coexpression-parameter-slider', 'value') + ) + def display_coexpression_submitted_input(coexpression_is_submitted, genes, network, algo, parameters): + if coexpression_is_submitted: + if not genes: + genes = 'None' + else: + genes = '; '.join( + list(filter(None, [gene.strip() for gene in genes.split(';')]))) + + return 
[html.B('Additional Genes: '), genes, + html.Br(), + html.B('Selected Co-Expression Network: '), get_user_facing_network( + network), + html.Br(), + html.B('Selected Module Detection Algorithm: '), get_user_facing_algo( + algo), + html.Br(), + html.B('Selected Algorithm Parameter: '), get_user_facing_parameter(algo, parameters)] + + raise PreventUpdate + + @app.callback( + Output('coexpression-clustering-algo-modal', 'is_open'), + Input('coexpression-clustering-algo-tooltip', 'n_clicks') + ) + def open_modals(tooltip_n_clicks): + if tooltip_n_clicks > 0: + return True + + @app.callback( + Output('coexpression-pathways', 'filter_query'), + Input('coexpression-modules-pathway', 'active_tab'), + Input('coexpression-reset-table', 'n_clicks') + ) + def reset_table_filters(*_): + return '' + + @app.callback( + Output('coexpression-download-df-to-csv', 'data'), + Input('coexpression-export-table', 'n_clicks'), + State('coexpression-pathways', 'data'), + State('homepage-genomic-intervals-submitted-input', 'data') + ) + def download_coexpression_table_to_csv(download_n_clicks, coexpression_df, genomic_intervals): + if download_n_clicks >= 1: + df = pd.DataFrame(coexpression_df) + return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Co-Expression Network Analysis Table.csv', index=False) + + raise PreventUpdate + + @app.callback( + Output('coexpression-download-graph-to-json', 'data'), + Input('coexpression-export-graph', 'n_clicks'), + State('coexpression-module-graph', 'elements'), + State('homepage-genomic-intervals-submitted-input', 'data') + ) + def download_coexpression_table_to_csv(download_n_clicks, coexpression_dict, genomic_intervals): + if download_n_clicks >= 1: + return dict(content='Hello world!', filename=f'[{genomic_intervals}] Co-Expression Network Analysis Graph.txt') + + raise PreventUpdate diff --git a/callbacks/coexpression/util.py b/callbacks/coexpression/util.py index 83f94608..8fe47a43 100644 --- a/callbacks/coexpression/util.py +++ b/callbacks/coexpression/util.py @@ -1,644 +1,644 @@ -from ..constants import Constants -from ..file_util import * -from ..general_util import * -from ..links_util import * -import os -import pickle - -import pandas as pd -import networkx as nx -from scipy.stats import fisher_exact, false_discovery_control - -from collections import namedtuple - -const = Constants() - -# Settings for the module detection algorithms: -# - multiplier: Value multiplied to the parameter to get the name of the directory -# For example, results of running clusterone at param=0.3 are saved in 30 -# - default_param: Default parameter of the module detection algorithm -# - low: User-facing display for the lowest parameter -# - high: User-facing display for the highest parameter - -Module_detection_algo = namedtuple('Module_detection_algo', [ - 'multiplier', 'default_param', 'low', 'high']) -module_detection_algos = { - 'clusterone': Module_detection_algo( - 100, 0.3, '1 (Looser Modules)', '4 (Denser Modules)'), - 'coach': Module_detection_algo( - 1000, 0.225, '1 (Looser Modules)', '4 (Denser Modules)'), - 'demon': Module_detection_algo( - 100, 0.25, '1 (Looser Modules)', '4 (Denser Modules)'), - 'fox': Module_detection_algo( - 100, 0.05, '1 (Looser Modules)', '4 (Denser Modules)'), -} - - -MODULE_DETECTION_ALGOS_VALUE_LABEL = [ - {'value': 'clusterone', 'label': 'ClusterONE', - 'label_id': 'clusterone'}, - {'value': 'coach', 'label': 'COACH', 'label_id': 'coach'}, - {'value': 'demon', 'label': 'DEMON', 'label_id': 'demon'}, - {'value': 'fox', 'label': 'FOX', 
'label_id': 'fox'} -] - -COEXPRESSION_NETWORKS_VALUE_LABEL = [ - {'value': 'OS-CX', 'label': 'RiceNet v2', 'label_id': 'os-cx'}, - {'value': 'RCRN', - 'label': 'Rice Combined Mutual Ranked Network (RCRN)', 'label_id': 'rcrn'}, -] - -Enrichment_tab = namedtuple('Enrichment_tab', ['enrichment', 'path']) -enrichment_tabs = [Enrichment_tab('Gene Ontology', 'ontology_enrichment/go'), - Enrichment_tab('Trait Ontology', 'ontology_enrichment/to'), - Enrichment_tab('Plant Ontology', 'ontology_enrichment/po'), - Enrichment_tab('Pathways (Over-Representation)', - 'pathway_enrichment/ora'), - Enrichment_tab('Pathway-Express', 'pathway_enrichment/pe'), - Enrichment_tab('SPIA', 'pathway_enrichment/spia')] - - -def get_user_facing_parameter(algo, parameter, network='OS-CX'): - parameters = sorted( - map(int, os.listdir(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}'))) - - return parameters.index(parameter) + 1 - - -def get_user_facing_algo(algo): - for entry in MODULE_DETECTION_ALGOS_VALUE_LABEL: - if entry['value'] == algo: - return entry['label'] - - -def get_user_facing_network(network): - for entry in COEXPRESSION_NETWORKS_VALUE_LABEL: - if entry['value'] == network: - return entry['label'] - - -def get_parameters_for_algo(algo, network='OS-CX'): - """ - Returns the user-facing parameters for the module detection algorithms - - Parameters: - - algo: Module detection algorithm - - network: Any of the coexpression networks supported by the app - - Returns: - - User-facing parameters for the module detection algorithms - """ - param_dict = {} - parameters = sorted( - map(int, os.listdir(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}'))) - - # Display the user-facing parameters for the module detection algorithms - for idx, parameter in enumerate(parameters): - if idx == 0: - param_dict[int(parameter)] = module_detection_algos[algo].low - elif idx == len(parameters) - 1: - param_dict[int(parameter)] = module_detection_algos[algo].high - else: - param_dict[int(parameter)] = str(idx + 1) - - return param_dict - -# ================================================= -# Utility functions for module enrichment analysis -# ================================================= - - -def create_module_enrichment_results_dir(genomic_intervals, addl_genes, network, algo, parameters): - """ - Writes the accessions of the GWAS-implicated genes to a file - - Parameters: - - genes: Accessions of the genes implicated by GWAS - - genomic_intervals: Genomic interval entered by the user - - network: Coexpression network - - algo: Module detection algorithm - - parameters: Parameter at which module detection algorithm is run - - Returns: - - Parent directory of the file to which the accessions of the GWAS-implicated genes are written - """ - if addl_genes: - temp_output_folder_dir = get_path_to_temp( - genomic_intervals, const.TEMP_COEXPRESSION, f'{shorten_name(addl_genes)}/{network}/{algo}/{parameters}') - else: - temp_output_folder_dir = get_path_to_temp( - genomic_intervals, const.TEMP_COEXPRESSION, f'{network}/{algo}/{parameters}') - - if not path_exists(temp_output_folder_dir): - make_dir(temp_output_folder_dir) - - return temp_output_folder_dir - - -def fetch_enriched_modules(output_dir): - """ - Fetches the enriched modules from the output file of the module enrichment analysis - - Parameters: - - output_dir: Parent directory of the output file of the module enrichment analysis - - Returns: - - Enriched modules (i.e., their respectives indices and adjust p-values) - """ - modules = [] - with 
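# For concreteness, a sketch of what get_parameters_for_algo returns, assuming
# the clusterone parameter directories on disk are named '30', '35', '40', and
# '45'. Only the two extremes receive the descriptive labels; the parameters in
# between are displayed as plain ranks:
#
#     get_parameters_for_algo('clusterone')
#     # {30: '1 (Looser Modules)', 35: '2', 40: '3', 45: '4 (Denser Modules)'}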
open(f'{output_dir}/enriched_modules.tsv') as modules_file: - for line in modules_file: - line = line.rstrip().split('\t') - idx = line[0] - p_value = float(line[1]) - - modules.append( - f'Module {idx} (Adj. p-value = {display_in_sci_notation(p_value)})') - - return modules - - -def do_module_enrichment_analysis(implicated_gene_ids, genomic_intervals, addl_genes, network, algo, parameters): - """ - Determine which modules are enriched given the set of GWAS-implicated genes - - Parameters: - - implicated_gene_ids: Accessions of the genes implicated by GWAS - - genomic_intervals: Genomic interval entered by the user - - network: Coexpression network - - algo: Module detection algorithm - - parameters: Parameter at which module detection algorithm is run - - Returns: - - Enriched modules (i.e., their respectives indices and adjust p-values) - """ - implicated_genes = set(implicated_gene_ids) - INPUT_GENES_DIR = create_module_enrichment_results_dir( - genomic_intervals, addl_genes, network, algo, parameters) - ENRICHED_MODULES_PATH = f'{INPUT_GENES_DIR}/enriched_modules.tsv' - - if not path_exists(ENRICHED_MODULES_PATH): - ENRICHED_MODULES_PATH_WITH_TIMESTAMP = append_timestamp_to_filename( - ENRICHED_MODULES_PATH) - MODULES_PATH = f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv' - - # ==================================================================================== - # This replicates the logic of running the universal enrichment function `enricher()` - # provided by clusterProfiler - # ==================================================================================== - - with open(MODULES_PATH) as modules_file, open(ENRICHED_MODULES_PATH_WITH_TIMESTAMP, 'w') as enriched_modules_file: - modules = [] - background_genes = set() - for idx, line in enumerate(modules_file): - module_genes = set(line.strip().split('\t')) - background_genes = background_genes.union(module_genes) - if implicated_genes.intersection(module_genes): - modules.append(idx) - - p_values_indices = [] - p_values = [] - modules_file.seek(0) - for idx, line in enumerate(modules_file): - if idx in modules: - module = line.strip().split('\t') - module_genes = set(module) - table = construct_contigency_table( - background_genes, implicated_genes, module_genes) - - p_values.append(fisher_exact( - table, alternative='greater').pvalue) - - # Add 1 since user-facing module number is one-based - p_values_indices.append(idx + 1) - - adj_p_values = false_discovery_control(p_values, method='bh') - significant_adj_p_values = [(p_values_indices[idx], adj_p_value) for idx, adj_p_value in enumerate( - adj_p_values) if adj_p_value < const.P_VALUE_CUTOFF] - significant_adj_p_values.sort(key=lambda x: x[1]) - significant_adj_p_values = [ - f'{ID}\t{adj_p_value}' for ID, adj_p_value in significant_adj_p_values] - - enriched_modules_file.write('\n'.join(significant_adj_p_values)) - - try: - os.replace(ENRICHED_MODULES_PATH_WITH_TIMESTAMP, - ENRICHED_MODULES_PATH) - except: - pass - - return fetch_enriched_modules(INPUT_GENES_DIR) - - -def construct_contigency_table(background_genes, implicated_genes, module_genes): - not_in_implicated = background_genes.difference(implicated_genes) - not_in_module = background_genes.difference(module_genes) - - in_implicated_in_module = len(implicated_genes.intersection(module_genes)) - in_implicated_not_in_module = len( - implicated_genes.intersection(not_in_module)) - - not_in_implicated_in_module = len( - not_in_implicated.intersection(module_genes)) - 
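# A sketch of the enriched_modules.tsv format produced and consumed above,
# assuming two modules passed the adjusted p-value cutoff. Each tab-separated
# line holds a one-based module index and its BH-adjusted p-value, sorted by
# p-value:
#
#     3	0.000124
#     17	0.004891
#
# fetch_enriched_modules then renders each line as, e.g.,
# 'Module 3 (Adj. p-value = 1.240000e-04)'.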
not_in_implicated_not_in_module = len( - not_in_implicated.intersection(not_in_module)) - - table = [[in_implicated_in_module, not_in_implicated_in_module], - [in_implicated_not_in_module, not_in_implicated_not_in_module]] - - return table - - -# =============================================================================================== -# Utility functions for the display of the tables showing the results of the enrichment analysis -# =============================================================================================== - - -def convert_transcript_to_msu_id(transcript_ids_str, network): - """ - Converts given KEGG transcript IDs to their respective MSU accessions. - - Parameters: - - transcript_ids_str: KEGG transcript IDs - - network: Coexpression network - - Returns: - - Equivalent MSU accessions of the KEGG transcript IDs - """ - with open(f'{const.GENE_ID_MAPPING}/{network}/transcript-to-msu-id.pickle', 'rb') as f: - mapping_dict = pickle.load(f) - - output_str = '' - transcript_ids = transcript_ids_str.split('\n') - for transcript_id in transcript_ids: - for msu_id in mapping_dict[transcript_id]: - output_str += f'{msu_id}\n({transcript_id})\n\n' - - # Remove trailing newline characters - return output_str[:-2] - - -def get_genes_in_module(module_idx, network, algo, parameters): - with open(f'{const.NETWORK_MODULES}/{network}/transcript/{algo}/{parameters}/{algo}-module-list.tsv') as f: - for idx, module in enumerate(f): - if idx + 1 == int(module_idx): - return set(module.split('\t')) - - -def get_genes_in_pathway(pathway_id, network): - with open(f'{const.ENRICHMENT_ANALYSIS}/{network}/{const.KEGG_DOSA_GENESET}', 'rb') as f: - genes_in_pathway = pickle.load(f) - - return genes_in_pathway[pathway_id] - - -def get_genes_in_module_and_pathway(pathway_id, module_idx, network, algo, parameters): - return '\n'.join(list(get_genes_in_pathway(pathway_id, network).intersection( - get_genes_in_module(module_idx, network, algo, parameters)))) - - -def get_kegg_pathway_name(pathway_id, network): - with open(f'{const.ENRICHMENT_ANALYSIS}/{network}/{const.KEGG_DOSA_PATHWAY_NAMES}') as pathways: - for line in pathways: - line = line.split('\t') - if line[0].rstrip() == pathway_id: - return line[1].strip() - - -def remove_rap_db_info_in_pathway_name(pathway_name): - return pathway_name[:-len(' - Oryza sativa japonica (Japanese rice) (RAPDB)')] - -# ======================================================================================= -# Functions for the display of the tables showing the results of the enrichment analysis -# ======================================================================================= - - -def convert_to_df_go(result): - cols = ['ID', 'Gene Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - # Prettify display of genes - result['Genes'] = result['Genes'].str.split('/').str.join('\n') - - result['ID'] = get_go_link(result, 'ID') - - result = result.sort_values('Adj. p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df_to(result): - cols = ['ID', 'Trait Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. 
p-value', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - # Prettify display of genes - result['Genes'] = result['Genes'].str.split('/').str.join('\n') - - result['ID'] = get_to_po_link(result, 'ID') - - result = result.sort_values('Adj. p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df_po(result): - cols = ['ID', 'Plant Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - # Prettify display of genes - result['Genes'] = result['Genes'].str.split('/').str.join('\n') - - result['ID'] = get_to_po_link(result, 'ID') - - result = result.sort_values('Adj. p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df_ora(result, network): - cols = ['ID', 'KEGG Pathway', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - result['KEGG Pathway'] = result['KEGG Pathway'].apply( - remove_rap_db_info_in_pathway_name) - - # Construct link before appending the MSU accession - result['ID'] = get_kegg_link(result, 'ID', 'Genes') - - # Prettify display of genes and convert to MSU accessions - result['Genes'] = result['Genes'].str.split( - '/').str.join('\n') - result['Genes'] = result.apply( - lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) - - result = result.sort_values('Adj. p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df_pe(result, module_idx, network, algo, parameters): - cols = ['ID', 'KEGG Pathway', 'ORA p-value', 'Perturbation p-value', 'Combined p-value', - 'Adj. ORA p-value', 'Adj. Perturbation p-value', - 'Adj. Combined p-value', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - result = result.loc[result['Adj. Combined p-value'] < const.P_VALUE_CUTOFF] - - # IMPORTANT: Do not change ordering of instructions - - # Prettify display of ID - result['ID'] = result['ID'].str[len('path:'):] - - result['KEGG Pathway'] = result.apply( - lambda x: get_kegg_pathway_name(x['ID'], network), axis=1) - result['KEGG Pathway'] = result['KEGG Pathway'].apply( - remove_rap_db_info_in_pathway_name) - - result['Genes'] = result.apply(lambda x: get_genes_in_module_and_pathway( - x['ID'], module_idx, network, algo, parameters), axis=1) - - # Construct link before appending the MSU accession - result['ID'] = get_kegg_link(result, 'ID', 'Genes') - - result['Genes'] = result.apply( - lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) - - result = result.sort_values('Adj. Combined p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df_spia(result, network): - cols = ['ID', 'KEGG Pathway', 'ORA p-value', 'Total Acc. Perturbation', 'Perturbation p-value', 'Combined p-value', - 'Adj. Combined p-value', 'Pathway Status', 'Genes'] - - if result.empty: - return create_empty_df_with_cols(cols) - - result = result.loc[result['Adj. Combined p-value'] < const.P_VALUE_CUTOFF] - - # Prettify display of ID - result['ID'] = 'dosa' + result['ID'] - result['Total Acc. 
Perturbation'] = result['tA'] - - # Prettify display of genes and convert to MSU accessions - result['Genes'] = result['View on KEGG'].apply( - get_genes_from_kegg_link) - - # Construct link before appending the MSU accession - result['ID'] = get_kegg_link(result, 'ID', 'Genes') - - result['Genes'] = result.apply( - lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) - - result = result.sort_values('Adj. Combined p-value') - - display_cols_in_sci_notation( - result, [col for col in cols if 'p-value' in col]) - - return result[cols].dropna() - - -def convert_to_df(active_tab, module_idx, network, algo, parameters): - """ - Returns the results of ontology and pathway enrichment analysis as a data frame - - Parameters: - - active_tab: ID of the tab corresponding to the selected enrichment analysis - - module_idx: Index of the selected module - - network: Coexpression network - - algo: Module detection algorithm - - parameters: Parameter at which module detection algorithm is run - - Returns: - - Data frame containing the results of ontology and pathway enrichment analysis - - True if the data frame is empty; False, otherwise - """ - dir = enrichment_tabs[get_tab_index(active_tab)].path - enrichment_type = dir.split('/')[-1] - - file = f'{const.ENRICHMENT_ANALYSIS}/{network}/output/{algo}/{parameters}/{dir}/results/{enrichment_type}-df-{module_idx}.tsv' - - columns = {'go': ['ID', 'Gene Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], - 'to': ['ID', 'Trait Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], - 'po': ['ID', 'Plant Ontology Term', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], - 'ora': ['ID', 'KEGG Pathway', 'Gene Ratio', - 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], - 'pe': ['ID', 'totalAcc', 'totalPert', 'totalAccNorm', 'totalPertNorm', - 'Perturbation p-value', 'pAcc', 'ORA p-value', 'Combined p-value', - 'Adj. Perturbation p-value', 'Adj. Accumulation p-value', - 'Adj. ORA p-value', 'Adj. Combined p-value'], - 'spia': ['KEGG Pathway', 'ID', 'pSize', 'NDE', 'ORA p-value', 'tA', - 'Perturbation p-value', 'Combined p-value', 'Adj. Combined p-value', - 'Adj. 
Combined p-value (Bonferroni)', 'Pathway Status', 'View on KEGG']} - - try: - result = pd.read_csv(file, delimiter='\t', - names=columns[enrichment_type], skiprows=1) - - # SPIA is a special case - if enrichment_type.lower() == 'spia': - # Add dtype argument to preserve leading 0 in KEGG pathway ID - result = pd.read_csv(file, delimiter='\t', - names=columns[enrichment_type], skiprows=1, dtype={'ID': object}) - - empty = result.empty - except: - result = pd.DataFrame() - empty = True - - # Return results data frame and whether it is empty - if enrichment_type == 'go': - return convert_to_df_go(result), empty - - elif enrichment_type == 'to': - return convert_to_df_to(result), empty - - elif enrichment_type == 'po': - return convert_to_df_po(result), empty - - elif enrichment_type == 'ora': - return convert_to_df_ora(result, network), empty - - elif enrichment_type == 'pe': - return convert_to_df_pe(result, module_idx, network, algo, parameters), empty - - elif enrichment_type == 'spia': - return convert_to_df_spia(result, network), empty - - -def convert_module_to_edge_list(module, network_file, output_dir, filename): - module = set(module) - selected_nodes = set() - with open(network_file) as network, open(f'{output_dir}/{filename}', 'w') as output: - for edge in network: - edge = edge.rstrip() - nodes = edge.split('\t') - - if nodes[0] in module and nodes[1] in module: - selected_nodes.add(nodes[0]) - selected_nodes.add(nodes[1]) - output.write(f'{nodes[0]}\t{nodes[1]}\n') - - assert len(selected_nodes - module) == 0 - - -def convert_modules_to_edgelist(network_file, module_file, module_index, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(module_file) as modules: - for idx, module in enumerate(modules): - if idx == module_index - 1: - module = module.rstrip() - module = module.split('\t') - filename = f'module-{idx + 1}.tsv' - convert_module_to_edge_list( - module, network_file, output_dir, filename) - - break - - -def load_module_graph(implicated_gene_ids, module, network, algo, parameters, layout): - """ - Displays the subgraph induced by the module - - Parameters: - - implicated_gene_ids: Accessions of the genes implicated by GWAS - - module: Gene module - - network: Coexpression network - - algo: Module detection algorithm - - parameters: Parameter at which module detection algorithm is run - - layout: Layout of the graph display - - Returns: - - Elements (nodes and edges) of the graph - - Dictionary storing the layout of the graph - - Dictionary storing the visibility, width, and height of the graph - """ - try: - # Ignore the word "Module" at the start - module_idx = int(module.split(' ')[1]) - OUTPUT_DIR = f'{const.TEMP}/{network}/{algo}/modules/{parameters}' - coexpress_nw = f'{OUTPUT_DIR}/module-{module_idx}.tsv' - - if not path_exists(coexpress_nw): - NETWORK_FILE = f'{const.NETWORKS}/{network}.txt' - MODULE_FILE = f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv' - - convert_modules_to_edgelist( - NETWORK_FILE, MODULE_FILE, module_idx, OUTPUT_DIR) - - G = nx.read_edgelist(coexpress_nw, data=(('coexpress', float))) - - # Highlight the GWAS-implicated genes - elements = nx.cytoscape_data(G)['elements'] - for node in elements['nodes']: - if node['data']['id'] in implicated_gene_ids: - node['classes'] = 'shaded' - - return elements, {'name': layout}, {'visibility': 'visible', 'width': '100%', 'height': '100vh'} - - # Triggered when there are no enriched modules - except: - return {}, {'name': layout}, 
{'display': 'none', 'width': '100%', 'height': '100vh'} - -# ==================================== -# Functions for displaying statistics -# ==================================== - - -def count_modules(network, algo, parameters): - with open(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv') as f: - return len(f.readlines()) - - -Noun = namedtuple('Noun', ['singular', 'plural']) - - -def get_noun_for_active_tab(active_tab): - tab_idx = get_tab_index(active_tab) - if 0 <= tab_idx and tab_idx <= 2: - return Noun('ontology term', 'ontology terms') - else: - return Noun('pathway', 'pathways') - - -def count_genes_in_module(implicated_genes, module_idx, network, algo, parameters): - with open(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv') as modules: - for idx, module in enumerate(modules): - if idx == module_idx - 1: - module_genes = module.strip().split('\t') - return len(module_genes), len(set.intersection(set(module_genes), set(implicated_genes))) +from ..constants import Constants +from ..file_util import * +from ..general_util import * +from ..links_util import * +import os +import pickle + +import pandas as pd +import networkx as nx +from scipy.stats import fisher_exact, false_discovery_control + +from collections import namedtuple + +const = Constants() + +# Settings for the module detection algorithms: +# - multiplier: Value multiplied to the parameter to get the name of the directory +# For example, results of running clusterone at param=0.3 are saved in 30 +# - default_param: Default parameter of the module detection algorithm +# - low: User-facing display for the lowest parameter +# - high: User-facing display for the highest parameter + +Module_detection_algo = namedtuple('Module_detection_algo', [ + 'multiplier', 'default_param', 'low', 'high']) +module_detection_algos = { + 'clusterone': Module_detection_algo( + 100, 0.3, '1 (Looser Modules)', '4 (Denser Modules)'), + 'coach': Module_detection_algo( + 1000, 0.225, '1 (Looser Modules)', '4 (Denser Modules)'), + 'demon': Module_detection_algo( + 100, 0.25, '1 (Looser Modules)', '4 (Denser Modules)'), + 'fox': Module_detection_algo( + 100, 0.05, '1 (Looser Modules)', '4 (Denser Modules)'), +} + + +MODULE_DETECTION_ALGOS_VALUE_LABEL = [ + {'value': 'clusterone', 'label': 'ClusterONE', + 'label_id': 'clusterone'}, + {'value': 'coach', 'label': 'COACH', 'label_id': 'coach'}, + {'value': 'demon', 'label': 'DEMON', 'label_id': 'demon'}, + {'value': 'fox', 'label': 'FOX', 'label_id': 'fox'} +] + +COEXPRESSION_NETWORKS_VALUE_LABEL = [ + {'value': 'OS-CX', 'label': 'RiceNet v2', 'label_id': 'os-cx'}, + {'value': 'RCRN', + 'label': 'Rice Combined Mutual Ranked Network (RCRN)', 'label_id': 'rcrn'}, +] + +Enrichment_tab = namedtuple('Enrichment_tab', ['enrichment', 'path']) +enrichment_tabs = [Enrichment_tab('Gene Ontology', 'ontology_enrichment/go'), + Enrichment_tab('Trait Ontology', 'ontology_enrichment/to'), + Enrichment_tab('Plant Ontology', 'ontology_enrichment/po'), + Enrichment_tab('Pathways (Over-Representation)', + 'pathway_enrichment/ora'), + Enrichment_tab('Pathway-Express', 'pathway_enrichment/pe'), + Enrichment_tab('SPIA', 'pathway_enrichment/spia')] + + +def get_user_facing_parameter(algo, parameter, network='OS-CX'): + parameters = sorted( + map(int, os.listdir(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}'))) + + return parameters.index(parameter) + 1 + + +def get_user_facing_algo(algo): + for entry in MODULE_DETECTION_ALGOS_VALUE_LABEL: + if entry['value'] 
== algo:
+ return entry['label']
+
+
+def get_user_facing_network(network):
+ for entry in COEXPRESSION_NETWORKS_VALUE_LABEL:
+ if entry['value'] == network:
+ return entry['label']
+
+
+def get_parameters_for_algo(algo, network='OS-CX'):
+ """
+ Returns the user-facing parameters for the module detection algorithms
+
+ Parameters:
+ - algo: Module detection algorithm
+ - network: Any of the coexpression networks supported by the app
+
+ Returns:
+ - User-facing parameters for the module detection algorithms
+ """
+ param_dict = {}
+ parameters = sorted(
+ map(int, os.listdir(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}')))
+
+ # Display the user-facing parameters for the module detection algorithms
+ for idx, parameter in enumerate(parameters):
+ if idx == 0:
+ param_dict[int(parameter)] = module_detection_algos[algo].low
+ elif idx == len(parameters) - 1:
+ param_dict[int(parameter)] = module_detection_algos[algo].high
+ else:
+ param_dict[int(parameter)] = str(idx + 1)
+
+ return param_dict
+
+# =================================================
+# Utility functions for module enrichment analysis
+# =================================================
+
+
+def create_module_enrichment_results_dir(genomic_intervals, addl_genes, network, algo, parameters):
+ """
+ Creates (if needed) the directory to which the results of the module enrichment analysis are written
+
+ Parameters:
+ - genomic_intervals: Genomic interval entered by the user
+ - addl_genes: Additional genes entered by the user
+ - network: Coexpression network
+ - algo: Module detection algorithm
+ - parameters: Parameter at which module detection algorithm is run
+
+ Returns:
+ - Parent directory of the file to which the accessions of the GWAS-implicated genes are written
+ """
+ if addl_genes:
+ temp_output_folder_dir = get_path_to_temp(
+ genomic_intervals, const.TEMP_COEXPRESSION, f'{shorten_name(addl_genes)}/{network}/{algo}/{parameters}')
+ else:
+ temp_output_folder_dir = get_path_to_temp(
+ genomic_intervals, const.TEMP_COEXPRESSION, f'{network}/{algo}/{parameters}')
+
+ if not path_exists(temp_output_folder_dir):
+ make_dir(temp_output_folder_dir)
+
+ return temp_output_folder_dir
+
+
+def fetch_enriched_modules(output_dir):
+ """
+ Fetches the enriched modules from the output file of the module enrichment analysis
+
+ Parameters:
+ - output_dir: Parent directory of the output file of the module enrichment analysis
+
+ Returns:
+ - Enriched modules (i.e., their respective indices and adjusted p-values)
+ """
+ modules = []
+ with open(f'{output_dir}/enriched_modules.tsv') as modules_file:
+ for line in modules_file:
+ line = line.rstrip().split('\t')
+ idx = line[0]
+ p_value = float(line[1])
+
+ modules.append(
+ f'Module {idx} (Adj.
p-value = {display_in_sci_notation(p_value)})')
+
+ return modules
+
+
+def do_module_enrichment_analysis(implicated_gene_ids, genomic_intervals, addl_genes, network, algo, parameters):
+ """
+ Determines which modules are enriched given the set of GWAS-implicated genes
+
+ Parameters:
+ - implicated_gene_ids: Accessions of the genes implicated by GWAS
+ - genomic_intervals: Genomic interval entered by the user
+ - network: Coexpression network
+ - algo: Module detection algorithm
+ - parameters: Parameter at which module detection algorithm is run
+
+ Returns:
+ - Enriched modules (i.e., their respective indices and adjusted p-values)
+ """
+ implicated_genes = set(implicated_gene_ids)
+ INPUT_GENES_DIR = create_module_enrichment_results_dir(
+ genomic_intervals, addl_genes, network, algo, parameters)
+ ENRICHED_MODULES_PATH = f'{INPUT_GENES_DIR}/enriched_modules.tsv'
+
+ if not path_exists(ENRICHED_MODULES_PATH):
+ ENRICHED_MODULES_PATH_WITH_TIMESTAMP = append_timestamp_to_filename(
+ ENRICHED_MODULES_PATH)
+ MODULES_PATH = f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv'
+
+ # ====================================================================================
+ # This replicates the logic of running the universal enrichment function `enricher()`
+ # provided by clusterProfiler
+ # ====================================================================================
+
+ with open(MODULES_PATH) as modules_file, open(ENRICHED_MODULES_PATH_WITH_TIMESTAMP, 'w') as enriched_modules_file:
+ modules = []
+ background_genes = set()
+ for idx, line in enumerate(modules_file):
+ module_genes = set(line.strip().split('\t'))
+ background_genes = background_genes.union(module_genes)
+ if implicated_genes.intersection(module_genes):
+ modules.append(idx)
+
+ p_values_indices = []
+ p_values = []
+ modules_file.seek(0)
+ for idx, line in enumerate(modules_file):
+ if idx in modules:
+ module = line.strip().split('\t')
+ module_genes = set(module)
+ table = construct_contigency_table(
+ background_genes, implicated_genes, module_genes)
+
+ p_values.append(fisher_exact(
+ table, alternative='greater').pvalue)
+
+ # Add 1 since user-facing module number is one-based
+ p_values_indices.append(idx + 1)
+
+ adj_p_values = false_discovery_control(p_values, method='bh')
+ significant_adj_p_values = [(p_values_indices[idx], adj_p_value) for idx, adj_p_value in enumerate(
+ adj_p_values) if adj_p_value < const.P_VALUE_CUTOFF]
+ significant_adj_p_values.sort(key=lambda x: x[1])
+ significant_adj_p_values = [
+ f'{ID}\t{adj_p_value}' for ID, adj_p_value in significant_adj_p_values]
+
+ enriched_modules_file.write('\n'.join(significant_adj_p_values))
+
+ try:
+ os.replace(ENRICHED_MODULES_PATH_WITH_TIMESTAMP,
+ ENRICHED_MODULES_PATH)
+ except:
+ pass
+
+ return fetch_enriched_modules(INPUT_GENES_DIR)
+
+
+def construct_contigency_table(background_genes, implicated_genes, module_genes):
+ not_in_implicated = background_genes.difference(implicated_genes)
+ not_in_module = background_genes.difference(module_genes)
+
+ in_implicated_in_module = len(implicated_genes.intersection(module_genes))
+ in_implicated_not_in_module = len(
+ implicated_genes.intersection(not_in_module))
+
+ not_in_implicated_in_module = len(
+ not_in_implicated.intersection(module_genes))
+ not_in_implicated_not_in_module = len(
+ not_in_implicated.intersection(not_in_module))
+
+ table = [[in_implicated_in_module, not_in_implicated_in_module],
+ [in_implicated_not_in_module, not_in_implicated_not_in_module]]
+
+
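# A worked toy example of the test above: assume a background of 20 genes,
# 5 GWAS-implicated genes, and a module of 4 genes of which 3 are implicated.
# Rows of the 2x2 table are module membership; columns are implication status:
#
#     from scipy.stats import fisher_exact, false_discovery_control
#
#     table = [[3, 1],    # in module:     implicated, not implicated
#              [2, 14]]   # not in module: implicated, not implicated
#     p = fisher_exact(table, alternative='greater').pvalue   # ~0.032
#
#     # Benjamini-Hochberg correction is then applied across all tested modules
#     false_discovery_control([p, 0.2, 0.7], method='bh')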
return table + + +# =============================================================================================== +# Utility functions for the display of the tables showing the results of the enrichment analysis +# =============================================================================================== + + +def convert_transcript_to_msu_id(transcript_ids_str, network): + """ + Converts given KEGG transcript IDs to their respective MSU accessions. + + Parameters: + - transcript_ids_str: KEGG transcript IDs + - network: Coexpression network + + Returns: + - Equivalent MSU accessions of the KEGG transcript IDs + """ + with open(f'{const.GENE_ID_MAPPING}/{network}/transcript-to-msu-id.pickle', 'rb') as f: + mapping_dict = pickle.load(f) + + output_str = '' + transcript_ids = transcript_ids_str.split('\n') + for transcript_id in transcript_ids: + for msu_id in mapping_dict[transcript_id]: + output_str += f'{msu_id}\n({transcript_id})\n\n' + + # Remove trailing newline characters + return output_str[:-2] + + +def get_genes_in_module(module_idx, network, algo, parameters): + with open(f'{const.NETWORK_MODULES}/{network}/transcript/{algo}/{parameters}/{algo}-module-list.tsv') as f: + for idx, module in enumerate(f): + if idx + 1 == int(module_idx): + return set(module.split('\t')) + + +def get_genes_in_pathway(pathway_id, network): + with open(f'{const.ENRICHMENT_ANALYSIS}/{network}/{const.KEGG_DOSA_GENESET}', 'rb') as f: + genes_in_pathway = pickle.load(f) + + return genes_in_pathway[pathway_id] + + +def get_genes_in_module_and_pathway(pathway_id, module_idx, network, algo, parameters): + return '\n'.join(list(get_genes_in_pathway(pathway_id, network).intersection( + get_genes_in_module(module_idx, network, algo, parameters)))) + + +def get_kegg_pathway_name(pathway_id, network): + with open(f'{const.ENRICHMENT_ANALYSIS}/{network}/{const.KEGG_DOSA_PATHWAY_NAMES}') as pathways: + for line in pathways: + line = line.split('\t') + if line[0].rstrip() == pathway_id: + return line[1].strip() + + +def remove_rap_db_info_in_pathway_name(pathway_name): + return pathway_name[:-len(' - Oryza sativa japonica (Japanese rice) (RAPDB)')] + +# ======================================================================================= +# Functions for the display of the tables showing the results of the enrichment analysis +# ======================================================================================= + + +def convert_to_df_go(result): + cols = ['ID', 'Gene Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + # Prettify display of genes + result['Genes'] = result['Genes'].str.split('/').str.join('\n') + + result['ID'] = get_go_link(result, 'ID') + + result = result.sort_values('Adj. p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df_to(result): + cols = ['ID', 'Trait Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + # Prettify display of genes + result['Genes'] = result['Genes'].str.split('/').str.join('\n') + + result['ID'] = get_to_po_link(result, 'ID') + + result = result.sort_values('Adj. 
p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df_po(result): + cols = ['ID', 'Plant Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + # Prettify display of genes + result['Genes'] = result['Genes'].str.split('/').str.join('\n') + + result['ID'] = get_to_po_link(result, 'ID') + + result = result.sort_values('Adj. p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df_ora(result, network): + cols = ['ID', 'KEGG Pathway', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + result['KEGG Pathway'] = result['KEGG Pathway'].apply( + remove_rap_db_info_in_pathway_name) + + # Construct link before appending the MSU accession + result['ID'] = get_kegg_link(result, 'ID', 'Genes') + + # Prettify display of genes and convert to MSU accessions + result['Genes'] = result['Genes'].str.split( + '/').str.join('\n') + result['Genes'] = result.apply( + lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) + + result = result.sort_values('Adj. p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df_pe(result, module_idx, network, algo, parameters): + cols = ['ID', 'KEGG Pathway', 'ORA p-value', 'Perturbation p-value', 'Combined p-value', + 'Adj. ORA p-value', 'Adj. Perturbation p-value', + 'Adj. Combined p-value', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + result = result.loc[result['Adj. Combined p-value'] < const.P_VALUE_CUTOFF] + + # IMPORTANT: Do not change ordering of instructions + + # Prettify display of ID + result['ID'] = result['ID'].str[len('path:'):] + + result['KEGG Pathway'] = result.apply( + lambda x: get_kegg_pathway_name(x['ID'], network), axis=1) + result['KEGG Pathway'] = result['KEGG Pathway'].apply( + remove_rap_db_info_in_pathway_name) + + result['Genes'] = result.apply(lambda x: get_genes_in_module_and_pathway( + x['ID'], module_idx, network, algo, parameters), axis=1) + + # Construct link before appending the MSU accession + result['ID'] = get_kegg_link(result, 'ID', 'Genes') + + result['Genes'] = result.apply( + lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) + + result = result.sort_values('Adj. Combined p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df_spia(result, network): + cols = ['ID', 'KEGG Pathway', 'ORA p-value', 'Total Acc. Perturbation', 'Perturbation p-value', 'Combined p-value', + 'Adj. Combined p-value', 'Pathway Status', 'Genes'] + + if result.empty: + return create_empty_df_with_cols(cols) + + result = result.loc[result['Adj. Combined p-value'] < const.P_VALUE_CUTOFF] + + # Prettify display of ID + result['ID'] = 'dosa' + result['ID'] + result['Total Acc. 
Perturbation'] = result['tA'] + + # Prettify display of genes and convert to MSU accessions + result['Genes'] = result['View on KEGG'].apply( + get_genes_from_kegg_link) + + # Construct link before appending the MSU accession + result['ID'] = get_kegg_link(result, 'ID', 'Genes') + + result['Genes'] = result.apply( + lambda x: convert_transcript_to_msu_id(x['Genes'], network), axis=1) + + result = result.sort_values('Adj. Combined p-value') + + display_cols_in_sci_notation( + result, [col for col in cols if 'p-value' in col]) + + return result[cols].dropna() + + +def convert_to_df(active_tab, module_idx, network, algo, parameters): + """ + Returns the results of ontology and pathway enrichment analysis as a data frame + + Parameters: + - active_tab: ID of the tab corresponding to the selected enrichment analysis + - module_idx: Index of the selected module + - network: Coexpression network + - algo: Module detection algorithm + - parameters: Parameter at which module detection algorithm is run + + Returns: + - Data frame containing the results of ontology and pathway enrichment analysis + - True if the data frame is empty; False, otherwise + """ + dir = enrichment_tabs[get_tab_index(active_tab)].path + enrichment_type = dir.split('/')[-1] + + file = f'{const.ENRICHMENT_ANALYSIS}/{network}/output/{algo}/{parameters}/{dir}/results/{enrichment_type}-df-{module_idx}.tsv' + + columns = {'go': ['ID', 'Gene Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], + 'to': ['ID', 'Trait Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], + 'po': ['ID', 'Plant Ontology Term', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], + 'ora': ['ID', 'KEGG Pathway', 'Gene Ratio', + 'BG Ratio', 'p-value', 'Adj. p-value', 'q-value', 'Genes', 'Count'], + 'pe': ['ID', 'totalAcc', 'totalPert', 'totalAccNorm', 'totalPertNorm', + 'Perturbation p-value', 'pAcc', 'ORA p-value', 'Combined p-value', + 'Adj. Perturbation p-value', 'Adj. Accumulation p-value', + 'Adj. ORA p-value', 'Adj. Combined p-value'], + 'spia': ['KEGG Pathway', 'ID', 'pSize', 'NDE', 'ORA p-value', 'tA', + 'Perturbation p-value', 'Combined p-value', 'Adj. Combined p-value', + 'Adj. 
Combined p-value (Bonferroni)', 'Pathway Status', 'View on KEGG']} + + try: + result = pd.read_csv(file, delimiter='\t', + names=columns[enrichment_type], skiprows=1) + + # SPIA is a special case + if enrichment_type.lower() == 'spia': + # Add dtype argument to preserve leading 0 in KEGG pathway ID + result = pd.read_csv(file, delimiter='\t', + names=columns[enrichment_type], skiprows=1, dtype={'ID': object}) + + empty = result.empty + except: + result = pd.DataFrame() + empty = True + + # Return results data frame and whether it is empty + if enrichment_type == 'go': + return convert_to_df_go(result), empty + + elif enrichment_type == 'to': + return convert_to_df_to(result), empty + + elif enrichment_type == 'po': + return convert_to_df_po(result), empty + + elif enrichment_type == 'ora': + return convert_to_df_ora(result, network), empty + + elif enrichment_type == 'pe': + return convert_to_df_pe(result, module_idx, network, algo, parameters), empty + + elif enrichment_type == 'spia': + return convert_to_df_spia(result, network), empty + + +def convert_module_to_edge_list(module, network_file, output_dir, filename): + module = set(module) + selected_nodes = set() + with open(network_file) as network, open(f'{output_dir}/{filename}', 'w') as output: + for edge in network: + edge = edge.rstrip() + nodes = edge.split('\t') + + if nodes[0] in module and nodes[1] in module: + selected_nodes.add(nodes[0]) + selected_nodes.add(nodes[1]) + output.write(f'{nodes[0]}\t{nodes[1]}\n') + + assert len(selected_nodes - module) == 0 + + +def convert_modules_to_edgelist(network_file, module_file, module_index, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(module_file) as modules: + for idx, module in enumerate(modules): + if idx == module_index - 1: + module = module.rstrip() + module = module.split('\t') + filename = f'module-{idx + 1}.tsv' + convert_module_to_edge_list( + module, network_file, output_dir, filename) + + break + + +def load_module_graph(implicated_gene_ids, module, network, algo, parameters, layout): + """ + Displays the subgraph induced by the module + + Parameters: + - implicated_gene_ids: Accessions of the genes implicated by GWAS + - module: Gene module + - network: Coexpression network + - algo: Module detection algorithm + - parameters: Parameter at which module detection algorithm is run + - layout: Layout of the graph display + + Returns: + - Elements (nodes and edges) of the graph + - Dictionary storing the layout of the graph + - Dictionary storing the visibility, width, and height of the graph + """ + try: + # Ignore the word "Module" at the start + module_idx = int(module.split(' ')[1]) + OUTPUT_DIR = f'{const.TEMP}/{network}/{algo}/modules/{parameters}' + coexpress_nw = f'{OUTPUT_DIR}/module-{module_idx}.tsv' + + if not path_exists(coexpress_nw): + NETWORK_FILE = f'{const.NETWORKS}/{network}.txt' + MODULE_FILE = f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv' + + convert_modules_to_edgelist( + NETWORK_FILE, MODULE_FILE, module_idx, OUTPUT_DIR) + + G = nx.read_edgelist(coexpress_nw, data=(('coexpress', float))) + + # Highlight the GWAS-implicated genes + elements = nx.cytoscape_data(G)['elements'] + for node in elements['nodes']: + if node['data']['id'] in implicated_gene_ids: + node['classes'] = 'shaded' + + return elements, {'name': layout}, {'visibility': 'visible', 'width': '100%', 'height': '100vh'} + + # Triggered when there are no enriched modules + except: + return {}, {'name': layout}, 
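# A minimal sketch of the edge-list-to-Cytoscape conversion performed above,
# assuming a hypothetical module file 'module-1.tsv' and that the gene
# 'LOC_Os01g01010' is among the GWAS-implicated accessions:
#
#     import networkx as nx
#
#     G = nx.read_edgelist('module-1.tsv')          # whitespace-separated node pairs
#     elements = nx.cytoscape_data(G)['elements']   # {'nodes': [...], 'edges': [...]}
#     for node in elements['nodes']:
#         if node['data']['id'] == 'LOC_Os01g01010':
#             node['classes'] = 'shaded'            # styled by the Cytoscape stylesheet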
{'display': 'none', 'width': '100%', 'height': '100vh'} + +# ==================================== +# Functions for displaying statistics +# ==================================== + + +def count_modules(network, algo, parameters): + with open(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv') as f: + return len(f.readlines()) + + +Noun = namedtuple('Noun', ['singular', 'plural']) + + +def get_noun_for_active_tab(active_tab): + tab_idx = get_tab_index(active_tab) + if 0 <= tab_idx and tab_idx <= 2: + return Noun('ontology term', 'ontology terms') + else: + return Noun('pathway', 'pathways') + + +def count_genes_in_module(implicated_genes, module_idx, network, algo, parameters): + with open(f'{const.NETWORK_MODULES}/{network}/MSU/{algo}/{parameters}/{algo}-module-list.tsv') as modules: + for idx, module in enumerate(modules): + if idx == module_idx - 1: + module_genes = module.strip().split('\t') + return len(module_genes), len(set.intersection(set(module_genes), set(implicated_genes))) diff --git a/callbacks/constants.py b/callbacks/constants.py index ba7fd54d..1b559238 100644 --- a/callbacks/constants.py +++ b/callbacks/constants.py @@ -1,64 +1,64 @@ -class Constants(object): - LIFT_OVER = 'lift-over' - COEXPRESSION = 'co-expression' - TFBS = 'tf-enrichment' - IGV = 'browse-loci' - TEXT_MINING = 'text-mining' - - DATA = 'static' - APP_DATA = f'{DATA}/app_data' - RAW_DATA = f'{DATA}/raw_data' - - ANNOTATIONS = f'{APP_DATA}/annotations' - ALIGNMENTS = f'{APP_DATA}/alignments' - OGI_MAPPING = f'{APP_DATA}/ogi_mapping' - GENE_DESCRIPTIONS = f'{APP_DATA}/gene_descriptions' - GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping' - TEXT_MINING = f'{APP_DATA}/text_mining' - QTARO = f'{APP_DATA}/qtaro' - - GENOMES_NIPPONBARE = f'{APP_DATA}/genomes/Nipponbare' - ANNOTATIONS_NB = f'{ANNOTATIONS}/Nb' - OPEN_CHROMATIN = f'{APP_DATA}/open_chromatin' - OPEN_CHROMATIN_PANICLE = f'{OPEN_CHROMATIN}/panicle' - QTARO_DICTIONARY = f'{QTARO}/qtaro.pickle' - - NETWORKS = f'{APP_DATA}/networks' - NETWORK_MODULES = f'{APP_DATA}/network_modules' - - TEMP = f'{DATA}/temp' - IMPLICATED_GENES = f'{TEMP}/implicated_genes' - TEMP_IGV = 'igv' - TEMP_COEXPRESSION = 'co_expression' - TEMP_TEXT_MINING = 'text_mining' - - TEMP_TFBS = 'tf_enrichment' - TFBS_BEDS = f'{APP_DATA}/tf_enrichment' - PROMOTER_BED = 'query_promoter_intervals' - GENOME_WIDE_BED = 'query_genomic_intervals' - TFBS_ANNOTATION = f'{TFBS_BEDS}/annotation' - - DATA_PREPARATION_SCRIPTS = 'prepare_data/workflow/scripts' - ENRICHMENT_ANALYSIS_SCRIPTS = f'{DATA_PREPARATION_SCRIPTS}/enrichment_analysis' - - ENRICHMENT_ANALYSIS = f'{APP_DATA}/enrichment_analysis' - ENRICHMENT_ANALYSIS_MAPPING = 'mapping' - ENRICHMENT_ANALYSIS_MODULES = 'modules' - - KEGG_DOSA_GENESET = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-geneset.pickle' - KEGG_DOSA_PATHWAY_NAMES = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-pathway-names.tsv' - - TEXT_MINING_ANNOTATED_ABSTRACTS = f'{TEXT_MINING}/annotated_abstracts.tsv' - TEXT_MINING_PUBMED = f'{TEXT_MINING}/pubmed_per_gene' - - P_VALUE_CUTOFF = 0.05 - - # ========= - # Database - # ========= - - FILE_STATUS_DB = f'{TEMP}/file_status.db' - FILE_STATUS_TABLE = 'file_status' - - def __init__(self): - pass +class Constants(object): + LIFT_OVER = 'lift-over' + COEXPRESSION = 'co-expression' + TFBS = 'tf-enrichment' + IGV = 'browse-loci' + TEXT_MINING = 'text-mining' + + DATA = 'static' + APP_DATA = f'{DATA}/app_data' + RAW_DATA = f'{DATA}/raw_data' + + ANNOTATIONS = f'{APP_DATA}/annotations' + ALIGNMENTS = 
f'{APP_DATA}/alignments' + OGI_MAPPING = f'{APP_DATA}/ogi_mapping' + GENE_DESCRIPTIONS = f'{APP_DATA}/gene_descriptions' + GENE_ID_MAPPING = f'{APP_DATA}/gene_id_mapping' + TEXT_MINING = f'{APP_DATA}/text_mining' + QTARO = f'{APP_DATA}/qtaro' + + GENOMES_NIPPONBARE = f'{APP_DATA}/genomes/Nipponbare' + ANNOTATIONS_NB = f'{ANNOTATIONS}/Nb' + OPEN_CHROMATIN = f'{APP_DATA}/open_chromatin' + OPEN_CHROMATIN_PANICLE = f'{OPEN_CHROMATIN}/panicle' + QTARO_DICTIONARY = f'{QTARO}/qtaro.pickle' + + NETWORKS = f'{APP_DATA}/networks' + NETWORK_MODULES = f'{APP_DATA}/network_modules' + + TEMP = f'{DATA}/temp' + IMPLICATED_GENES = f'{TEMP}/implicated_genes' + TEMP_IGV = 'igv' + TEMP_COEXPRESSION = 'co_expression' + TEMP_TEXT_MINING = 'text_mining' + + TEMP_TFBS = 'tf_enrichment' + TFBS_BEDS = f'{APP_DATA}/tf_enrichment' + PROMOTER_BED = 'query_promoter_intervals' + GENOME_WIDE_BED = 'query_genomic_intervals' + TFBS_ANNOTATION = f'{TFBS_BEDS}/annotation' + + DATA_PREPARATION_SCRIPTS = 'prepare_data/workflow/scripts' + ENRICHMENT_ANALYSIS_SCRIPTS = f'{DATA_PREPARATION_SCRIPTS}/enrichment_analysis' + + ENRICHMENT_ANALYSIS = f'{APP_DATA}/enrichment_analysis' + ENRICHMENT_ANALYSIS_MAPPING = 'mapping' + ENRICHMENT_ANALYSIS_MODULES = 'modules' + + KEGG_DOSA_GENESET = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-geneset.pickle' + KEGG_DOSA_PATHWAY_NAMES = f'{ENRICHMENT_ANALYSIS_MAPPING}/kegg-dosa-pathway-names.tsv' + + TEXT_MINING_ANNOTATED_ABSTRACTS = f'{TEXT_MINING}/annotated_abstracts.tsv' + TEXT_MINING_PUBMED = f'{TEXT_MINING}/pubmed_per_gene' + + P_VALUE_CUTOFF = 0.05 + + # ========= + # Database + # ========= + + FILE_STATUS_DB = f'{TEMP}/file_status.db' + FILE_STATUS_TABLE = 'file_status' + + def __init__(self): + pass diff --git a/callbacks/file_util.py b/callbacks/file_util.py index 03834e14..00f76c0a 100644 --- a/callbacks/file_util.py +++ b/callbacks/file_util.py @@ -1,119 +1,119 @@ -import regex as re -import os -from .constants import Constants - -import time -import sqlite3 - -const = Constants() - - -def path_exists(path): - """ - Checks if given path exists - - Parameters: - - path: Path to be checked if it exists - - Returns: - - True if the path exists; False, otherwise - """ - return os.path.exists(path) - - -def make_dir(directory): - """ - Creates given directory if it does not yet exist - - Parameters: - - directory: Directory to be created - """ - if not path_exists(directory): - os.makedirs(directory) - - -def convert_text_to_path(text): - """ - Converts given text into a well-formed path - - Parameters: - - text: Text to be converted into a well-formed path - - Returns: - - Well-formed path - """ - return text.strip().replace( - ":", "_").replace(";", "__").replace("-", "_").replace('.', '_').replace(' ', '') - - -def get_path_to_temp(genomic_interval, analysis_type, *args): - """ - Forms the path to temporary (file-cached) results of given post-GWAS analysis - This function returns only the path name. 
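# For concreteness, a sketch of the resulting path for a typical interval,
# assuming shorten_name maps the sanitized interval name to rowid 7 in the
# file status database:
#
#     convert_text_to_path('Chr01:100000-200000')   # 'Chr01_100000_200000'
#     get_path_to_temp('Chr01:100000-200000', 'co_expression', 'OS-CX')
#     # 'static/temp/7/co_expression/OS_CX'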
It does not create the actual file or directory - - Parameters: - - genomic_interval: Genomic interval entered by the user - - analysis_type: Post-GWAS analysis - - args: Subfolder names appended to the path - - Returns: - - Path to temporary (file-cached) results of post-GWAS analysis - """ - genomic_interval_foldername = shorten_name(convert_text_to_path( - genomic_interval)) - - analysis_type = convert_text_to_path(analysis_type) - - temp_dir = f'{const.TEMP}/{genomic_interval_foldername}/{analysis_type}' - for folder in args: - temp_dir += f'/{convert_text_to_path(folder)}' - - temp_dir = re.sub(r'/+', '/', temp_dir) - - return temp_dir - - -def get_path_to_text_mining_temp(analysis_type, *args): - analysis_type = convert_text_to_path(analysis_type) - - temp_dir = f'{const.TEMP}/{analysis_type}' - for folder in args: - temp_dir += f'/{convert_text_to_path(folder)}' - - temp_dir = re.sub(r'/+', '/', temp_dir) - - return temp_dir - -def shorten_name(name): - try: - connection = sqlite3.connect(const.FILE_STATUS_DB) - cursor = connection.cursor() - - query = f'INSERT OR IGNORE INTO {const.FILE_STATUS_TABLE}(name) VALUES("{name}")' - - cursor.execute(query) - connection.commit() - - cursor.close() - connection.close() - except sqlite3.Error as error: - pass - - try: - connection = sqlite3.connect(const.FILE_STATUS_DB) - cursor = connection.cursor() - - query = f'SELECT rowid FROM {const.FILE_STATUS_TABLE} WHERE name = "{name}"' - cursor.execute(query) - row_id = cursor.fetchall()[0][0] - - cursor.close() - connection.close() - except sqlite3.Error as error: - pass - - return row_id - - -def append_timestamp_to_filename(filename): - return f'{filename}.{time.time_ns() // 1000}' +import regex as re +import os +from .constants import Constants + +import time +import sqlite3 + +const = Constants() + + +def path_exists(path): + """ + Checks if given path exists + + Parameters: + - path: Path to be checked if it exists + + Returns: + - True if the path exists; False, otherwise + """ + return os.path.exists(path) + + +def make_dir(directory): + """ + Creates given directory if it does not yet exist + + Parameters: + - directory: Directory to be created + """ + if not path_exists(directory): + os.makedirs(directory) + + +def convert_text_to_path(text): + """ + Converts given text into a well-formed path + + Parameters: + - text: Text to be converted into a well-formed path + + Returns: + - Well-formed path + """ + return text.strip().replace( + ":", "_").replace(";", "__").replace("-", "_").replace('.', '_').replace(' ', '') + + +def get_path_to_temp(genomic_interval, analysis_type, *args): + """ + Forms the path to temporary (file-cached) results of given post-GWAS analysis + This function returns only the path name. 
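# The shorten_name helper below keeps these temporary paths short by interning
# each long name in an SQLite table (INSERT OR IGNORE) and reusing the integer
# rowid as the folder name. A sketch of the round trip, assuming the
# file_status table already exists:
#
#     shorten_name('LOC_Os01g01010;LOC_Os01g01030')   # e.g., 3 on first call
#     shorten_name('LOC_Os01g01010;LOC_Os01g01030')   # 3 again on repeat calls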
It does not create the actual file or directory
+
+ Parameters:
+ - genomic_interval: Genomic interval entered by the user
+ - analysis_type: Post-GWAS analysis
+ - args: Subfolder names appended to the path
+
+ Returns:
+ - Path to temporary (file-cached) results of post-GWAS analysis
+ """
+ genomic_interval_foldername = shorten_name(convert_text_to_path(
+ genomic_interval))
+
+ analysis_type = convert_text_to_path(analysis_type)
+
+ temp_dir = f'{const.TEMP}/{genomic_interval_foldername}/{analysis_type}'
+ for folder in args:
+ temp_dir += f'/{convert_text_to_path(folder)}'
+
+ temp_dir = re.sub(r'/+', '/', temp_dir)
+
+ return temp_dir
+
+
+def get_path_to_text_mining_temp(analysis_type, *args):
+ analysis_type = convert_text_to_path(analysis_type)
+
+ temp_dir = f'{const.TEMP}/{analysis_type}'
+ for folder in args:
+ temp_dir += f'/{convert_text_to_path(folder)}'
+
+ temp_dir = re.sub(r'/+', '/', temp_dir)
+
+ return temp_dir
+
+def shorten_name(name):
+ try:
+ connection = sqlite3.connect(const.FILE_STATUS_DB)
+ cursor = connection.cursor()
+
+ # Parameterized query; inlining the raw name would break on quotes and invite SQL injection
+ query = f'INSERT OR IGNORE INTO {const.FILE_STATUS_TABLE}(name) VALUES(?)'
+
+ cursor.execute(query, (name,))
+ connection.commit()
+
+ cursor.close()
+ connection.close()
+ except sqlite3.Error as error:
+ pass
+
+ try:
+ connection = sqlite3.connect(const.FILE_STATUS_DB)
+ cursor = connection.cursor()
+
+ query = f'SELECT rowid FROM {const.FILE_STATUS_TABLE} WHERE name = ?'
+ cursor.execute(query, (name,))
+ row_id = cursor.fetchall()[0][0]
+
+ cursor.close()
+ connection.close()
+ except sqlite3.Error as error:
+ pass
+
+ return row_id
+
+
+def append_timestamp_to_filename(filename):
+ return f'{filename}.{time.time_ns() // 1000}'
diff --git a/callbacks/general_util.py b/callbacks/general_util.py
index c54b2fc0..c404741d 100644
--- a/callbacks/general_util.py
+++ b/callbacks/general_util.py
@@ -1,56 +1,56 @@
-import pandas as pd
-
-NULL_PLACEHOLDER = '–'
-
-
-def display_in_sci_notation(number):
- """
- Returns given number in scientific notation n * 10^m, where n is rounded to 6 decimal places
-
- Parameters:
- - number: Number whose equivalent in scientific notation is to be returned
-
- Returns:
- - Number in scientific notation
- """
- return '{:.6e}'.format(number)
-
-
-def display_in_fixed_dec_places(number):
- return '{:.6f}'.format(float(number))
-
-
-def display_cols_in_sci_notation(result, numeric_columns):
- for column in numeric_columns:
- result[column] = result[column].apply(display_in_sci_notation)
-
-
-def display_cols_in_fixed_dec_places(result, numeric_columns):
- for column in numeric_columns:
- result[column] = result[column].apply(display_in_fixed_dec_places)
-
-
-def create_empty_df_with_cols(cols):
- cols_dict = {}
- for col in cols:
- cols_dict[col] = [NULL_PLACEHOLDER]
-
- return pd.DataFrame(cols_dict)
-
-
-def get_tab_index(tab_id):
- return int(tab_id.split('-')[1])
-
-
-def get_num_unique_entries(table, column):
- if table[column].iloc[0] == NULL_PLACEHOLDER:
- return 0
-
- return table[column].nunique()
-
-
-def get_num_entries(table, column):
- if table[column].iloc[0] == NULL_PLACEHOLDER:
- return 0
-
- return table[column].count()
+import pandas as pd
+
+NULL_PLACEHOLDER = '–'
+
+
+def display_in_sci_notation(number):
+ """
+ Returns given number in scientific notation n * 10^m, where n is rounded to 6 decimal places
+
+ Parameters:
+ - number: Number whose equivalent in scientific notation is to be returned
+
+ Returns:
+ - Number in scientific notation
+ """
+ return '{:.6e}'.format(number)
+
+
+def
display_in_fixed_dec_places(number): + return '{:.6f}'.format(float(number)) + + +def display_cols_in_sci_notation(result, numeric_columns): + for column in numeric_columns: + result[column] = result[column].apply(display_in_sci_notation) + + +def display_cols_in_fixed_dec_places(result, numeric_columns): + for column in numeric_columns: + result[column] = result[column].apply(display_in_fixed_dec_places) + + +def create_empty_df_with_cols(cols): + cols_dict = {} + for col in cols: + cols_dict[col] = [NULL_PLACEHOLDER] + + return pd.DataFrame(cols_dict) + + +def get_tab_index(tab_id): + return int(tab_id.split('-')[1]) + + +def get_num_unique_entries(table, column): + if table[column].iloc[0] == NULL_PLACEHOLDER: + return 0 + + return table[column].nunique() + + +def get_num_entries(table, column): + if table[column].iloc[0] == NULL_PLACEHOLDER: + return 0 + + return table[column].count() diff --git a/callbacks/homepage/callbacks.py b/callbacks/homepage/callbacks.py index 8ac2e9da..40d8d079 100644 --- a/callbacks/homepage/callbacks.py +++ b/callbacks/homepage/callbacks.py @@ -1,184 +1,184 @@ - -from dash import Input, Output, State, html, ctx, ALL -from dash.exceptions import PreventUpdate -from .util import * -from ..lift_over import util as lift_over_util -from ..browse_loci import util as browse_loci_util -from ..constants import Constants - -from ..style_util import * - -const = Constants() - - -def init_callback(app): - - @app.callback( - Output({'type': 'analysis-nav', 'label': ALL}, 'className'), - Output({'type': 'analysis-layout', 'label': ALL}, 'hidden'), - State({'type': 'analysis-nav', 'label': ALL}, 'className'), - State({'type': 'analysis-nav', 'label': ALL}, 'id'), - State({'type': 'analysis-layout', 'label': ALL}, 'id'), - Input('current-analysis-page-nav', 'data'), - Input('homepage-submit', 'n_clicks'), - State({'type': 'analysis-layout', 'label': ALL}, 'hidden'), - ) - def display_specific_analysis_page(nav_className, analysis_nav_id, analysis_layout_id, current_page, *_): - if current_page: - update_nav_class_name = [] - update_layout_hidden = [] - - for i in range(len(analysis_nav_id)): - if analysis_nav_id[i]['label'] == current_page: - nav_classes = add_class_name('active', nav_className[i]) - else: - nav_classes = remove_class_name('active', nav_className[i]) - - update_nav_class_name.append(nav_classes) - - for i in range(len(analysis_layout_id)): - if analysis_layout_id[i]['label'] == current_page: - hide_layout = False - else: - hide_layout = True - - update_layout_hidden.append(hide_layout) - - return update_nav_class_name, update_layout_hidden - - raise PreventUpdate - - @app.callback( - Output('session-container', 'children'), - Output('input-error', 'children'), - Output('input-error', 'style'), - Output('homepage-is-submitted', 'data'), - Output('homepage-genomic-intervals-submitted-input', 'data'), - - State('homepage-genomic-intervals', 'value'), - - Input('homepage-submit', 'n_clicks'), - Input('homepage-genomic-intervals', 'n_submit'), - State('session-container', 'children'), - - Input('homepage-reset', 'n_clicks'), - Input('homepage-clear-cache', 'n_clicks'), - - prevent_initial_call=True - ) - def parse_input(nb_intervals_str, n_clicks, n_submit, dccStore_children, *_): - if 'homepage-clear-cache' == ctx.triggered_id: - clear_cache_folder() - - if 'homepage-reset' == ctx.triggered_id: - # clear data for items in dcc.Store found in session-container - dccStore_children = get_cleared_dccStore_data_excluding_some_data(dccStore_children) - - return 
dccStore_children, None, {'display': 'none'}, False, '' - - if n_submit >= 1 or ('homepage-submit' == ctx.triggered_id and n_clicks >= 1): - if nb_intervals_str: - intervals = lift_over_util.get_genomic_intervals_from_input( - nb_intervals_str) - - if lift_over_util.is_error(intervals): - return dccStore_children, [f'Error encountered while parsing genomic interval {intervals[1]}', html.Br(), lift_over_util.get_error_message(intervals[0])], \ - {'display': 'block'}, False, nb_intervals_str - else: - # clear data for items in dcc.Store found in session-container - dccStore_children = get_cleared_dccStore_data_excluding_some_data( - dccStore_children, 'homepage-genomic-intervals-saved-input') - - browse_loci_util.write_igv_tracks_to_file(nb_intervals_str) - - return dccStore_children, None, {'display': 'none'}, True, nb_intervals_str - else: - return dccStore_children, [f'Error: Input for genomic interval should not be empty.'], \ - {'display': 'block'}, False, nb_intervals_str - - raise PreventUpdate - - @app.callback( - Output('lift-over-nb-table', 'data'), - Output('lift-over-nb-entire-table', 'data'), - Input('homepage-genomic-intervals-submitted-input', 'data'), - State('homepage-is-submitted', 'data') - ) - def get_nipponbare_gene_ids(nb_intervals_str, homepage_is_submitted): - if homepage_is_submitted: - if nb_intervals_str: - nb_intervals = lift_over_util.get_genomic_intervals_from_input( - nb_intervals_str) - - if not lift_over_util.is_error(nb_intervals): - genes_from_Nb = lift_over_util.get_genes_in_Nb( - nb_intervals) - - return genes_from_Nb[1], genes_from_Nb[0].to_dict('records') - - raise PreventUpdate - - @app.callback( - Output('homepage-genomic-intervals-saved-input', - 'data', allow_duplicate=True), - Input({'type': 'example-genomic-interval', - 'description': ALL}, 'n_clicks'), - prevent_initial_call=True - ) - def set_input_fields_with_preset_input(example_genomic_interval_n_clicks): - if ctx.triggered_id and not all(val == 0 for val in example_genomic_interval_n_clicks): - return get_example_genomic_interval(ctx.triggered_id['description']) - - raise PreventUpdate - - - @app.callback( - Output('homepage-genomic-intervals-saved-input', - 'data', allow_duplicate=True), - Input('homepage-genomic-intervals', 'value'), - prevent_initial_call=True - ) - def set_input_fields(genomic_intervals): - return genomic_intervals - - - @app.callback( - Output('homepage-results-container', 'style'), - Input('homepage-is-submitted', 'data'), - Input('homepage-submit', 'n_clicks'), - ) - def display_homepage_output(homepage_is_submitted, *_): - if homepage_is_submitted: - return {'display': 'block'} - - else: - return {'display': 'none'} - - @app.callback( - Output('current-analysis-page-nav', 'data'), - Input({'type': 'analysis-nav', 'label': ALL}, 'n_clicks') - ) - def set_input_homepage_session_state(analysis_nav_items_n_clicks): - if ctx.triggered_id: - if not all(val == 0 for val in analysis_nav_items_n_clicks): - analysis_page_id = ctx.triggered_id.label - return analysis_page_id - - raise PreventUpdate - - @app.callback( - Output('homepage-genomic-intervals', 'value'), - Input('homepage-genomic-intervals-saved-input', 'data'), - ) - def get_input_homepage_session_state(genomic_intervals): - return genomic_intervals - - @app.callback( - Output('genomic-interval-modal', 'is_open'), - Input('genomic-interval-tooltip', 'n_clicks') - ) - def open_modals(tooltip_n_clicks): - if tooltip_n_clicks > 0: - return True - - raise PreventUpdate + +from dash import Input, Output, State, html, 
ctx, ALL +from dash.exceptions import PreventUpdate +from .util import * +from ..lift_over import util as lift_over_util +from ..browse_loci import util as browse_loci_util +from ..constants import Constants + +from ..style_util import * + +const = Constants() + + +def init_callback(app): + + @app.callback( + Output({'type': 'analysis-nav', 'label': ALL}, 'className'), + Output({'type': 'analysis-layout', 'label': ALL}, 'hidden'), + State({'type': 'analysis-nav', 'label': ALL}, 'className'), + State({'type': 'analysis-nav', 'label': ALL}, 'id'), + State({'type': 'analysis-layout', 'label': ALL}, 'id'), + Input('current-analysis-page-nav', 'data'), + Input('homepage-submit', 'n_clicks'), + State({'type': 'analysis-layout', 'label': ALL}, 'hidden'), + ) + def display_specific_analysis_page(nav_className, analysis_nav_id, analysis_layout_id, current_page, *_): + if current_page: + update_nav_class_name = [] + update_layout_hidden = [] + + for i in range(len(analysis_nav_id)): + if analysis_nav_id[i]['label'] == current_page: + nav_classes = add_class_name('active', nav_className[i]) + else: + nav_classes = remove_class_name('active', nav_className[i]) + + update_nav_class_name.append(nav_classes) + + for i in range(len(analysis_layout_id)): + if analysis_layout_id[i]['label'] == current_page: + hide_layout = False + else: + hide_layout = True + + update_layout_hidden.append(hide_layout) + + return update_nav_class_name, update_layout_hidden + + raise PreventUpdate + + @app.callback( + Output('session-container', 'children'), + Output('input-error', 'children'), + Output('input-error', 'style'), + Output('homepage-is-submitted', 'data'), + Output('homepage-genomic-intervals-submitted-input', 'data'), + + State('homepage-genomic-intervals', 'value'), + + Input('homepage-submit', 'n_clicks'), + Input('homepage-genomic-intervals', 'n_submit'), + State('session-container', 'children'), + + Input('homepage-reset', 'n_clicks'), + Input('homepage-clear-cache', 'n_clicks'), + + prevent_initial_call=True + ) + def parse_input(nb_intervals_str, n_clicks, n_submit, dccStore_children, *_): + if 'homepage-clear-cache' == ctx.triggered_id: + clear_cache_folder() + + if 'homepage-reset' == ctx.triggered_id: + # clear data for items in dcc.Store found in session-container + dccStore_children = get_cleared_dccStore_data_excluding_some_data(dccStore_children) + + return dccStore_children, None, {'display': 'none'}, False, '' + + if n_submit >= 1 or ('homepage-submit' == ctx.triggered_id and n_clicks >= 1): + if nb_intervals_str: + intervals = lift_over_util.get_genomic_intervals_from_input( + nb_intervals_str) + + if lift_over_util.is_error(intervals): + return dccStore_children, [f'Error encountered while parsing genomic interval {intervals[1]}', html.Br(), lift_over_util.get_error_message(intervals[0])], \ + {'display': 'block'}, False, nb_intervals_str + else: + # clear data for items in dcc.Store found in session-container + dccStore_children = get_cleared_dccStore_data_excluding_some_data( + dccStore_children, 'homepage-genomic-intervals-saved-input') + + browse_loci_util.write_igv_tracks_to_file(nb_intervals_str) + + return dccStore_children, None, {'display': 'none'}, True, nb_intervals_str + else: + return dccStore_children, [f'Error: Input for genomic interval should not be empty.'], \ + {'display': 'block'}, False, nb_intervals_str + + raise PreventUpdate + + @app.callback( + Output('lift-over-nb-table', 'data'), + Output('lift-over-nb-entire-table', 'data'), + 
Input('homepage-genomic-intervals-submitted-input', 'data'), + State('homepage-is-submitted', 'data') + ) + def get_nipponbare_gene_ids(nb_intervals_str, homepage_is_submitted): + if homepage_is_submitted: + if nb_intervals_str: + nb_intervals = lift_over_util.get_genomic_intervals_from_input( + nb_intervals_str) + + if not lift_over_util.is_error(nb_intervals): + genes_from_Nb = lift_over_util.get_genes_in_Nb( + nb_intervals) + + return genes_from_Nb[1], genes_from_Nb[0].to_dict('records') + + raise PreventUpdate + + @app.callback( + Output('homepage-genomic-intervals-saved-input', + 'data', allow_duplicate=True), + Input({'type': 'example-genomic-interval', + 'description': ALL}, 'n_clicks'), + prevent_initial_call=True + ) + def set_input_fields_with_preset_input(example_genomic_interval_n_clicks): + if ctx.triggered_id and not all(val == 0 for val in example_genomic_interval_n_clicks): + return get_example_genomic_interval(ctx.triggered_id['description']) + + raise PreventUpdate + + + @app.callback( + Output('homepage-genomic-intervals-saved-input', + 'data', allow_duplicate=True), + Input('homepage-genomic-intervals', 'value'), + prevent_initial_call=True + ) + def set_input_fields(genomic_intervals): + return genomic_intervals + + + @app.callback( + Output('homepage-results-container', 'style'), + Input('homepage-is-submitted', 'data'), + Input('homepage-submit', 'n_clicks'), + ) + def display_homepage_output(homepage_is_submitted, *_): + if homepage_is_submitted: + return {'display': 'block'} + + else: + return {'display': 'none'} + + @app.callback( + Output('current-analysis-page-nav', 'data'), + Input({'type': 'analysis-nav', 'label': ALL}, 'n_clicks') + ) + def set_input_homepage_session_state(analysis_nav_items_n_clicks): + if ctx.triggered_id: + if not all(val == 0 for val in analysis_nav_items_n_clicks): + analysis_page_id = ctx.triggered_id.label + return analysis_page_id + + raise PreventUpdate + + @app.callback( + Output('homepage-genomic-intervals', 'value'), + Input('homepage-genomic-intervals-saved-input', 'data'), + ) + def get_input_homepage_session_state(genomic_intervals): + return genomic_intervals + + @app.callback( + Output('genomic-interval-modal', 'is_open'), + Input('genomic-interval-tooltip', 'n_clicks') + ) + def open_modals(tooltip_n_clicks): + if tooltip_n_clicks > 0: + return True + + raise PreventUpdate diff --git a/callbacks/homepage/util.py b/callbacks/homepage/util.py index eceac5b0..6fa3818e 100644 --- a/callbacks/homepage/util.py +++ b/callbacks/homepage/util.py @@ -1,77 +1,86 @@ -import os -import shutil -from ..style_util import * -from ..constants import Constants -from ..file_util import * - -import sqlite3 - -const = Constants() - -example_genomic_intervals = { - 'pre-harvest': 'Chr01:1523625-1770814;Chr04:4662701-4670717', - 'anaerobic-germination': 'Chr07:6000000-6900000'} - - -def clear_cache_folder(): - if os.path.exists(const.TEMP): - shutil.rmtree(const.TEMP, ignore_errors=True) - - # Drop the table - try: - connection = sqlite3.connect(const.FILE_STATUS_DB) - cursor = connection.cursor() - - query = f'DROP TABLE {const.FILE_STATUS_TABLE}' - - cursor.execute(query) - connection.commit() - - cursor.close() - connection.close() - except: - pass - - # Recreate the database - make_dir(const.TEMP) - - try: - connection = sqlite3.connect(const.FILE_STATUS_DB) - cursor = connection.cursor() - - query = f'CREATE TABLE IF NOT EXISTS {const.FILE_STATUS_TABLE} (name TEXT, UNIQUE(name));' - - cursor.execute(query) - connection.commit() - - 
cursor.close() - connection.close() - except sqlite3.Error as error: - pass - - -def get_cleared_dccStore_data_excluding_some_data(dccStore_children, *arg): - for i in range(len(dccStore_children)): - dccStore_ID = dccStore_children[i]['props']['id'] - - if not dccStore_ID in arg: - dccStore_children[i]['props']['data'] = '' - - return dccStore_children - - -def get_example_genomic_interval(description): - return example_genomic_intervals[description] - - -def set_active_class(display_map, active_class): - class_names = [] - for page, layout_link in display_map.items(): - if page == active_class: - class_name = add_class_name('active', layout_link.link_class) - else: - class_name = remove_class_name('active', layout_link.link_class) - - class_names.append(class_name) - - return tuple(class_names) +import os +import shutil +from ..style_util import * +from ..constants import Constants +from ..file_util import * + +import sqlite3 + +const = Constants() + +example_genomic_intervals = { + 'pre-harvest': 'Chr01:1523625-1770814;Chr04:4662701-4670717', + 'anaerobic-germination': 'Chr07:6000000-6900000'} + + +def clear_cache_folder(): + if os.path.exists(const.TEMP): + shutil.rmtree(const.TEMP, ignore_errors=True) + + # Drop the table + try: + connection = sqlite3.connect(const.FILE_STATUS_DB) + cursor = connection.cursor() + + query = f'DROP TABLE {const.FILE_STATUS_TABLE}' + + cursor.execute(query) + connection.commit() + + cursor.close() + connection.close() + except: + pass + + # Recreate the database + make_dir(const.TEMP) + + try: + connection = sqlite3.connect(const.FILE_STATUS_DB) + cursor = connection.cursor() + + query = f'CREATE TABLE IF NOT EXISTS {const.FILE_STATUS_TABLE} (name TEXT, UNIQUE(name));' + + cursor.execute(query) + connection.commit() + + cursor.close() + connection.close() + except sqlite3.Error as error: + pass + + +def get_cleared_dccStore_data_excluding_some_data(dccStore_children, *args): + for i in range(len(dccStore_children)): + dccStore_ID = dccStore_children[i]['props']['id'] + + if args: + flag = False + for arg in args: + if arg in dccStore_ID: + flag = True + + if not flag: + dccStore_children[i]['props']['data'] = '' + + else: + dccStore_children[i]['props']['data'] = '' + + return dccStore_children + + +def get_example_genomic_interval(description): + return example_genomic_intervals[description] + + +def set_active_class(display_map, active_class): + class_names = [] + for page, layout_link in display_map.items(): + if page == active_class: + class_name = add_class_name('active', layout_link.link_class) + else: + class_name = remove_class_name('active', layout_link.link_class) + + class_names.append(class_name) + + return tuple(class_names) diff --git a/callbacks/lift_over/callbacks.py b/callbacks/lift_over/callbacks.py index dad5627a..354ad730 100644 --- a/callbacks/lift_over/callbacks.py +++ b/callbacks/lift_over/callbacks.py @@ -1,425 +1,425 @@ -from dash import Input, Output, State, dcc, html -from dash.exceptions import PreventUpdate - -from .util import * -from ..constants import Constants -from ..general_util import * -const = Constants() - - -def init_callback(app): - @app.callback( - Output('lift-over-genomic-intervals-input', 'children'), - State('homepage-genomic-intervals-submitted-input', 'data'), - Input('homepage-is-submitted', 'data'), - Input('lift-over-submit', 'n_clicks') - ) - def display_input(nb_intervals_str, homepage_is_submitted, *_): - if homepage_is_submitted: - if nb_intervals_str and not is_error( - 
get_genomic_intervals_from_input(nb_intervals_str)): - return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] - else: - return None - - raise PreventUpdate - - @app.callback( - Output('lift-over-is-submitted', 'data', allow_duplicate=True), - Output('lift-over-other-refs-submitted-input', - 'data', allow_duplicate=True), - Output('lift-over-active-tab', 'data', allow_duplicate=True), - Output('lift-over-active-filter', 'data', allow_duplicate=True), - Input('lift-over-submit', 'n_clicks'), - State('homepage-is-submitted', 'data'), - State('lift-over-other-refs', 'value'), - prevent_initial_call=True - ) - def submit_lift_over_input(lift_over_submit_n_clicks, homepage_is_submitted, other_refs): - if homepage_is_submitted and lift_over_submit_n_clicks >= 1: - other_refs = sanitize_other_refs(other_refs) - - return True, other_refs, None, None - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-container', 'style'), - Input('lift-over-is-submitted', 'data'), - ) - def display_lift_over_output(lift_over_is_submitted): - if lift_over_is_submitted: - return {'display': 'block'} - - else: - return {'display': 'none'} - - @app.callback( - Output('lift-over-results-intro', 'children'), - Output('lift-over-results-tabs', 'children'), - - Output('lift-over-overlap-table-filter', 'options'), - Output('lift-over-overlap-table-filter', 'value'), - - State('homepage-genomic-intervals-submitted-input', 'data'), - Input('lift-over-other-refs-submitted-input', 'data'), - - State('homepage-is-submitted', 'data'), - - State('lift-over-active-filter', 'data'), - State('lift-over-is-submitted', 'data') - ) - def display_gene_tabs(nb_intervals_str, other_refs, homepage_is_submitted, active_filter, lift_over_is_submitted): - if homepage_is_submitted and lift_over_is_submitted: - if nb_intervals_str and not is_error(get_genomic_intervals_from_input(nb_intervals_str)): - tabs = get_tabs() - - other_refs = sanitize_other_refs(other_refs) - - if other_refs: - tabs = tabs + other_refs - - tabs_children = [dcc.Tab(label=tab, value=tab) if idx < len(get_tabs()) - else dcc.Tab(label=f'Unique to {tab}', value=tab) - for idx, tab in enumerate(tabs)] - - if not active_filter: - active_filter = tabs[len(get_tabs()) - 1:] - - gene_list_msg = [html.Span( - 'The tabs below show the implicated genes in '), html.B('Nipponbare')] - - if other_refs: - other_refs_str = other_refs[0] - if len(other_refs) == 2: - other_refs_str += f' and {other_refs[1]}' - elif len(other_refs) > 2: - for idx, other_ref in enumerate(other_refs[1:]): - if idx != len(other_refs) - 2: - other_refs_str += f', ' - else: - other_refs_str += f', and ' - - other_refs_str += f'{other_ref} ({other_ref_genomes[other_ref]})' - - gene_list_msg += [html.Span(' and in orthologous regions of '), - html.B(other_refs_str)] - - gene_list_msg += [html.Span(':')] - - return gene_list_msg, tabs_children, tabs[len(get_tabs()) - 1:], active_filter - else: - return None, None, [], None - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-tabs', 'active_tab'), - State('homepage-is-submitted', 'data'), - State('lift-over-active-tab', 'data'), - State('lift-over-is-submitted', 'data'), - Input('lift-over-other-refs-submitted-input', 'data') - ) - def display_active_tab(homepage_is_submitted, saved_active_tab, lift_over_is_submitted, *_): - if homepage_is_submitted and lift_over_is_submitted: - if not saved_active_tab: - return 'tab-0' - - return saved_active_tab - - raise PreventUpdate - - @app.callback( - 
Output('lift-over-other-refs-saved-input', - 'data', allow_duplicate=True), - Input('lift-over-other-refs', 'value'), - State('homepage-is-submitted', 'data'), - prevent_initial_call=True - ) - def set_input_lift_over_session_state(other_refs, homepage_is_submitted): - if homepage_is_submitted: - return other_refs - - raise PreventUpdate - - @app.callback( - Output('lift-over-active-tab', 'data', allow_duplicate=True), - Output('lift-over-active-filter', 'data', allow_duplicate=True), - - Input('lift-over-results-tabs', 'active_tab'), - Input('lift-over-overlap-table-filter', 'value'), - - State('homepage-is-submitted', 'data'), - State('lift-over-is-submitted', 'data'), - prevent_initial_call=True, - ) - def get_submitted_lift_over_session_state(active_tab, filter_rice_variants, homepage_is_submitted, lift_over_is_submitted): - if homepage_is_submitted and lift_over_is_submitted: - return active_tab, filter_rice_variants - - raise PreventUpdate - - @app.callback( - Output('lift-over-other-refs', 'value'), - State('lift-over-other-refs', 'multi'), - State('homepage-is-submitted', 'data'), - State('lift-over-other-refs-saved-input', 'data'), - Input('homepage-genomic-intervals-submitted-input', 'data'), - Input('lift-over-submit', 'n_clicks') - ) - def get_input_lift_over_session_state(is_multi_other_refs, homepage_is_submitted, other_refs, *_): - if homepage_is_submitted: - if not is_multi_other_refs and other_refs: - other_refs = other_refs[0] - - return other_refs - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-gene-intro', 'children'), - Output('lift-over-overlap-table-filter', 'style'), - - Input('lift-over-results-tabs', 'active_tab'), - State('lift-over-results-tabs', 'children'), - State('homepage-is-submitted', 'data'), - State('lift-over-is-submitted', 'data') - ) - def display_gene_intro(active_tab, children, homepage_is_submitted, lift_over_is_submitted): - if homepage_is_submitted and lift_over_is_submitted: - if active_tab == get_tab_id('All Genes'): - return 'The table below lists all the implicated genes.', {'display': 'none'} - - elif active_tab == get_tab_id('Common Genes'): - return 'The table below lists the implicated genes that are common to:', {'display': 'block'} - - elif active_tab == get_tab_id('Nipponbare'): - return 'The table below lists the genes overlapping the site in the Nipponbare reference.', {'display': 'none'} - - else: - tab_number = get_tab_index(active_tab) - other_ref = children[tab_number]['props']['value'] - - return f'The table below lists the genes from homologous regions in {other_ref} that are not in Nipponbare.', {'display': 'none'} - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-statistics', 'children'), - Output('lift-over-results-tabs', 'className'), - - Input('homepage-genomic-intervals-submitted-input', 'data'), - Input('lift-over-other-refs-submitted-input', 'data'), - - State('homepage-is-submitted', 'data'), - State('lift-over-is-submitted', 'data') - ) - def display_gene_statistics(nb_intervals_str, other_refs, homepage_is_submitted, lift_over_is_submitted): - if homepage_is_submitted and lift_over_is_submitted: - nb_intervals = get_genomic_intervals_from_input( - nb_intervals_str) - - genes_from_Nb_raw = get_genes_in_Nb(nb_intervals)[0] - - num_unique_genes = get_num_unique_entries( - genes_from_Nb_raw, 'OGI') - if num_unique_genes == 1: - gene_statistics_nb = f'{num_unique_genes} gene was found in Nipponbare' - else: - gene_statistics_nb = f'{num_unique_genes} genes were found in 
Nipponbare' - - for idx, other_ref in enumerate(other_refs): - common_genes_raw = get_common_genes([other_ref], nb_intervals) - num_unique_genes = get_num_unique_entries( - common_genes_raw, 'OGI') - if idx == len(other_refs) - 1: - if num_unique_genes == 1: - gene_statistics_nb += f', and {num_unique_genes} gene in {other_ref}' - else: - gene_statistics_nb += f', and {num_unique_genes} genes in {other_ref}' - else: - if num_unique_genes == 1: - gene_statistics_nb += f', {num_unique_genes} gene in {other_ref}' - else: - gene_statistics_nb += f', {num_unique_genes} genes in {other_ref}' - - gene_statistics_nb += '. ' - gene_statistics_items = [html.Li(gene_statistics_nb)] - - if other_refs: - other_refs.append('Nipponbare') - genes_common = get_common_genes(other_refs, nb_intervals) - num_unique_genes = get_num_unique_entries(genes_common, 'OGI') - - if num_unique_genes == 1: - gene_statistics_common = f'Among these, {num_unique_genes} gene is common to all cultivars.' - else: - gene_statistics_common = f'Among these, {num_unique_genes} genes are common to all cultivars.' - - gene_statistics_items.append( - html.Li(gene_statistics_common)) - - gene_statistics_other_ref = f'' - other_refs.pop() # Remove added Nipponbare - for idx, other_ref in enumerate(other_refs): - genes_from_other_ref_raw = get_unique_genes_in_other_ref( - other_ref, nb_intervals) - - if len(other_refs) > 1 and idx == len(other_refs) - 1: - gene_statistics_other_ref += f', and ' - elif idx != 0: - gene_statistics_other_ref += f', ' - - num_unique_genes = get_num_unique_entries( - genes_from_other_ref_raw, 'OGI') - - if num_unique_genes == 1: - gene_statistics_other_ref += f'{num_unique_genes} gene is unique to {other_ref}' - else: - gene_statistics_other_ref += f'{num_unique_genes} genes are unique to {other_ref}' - - gene_statistics_other_ref += '.' 
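# The loop above assembles a single comma-separated sentence
# ('m genes are unique to X, n genes are unique to Y, and k genes are
# unique to Z.'), choosing the singular or plural wording per count.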
- gene_statistics_items.append( - html.Li(gene_statistics_other_ref)) - - # Setting the class name of lift-over-results-tabs to None is for removing the top margin during loading - return gene_statistics_items, None - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-table', 'columns'), - Output('lift-over-results-table', 'data'), - - Input('homepage-genomic-intervals-submitted-input', 'data'), - Input('lift-over-results-tabs', 'active_tab'), - Input('lift-over-overlap-table-filter', 'value'), - Input('lift-over-other-refs-submitted-input', 'data'), - - State('lift-over-results-tabs', 'children'), - State('homepage-is-submitted', 'data'), - State('lift-over-is-submitted', 'data') - ) - def display_gene_tables(nb_intervals_str, active_tab, filter_rice_variants, other_refs, children, homepage_is_submitted, lift_over_is_submitted): - if homepage_is_submitted and lift_over_is_submitted: - nb_intervals = get_genomic_intervals_from_input( - nb_intervals_str) - - if active_tab == get_tab_id('All Genes'): - all_genes_raw = get_all_genes(other_refs, nb_intervals) - - mask = (all_genes_raw['OGI'] != NULL_PLACEHOLDER) - all_genes_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( - all_genes_raw, 'OGI') - if 'Nipponbare' in all_genes_raw.columns: - mask = (all_genes_raw['Nipponbare'] != NULL_PLACEHOLDER) - all_genes_raw.loc[mask, 'Nipponbare'] = get_rgi_genecard_link( - all_genes_raw, 'Nipponbare') - - for cultivar in other_ref_genomes: - if cultivar in all_genes_raw.columns: - mask = (all_genes_raw[cultivar] != NULL_PLACEHOLDER) - all_genes_raw.loc[mask, cultivar] = get_rgi_genecard_link( - all_genes_raw, cultivar) - - all_genes = all_genes_raw.to_dict('records') - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in all_genes_raw.columns] - - return columns, all_genes - - elif active_tab == get_tab_id('Common Genes'): - common_genes_raw = get_common_genes( - filter_rice_variants, nb_intervals) - - # Mask will be triggered if no cultivar is selected - mask = (common_genes_raw['OGI'] != NULL_PLACEHOLDER) - common_genes_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( - common_genes_raw, 'OGI') - - if 'Nipponbare' in common_genes_raw.columns: - mask = (common_genes_raw['Nipponbare'] != NULL_PLACEHOLDER) - common_genes_raw.loc[mask, 'Nipponbare'] = get_rgi_genecard_link( - common_genes_raw, 'Nipponbare') - - for cultivar in other_ref_genomes: - if cultivar in common_genes_raw.columns: - mask = (common_genes_raw[cultivar] != NULL_PLACEHOLDER) - common_genes_raw.loc[mask, cultivar] = get_rgi_genecard_link( - common_genes_raw, cultivar) - - common_genes = common_genes_raw.to_dict('records') - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in common_genes_raw.columns] - - return columns, common_genes - - elif active_tab == get_tab_id('Nipponbare'): - genes_from_Nb_raw = get_genes_in_Nb( - nb_intervals)[0].drop( - ['Chromosome', 'Start', 'End', 'Strand'], axis=1) - - mask = (genes_from_Nb_raw['OGI'] != NULL_PLACEHOLDER) - genes_from_Nb_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( - genes_from_Nb_raw, 'OGI') - - mask = (genes_from_Nb_raw['Name'] != NULL_PLACEHOLDER) - genes_from_Nb_raw.loc[mask, 'Name'] = get_rgi_genecard_link( - genes_from_Nb_raw, 'Name') - - genes_from_Nb = genes_from_Nb_raw.to_dict('records') - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in genes_from_Nb_raw.columns] - - return columns, genes_from_Nb - - else: - tab_number = get_tab_index(active_tab) - other_ref = children[tab_number]['props']['value'] 
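# Mapping the active tab back to a cultivar: Dash tab IDs have the form
# 'tab-<n>', so get_tab_index() (general_util.py) pulls out <n>, which
# indexes the tabs' children; each child's 'value' prop holds the
# reference genome label set in display_gene_tabs().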
- - genes_from_other_ref_raw = get_unique_genes_in_other_ref( - other_ref, nb_intervals) - - mask = (genes_from_other_ref_raw['OGI'] != NULL_PLACEHOLDER) - genes_from_other_ref_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( - genes_from_other_ref_raw, 'OGI') - - mask = (genes_from_other_ref_raw['Name'] != NULL_PLACEHOLDER) - genes_from_other_ref_raw.loc[mask, 'Name'] = get_rgi_genecard_link( - genes_from_other_ref_raw, 'Name') - - genes_from_other_ref = genes_from_other_ref_raw.to_dict( - 'records') - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in genes_from_other_ref_raw.columns] - - return columns, genes_from_other_ref - - raise PreventUpdate - - @app.callback( - Output('lift-over-results-table', 'filter_query'), - - Input('lift-over-reset-table', 'n_clicks'), - Input('lift-over-results-tabs', 'active_tab'), - Input('lift-over-overlap-table-filter', 'value') - ) - def reset_table_filters(*_): - return '' - - @app.callback( - Output('lift-over-download-df-to-csv', 'data'), - Input('lift-over-export-table', 'n_clicks'), - State('lift-over-results-table', 'data'), - State('homepage-genomic-intervals-submitted-input', 'data') - ) - def download_lift_over_table_to_csv(download_n_clicks, lift_over_df, genomic_intervals): - if download_n_clicks >= 1: - df = pd.DataFrame(lift_over_df) - return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Gene List and Lift-Over.csv', index=False) - - raise PreventUpdate +from dash import Input, Output, State, dcc, html +from dash.exceptions import PreventUpdate + +from .util import * +from ..constants import Constants +from ..general_util import * +const = Constants() + + +def init_callback(app): + @app.callback( + Output('lift-over-genomic-intervals-input', 'children'), + State('homepage-genomic-intervals-submitted-input', 'data'), + Input('homepage-is-submitted', 'data'), + Input('lift-over-submit', 'n_clicks') + ) + def display_input(nb_intervals_str, homepage_is_submitted, *_): + if homepage_is_submitted: + if nb_intervals_str and not is_error( + get_genomic_intervals_from_input(nb_intervals_str)): + return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] + else: + return None + + raise PreventUpdate + + @app.callback( + Output('lift-over-is-submitted', 'data', allow_duplicate=True), + Output('lift-over-other-refs-submitted-input', + 'data', allow_duplicate=True), + Output('lift-over-active-tab', 'data', allow_duplicate=True), + Output('lift-over-active-filter', 'data', allow_duplicate=True), + Input('lift-over-submit', 'n_clicks'), + State('homepage-is-submitted', 'data'), + State('lift-over-other-refs', 'value'), + prevent_initial_call=True + ) + def submit_lift_over_input(lift_over_submit_n_clicks, homepage_is_submitted, other_refs): + if homepage_is_submitted and lift_over_submit_n_clicks >= 1: + other_refs = sanitize_other_refs(other_refs) + + return True, other_refs, None, None + + raise PreventUpdate + + @app.callback( + Output('lift-over-results-container', 'style'), + Input('lift-over-is-submitted', 'data'), + ) + def display_lift_over_output(lift_over_is_submitted): + if lift_over_is_submitted: + return {'display': 'block'} + + else: + return {'display': 'none'} + + @app.callback( + Output('lift-over-results-intro', 'children'), + Output('lift-over-results-tabs', 'children'), + + Output('lift-over-overlap-table-filter', 'options'), + Output('lift-over-overlap-table-filter', 'value'), + + State('homepage-genomic-intervals-submitted-input', 'data'), + Input('lift-over-other-refs-submitted-input', 
'data'), + + State('homepage-is-submitted', 'data'), + + State('lift-over-active-filter', 'data'), + State('lift-over-is-submitted', 'data') + ) + def display_gene_tabs(nb_intervals_str, other_refs, homepage_is_submitted, active_filter, lift_over_is_submitted): + if homepage_is_submitted and lift_over_is_submitted: + if nb_intervals_str and not is_error(get_genomic_intervals_from_input(nb_intervals_str)): + tabs = get_tabs() + + other_refs = sanitize_other_refs(other_refs) + + if other_refs: + tabs = tabs + other_refs + + tabs_children = [dcc.Tab(label=tab, value=tab) if idx < len(get_tabs()) + else dcc.Tab(label=f'Unique to {tab}', value=tab) + for idx, tab in enumerate(tabs)] + + if not active_filter: + active_filter = tabs[len(get_tabs()) - 1:] + + gene_list_msg = [html.Span( + 'The tabs below show the implicated genes in '), html.B('Nipponbare')] + + if other_refs: + other_refs_str = other_refs[0] + if len(other_refs) == 2: + other_refs_str += f' and {other_refs[1]}' + elif len(other_refs) > 2: + for idx, other_ref in enumerate(other_refs[1:]): + if idx != len(other_refs) - 2: + other_refs_str += f', ' + else: + other_refs_str += f', and ' + + other_refs_str += f'{other_ref} ({other_ref_genomes[other_ref]})' + + gene_list_msg += [html.Span(' and in orthologous regions of '), + html.B(other_refs_str)] + + gene_list_msg += [html.Span(':')] + + return gene_list_msg, tabs_children, tabs[len(get_tabs()) - 1:], active_filter + else: + return None, None, [], None + + raise PreventUpdate + + @app.callback( + Output('lift-over-results-tabs', 'active_tab'), + State('homepage-is-submitted', 'data'), + State('lift-over-active-tab', 'data'), + State('lift-over-is-submitted', 'data'), + Input('lift-over-other-refs-submitted-input', 'data') + ) + def display_active_tab(homepage_is_submitted, saved_active_tab, lift_over_is_submitted, *_): + if homepage_is_submitted and lift_over_is_submitted: + if not saved_active_tab: + return 'tab-0' + + return saved_active_tab + + raise PreventUpdate + + @app.callback( + Output('lift-over-other-refs-saved-input', + 'data', allow_duplicate=True), + Input('lift-over-other-refs', 'value'), + State('homepage-is-submitted', 'data'), + prevent_initial_call=True + ) + def set_input_lift_over_session_state(other_refs, homepage_is_submitted): + if homepage_is_submitted: + return other_refs + + raise PreventUpdate + + @app.callback( + Output('lift-over-active-tab', 'data', allow_duplicate=True), + Output('lift-over-active-filter', 'data', allow_duplicate=True), + + Input('lift-over-results-tabs', 'active_tab'), + Input('lift-over-overlap-table-filter', 'value'), + + State('homepage-is-submitted', 'data'), + State('lift-over-is-submitted', 'data'), + prevent_initial_call=True, + ) + def get_submitted_lift_over_session_state(active_tab, filter_rice_variants, homepage_is_submitted, lift_over_is_submitted): + if homepage_is_submitted and lift_over_is_submitted: + return active_tab, filter_rice_variants + + raise PreventUpdate + + @app.callback( + Output('lift-over-other-refs', 'value'), + State('lift-over-other-refs', 'multi'), + State('homepage-is-submitted', 'data'), + State('lift-over-other-refs-saved-input', 'data'), + Input('homepage-genomic-intervals-submitted-input', 'data'), + Input('lift-over-submit', 'n_clicks') + ) + def get_input_lift_over_session_state(is_multi_other_refs, homepage_is_submitted, other_refs, *_): + if homepage_is_submitted: + if not is_multi_other_refs and other_refs: + other_refs = other_refs[0] + + return other_refs + + raise PreventUpdate + + 
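# A minimal standalone sketch (hypothetical helper, not part of this app) of
# the string-vs-list asymmetry the callbacks around this point deal with:
# dcc.Dropdown yields a bare string when multi=False but a list when
# multi=True, so sanitize_other_refs() in util.py coerces values to a list.
def _normalize_refs(other_refs):
    # Treat None/empty as "no references selected"
    if not other_refs:
        return []
    return [other_refs] if isinstance(other_refs, str) else other_refs

assert _normalize_refs('N22') == ['N22']
assert _normalize_refs(['N22', 'MH63']) == ['N22', 'MH63']
assert _normalize_refs(None) == []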
@app.callback( + Output('lift-over-results-gene-intro', 'children'), + Output('lift-over-overlap-table-filter', 'style'), + + Input('lift-over-results-tabs', 'active_tab'), + State('lift-over-results-tabs', 'children'), + State('homepage-is-submitted', 'data'), + State('lift-over-is-submitted', 'data') + ) + def display_gene_intro(active_tab, children, homepage_is_submitted, lift_over_is_submitted): + if homepage_is_submitted and lift_over_is_submitted: + if active_tab == get_tab_id('All Genes'): + return 'The table below lists all the implicated genes.', {'display': 'none'} + + elif active_tab == get_tab_id('Common Genes'): + return 'The table below lists the implicated genes that are common to:', {'display': 'block'} + + elif active_tab == get_tab_id('Nipponbare'): + return 'The table below lists the genes overlapping the site in the Nipponbare reference.', {'display': 'none'} + + else: + tab_number = get_tab_index(active_tab) + other_ref = children[tab_number]['props']['value'] + + return f'The table below lists the genes from homologous regions in {other_ref} that are not in Nipponbare.', {'display': 'none'} + + raise PreventUpdate + + @app.callback( + Output('lift-over-results-statistics', 'children'), + Output('lift-over-results-tabs', 'className'), + + Input('homepage-genomic-intervals-submitted-input', 'data'), + Input('lift-over-other-refs-submitted-input', 'data'), + + State('homepage-is-submitted', 'data'), + State('lift-over-is-submitted', 'data') + ) + def display_gene_statistics(nb_intervals_str, other_refs, homepage_is_submitted, lift_over_is_submitted): + if homepage_is_submitted and lift_over_is_submitted: + nb_intervals = get_genomic_intervals_from_input( + nb_intervals_str) + + genes_from_Nb_raw = get_genes_in_Nb(nb_intervals)[0] + + num_unique_genes = get_num_unique_entries( + genes_from_Nb_raw, 'OGI') + if num_unique_genes == 1: + gene_statistics_nb = f'{num_unique_genes} gene was found in Nipponbare' + else: + gene_statistics_nb = f'{num_unique_genes} genes were found in Nipponbare' + + for idx, other_ref in enumerate(other_refs): + common_genes_raw = get_common_genes([other_ref], nb_intervals) + num_unique_genes = get_num_unique_entries( + common_genes_raw, 'OGI') + if idx == len(other_refs) - 1: + if num_unique_genes == 1: + gene_statistics_nb += f', and {num_unique_genes} gene in {other_ref}' + else: + gene_statistics_nb += f', and {num_unique_genes} genes in {other_ref}' + else: + if num_unique_genes == 1: + gene_statistics_nb += f', {num_unique_genes} gene in {other_ref}' + else: + gene_statistics_nb += f', {num_unique_genes} genes in {other_ref}' + + gene_statistics_nb += '. ' + gene_statistics_items = [html.Li(gene_statistics_nb)] + + if other_refs: + other_refs.append('Nipponbare') + genes_common = get_common_genes(other_refs, nb_intervals) + num_unique_genes = get_num_unique_entries(genes_common, 'OGI') + + if num_unique_genes == 1: + gene_statistics_common = f'Among these, {num_unique_genes} gene is common to all cultivars.' + else: + gene_statistics_common = f'Among these, {num_unique_genes} genes are common to all cultivars.' 
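# The figure above comes from get_common_genes() in util.py: 'Nipponbare' is
# temporarily appended to other_refs, the per-cultivar gene tables are
# inner-joined on their OGI (orthogroup) IDs, and the number of unique OGIs
# among the surviving rows is reported as common to all cultivars.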
+ + gene_statistics_items.append( + html.Li(gene_statistics_common)) + + gene_statistics_other_ref = f'' + other_refs.pop() # Remove added Nipponbare + for idx, other_ref in enumerate(other_refs): + genes_from_other_ref_raw = get_unique_genes_in_other_ref( + other_ref, nb_intervals) + + if len(other_refs) > 1 and idx == len(other_refs) - 1: + gene_statistics_other_ref += f', and ' + elif idx != 0: + gene_statistics_other_ref += f', ' + + num_unique_genes = get_num_unique_entries( + genes_from_other_ref_raw, 'OGI') + + if num_unique_genes == 1: + gene_statistics_other_ref += f'{num_unique_genes} gene is unique to {other_ref}' + else: + gene_statistics_other_ref += f'{num_unique_genes} genes are unique to {other_ref}' + + gene_statistics_other_ref += '.' + gene_statistics_items.append( + html.Li(gene_statistics_other_ref)) + + # Setting the class name of lift-over-results-tabs to None is for removing the top margin during loading + return gene_statistics_items, None + + raise PreventUpdate + + @app.callback( + Output('lift-over-results-table', 'columns'), + Output('lift-over-results-table', 'data'), + + Input('homepage-genomic-intervals-submitted-input', 'data'), + Input('lift-over-results-tabs', 'active_tab'), + Input('lift-over-overlap-table-filter', 'value'), + Input('lift-over-other-refs-submitted-input', 'data'), + + State('lift-over-results-tabs', 'children'), + State('homepage-is-submitted', 'data'), + State('lift-over-is-submitted', 'data') + ) + def display_gene_tables(nb_intervals_str, active_tab, filter_rice_variants, other_refs, children, homepage_is_submitted, lift_over_is_submitted): + if homepage_is_submitted and lift_over_is_submitted: + nb_intervals = get_genomic_intervals_from_input( + nb_intervals_str) + + if active_tab == get_tab_id('All Genes'): + all_genes_raw = get_all_genes(other_refs, nb_intervals) + + mask = (all_genes_raw['OGI'] != NULL_PLACEHOLDER) + all_genes_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( + all_genes_raw, 'OGI') + if 'Nipponbare' in all_genes_raw.columns: + mask = (all_genes_raw['Nipponbare'] != NULL_PLACEHOLDER) + all_genes_raw.loc[mask, 'Nipponbare'] = get_rgi_genecard_link( + all_genes_raw, 'Nipponbare') + + for cultivar in other_ref_genomes: + if cultivar in all_genes_raw.columns: + mask = (all_genes_raw[cultivar] != NULL_PLACEHOLDER) + all_genes_raw.loc[mask, cultivar] = get_rgi_genecard_link( + all_genes_raw, cultivar) + + all_genes = all_genes_raw.to_dict('records') + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in all_genes_raw.columns] + + return columns, all_genes + + elif active_tab == get_tab_id('Common Genes'): + common_genes_raw = get_common_genes( + filter_rice_variants, nb_intervals) + + # Mask will be triggered if no cultivar is selected + mask = (common_genes_raw['OGI'] != NULL_PLACEHOLDER) + common_genes_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( + common_genes_raw, 'OGI') + + if 'Nipponbare' in common_genes_raw.columns: + mask = (common_genes_raw['Nipponbare'] != NULL_PLACEHOLDER) + common_genes_raw.loc[mask, 'Nipponbare'] = get_rgi_genecard_link( + common_genes_raw, 'Nipponbare') + + for cultivar in other_ref_genomes: + if cultivar in common_genes_raw.columns: + mask = (common_genes_raw[cultivar] != NULL_PLACEHOLDER) + common_genes_raw.loc[mask, cultivar] = get_rgi_genecard_link( + common_genes_raw, cultivar) + + common_genes = common_genes_raw.to_dict('records') + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in common_genes_raw.columns] + + return columns, common_genes 
+ + elif active_tab == get_tab_id('Nipponbare'): + genes_from_Nb_raw = get_genes_in_Nb( + nb_intervals)[0].drop( + ['Chromosome', 'Start', 'End', 'Strand'], axis=1) + + mask = (genes_from_Nb_raw['OGI'] != NULL_PLACEHOLDER) + genes_from_Nb_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( + genes_from_Nb_raw, 'OGI') + + mask = (genes_from_Nb_raw['Name'] != NULL_PLACEHOLDER) + genes_from_Nb_raw.loc[mask, 'Name'] = get_rgi_genecard_link( + genes_from_Nb_raw, 'Name') + + genes_from_Nb = genes_from_Nb_raw.to_dict('records') + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in genes_from_Nb_raw.columns] + + return columns, genes_from_Nb + + else: + tab_number = get_tab_index(active_tab) + other_ref = children[tab_number]['props']['value'] + + genes_from_other_ref_raw = get_unique_genes_in_other_ref( + other_ref, nb_intervals) + + mask = (genes_from_other_ref_raw['OGI'] != NULL_PLACEHOLDER) + genes_from_other_ref_raw.loc[mask, 'OGI'] = get_rgi_orthogroup_link( + genes_from_other_ref_raw, 'OGI') + + mask = (genes_from_other_ref_raw['Name'] != NULL_PLACEHOLDER) + genes_from_other_ref_raw.loc[mask, 'Name'] = get_rgi_genecard_link( + genes_from_other_ref_raw, 'Name') + + genes_from_other_ref = genes_from_other_ref_raw.to_dict( + 'records') + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in genes_from_other_ref_raw.columns] + + return columns, genes_from_other_ref + + raise PreventUpdate + + @app.callback( + Output('lift-over-results-table', 'filter_query'), + + Input('lift-over-reset-table', 'n_clicks'), + Input('lift-over-results-tabs', 'active_tab'), + Input('lift-over-overlap-table-filter', 'value') + ) + def reset_table_filters(*_): + return '' + + @app.callback( + Output('lift-over-download-df-to-csv', 'data'), + Input('lift-over-export-table', 'n_clicks'), + State('lift-over-results-table', 'data'), + State('homepage-genomic-intervals-submitted-input', 'data') + ) + def download_lift_over_table_to_csv(download_n_clicks, lift_over_df, genomic_intervals): + if download_n_clicks >= 1: + df = pd.DataFrame(lift_over_df) + return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Gene List and Lift-Over.csv', index=False) + + raise PreventUpdate diff --git a/callbacks/lift_over/util.py b/callbacks/lift_over/util.py index b2e4252b..7d6cc724 100644 --- a/callbacks/lift_over/util.py +++ b/callbacks/lift_over/util.py @@ -1,692 +1,692 @@ -import pickle -from collections import defaultdict, namedtuple - -import gffutils -import pandas as pd - -from ..constants import Constants -from ..general_util import * -from ..links_util import * - - -const = Constants() -Genomic_interval = namedtuple('Genomic_interval', ['chrom', 'start', 'stop']) - -# Error codes and messages triggered by a malformed genomic interval entered by the user -Error_message = namedtuple('Error_message', ['code', 'message']) -errors = { - 'NO_CHROM_INTERVAL_SEP': Error_message(1, 'A genomic interval should be entered as chrom:start-end. 
Use a semicolon (;) to separate multiple intervals'), - 'NO_START_STOP_SEP': Error_message(2, 'Specify a valid start and end for the genomic interval'), - 'START_STOP_NOT_INT': Error_message(3, 'The start and end of a genomic interval should be integers'), - 'START_GREATER_THAN_STOP': Error_message(4, 'The start of a genomic interval should not be past the end') -} - -other_ref_genomes = {'N22': 'aus Nagina-22', - 'MH63': 'indica Minghui-63', - 'Azu': 'japonica Azucena', - 'ARC': 'basmati ARC', - 'IR64': 'indica IR64', - 'CMeo': 'japonica CHAO MEO'} - -NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', - 'OGI', 'Chromosome', 'Start', 'End', 'Strand', 'QTL Analyses', 'PubMed Article IDs'] -OTHER_REF_COLUMNS = ['OGI', 'Name', 'Chromosome', 'Start', 'End', 'Strand'] -FRONT_FACING_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'OGI'] -NO_REFS_COLUMNS = ['OGI'] - - -def construct_options_other_ref_genomes(): - return [ - {'value': symbol, 'label': f'{symbol} ({name})'} for symbol, name in other_ref_genomes.items()] - - -def create_empty_df_nb(): - """ - Returns an empty data frame if there are no results - - Returns: - - Empty data frame - """ - return create_empty_df_with_cols(NB_COLUMNS) - - -def create_empty_no_refs_df(): - return create_empty_df_with_cols(NO_REFS_COLUMNS) - - -def create_empty_df_other_refs(): - return create_empty_df_with_cols(OTHER_REF_COLUMNS) - - -def create_empty_front_facing_df(): - return create_empty_df_with_cols(FRONT_FACING_COLUMNS) - -# ===================================================== -# Utility functions for parsing input genomic interval -# ===================================================== - - -def is_error(genomic_interval): - """ - Returns True if given genomic interval is malformed; False, otherwise - - This function assumes that genomic_interval is the return value of to_genomic_interval() - - Parameters: - - genomic_interval: If its first element is an integer (i.e., the error code), - then the given genomic interval is malformed - - Returns: - - True if given genomic interval is malformed; False, otherwise - """ - return isinstance(genomic_interval[0], int) - - -def get_error_message(error_code): - """ - Returns the message associated with the error code if the user inputs a malformed genomic interval - - Parameters: - - error_code: Error code triggered by the malformed genomic interval - - Returns: - - Message associated with the given error code - """ - for _, code_message in errors.items(): - if code_message.code == error_code: - return code_message.message - - -def is_one_digit_chromosome(chromosome): - """ - Checks if given chromosome only has a single digit (e.g., Chr1, Chr2) - - Parameters: - - chromosome: Chromosome to be checked - - Returns: - - True if given chromosome only has a single digit; False, otherwise - """ - # Examples: Chr1, Chr2 - return len(chromosome) == len('Chr') + 1 - - -def pad_one_digit_chromosome(chromosome): - """ - Prepends a 0 to the chromosome number if it only has a single digit - For example, if the input is 'Chr1', it returns 'Chr01' - - This function assumes that the given chromosome only has a single digit - - Parameters: - - chromosome: Chromosome to be padded - - Returns: - - Chromosome with a leading 0 prepended - """ - return chromosome[:-1] + '0' + chromosome[-1] - - -def to_genomic_interval(genomic_interval_str): - """ - Converts a genomic interval extracted from the user input into a Genomic_interval tuple - If the genomic interval is malformed, it returns the error code, 
alongside the genomic interval
-
-    Parameters:
-    - genomic_interval_str: Genomic interval extracted from the user input
-
-    Returns:
-    - If the genomic interval is valid: Genomic_interval tuple
-    - Otherwise: Tuple containing the triggered error code and the genomic interval
-    """
-    try:
-        chrom, interval = genomic_interval_str.split(":")
-        if is_one_digit_chromosome(chrom):
-            chrom = pad_one_digit_chromosome(chrom)
-
-    except ValueError:
-        return errors['NO_CHROM_INTERVAL_SEP'].code, genomic_interval_str
-
-    try:
-        start, stop = interval.split("-")
-    except ValueError:
-        return errors['NO_START_STOP_SEP'].code, genomic_interval_str
-
-    try:
-        start = int(start)
-        stop = int(stop)
-    except ValueError:
-        return errors['START_STOP_NOT_INT'].code, genomic_interval_str
-
-    if start > stop:
-        return errors['START_GREATER_THAN_STOP'].code, genomic_interval_str
-
-    return Genomic_interval(chrom, start, stop)
-
-
-def sanitize_nb_intervals_str(nb_intervals_str):
-    """
-    Sanitizes the genomic intervals entered by the user by removing spaces and trailing semicolons
-
-    Parameters:
-    - nb_intervals_str: Genomic intervals entered by the user
-
-    Returns:
-    - Sanitized genomic intervals
-    """
-    nb_intervals_str = nb_intervals_str.replace(' ', '')
-    nb_intervals_str = nb_intervals_str.rstrip(';')
-
-    return nb_intervals_str
-
-
-def get_genomic_intervals_from_input(nb_intervals_str):
-    """
-    Extracts the Genomic_interval tuples from the genomic intervals entered by the user
-
-    Parameters:
-    - nb_intervals_str: Genomic intervals entered by the user
-
-    Returns:
-    - List of Genomic_interval tuples
-    """
-    nb_intervals_str = sanitize_nb_intervals_str(nb_intervals_str)
-    nb_intervals = []
-
-    nb_intervals_split = nb_intervals_str.split(";")
-
-    for interval_str in nb_intervals_split:
-        interval = to_genomic_interval(interval_str)
-
-        # Trap if at least one of the genomic intervals is malformed
-        if is_error(interval):
-            return interval
-        else:
-            nb_intervals.append(interval)
-
-    return nb_intervals
-
-# ============================================================================
-# Utility functions for displaying lift-over results and sanitizing accessions
-# ============================================================================
-
-
-def get_tabs():
-    """
-    Returns the tabs to be displayed in the lift-over results
-    The tabs do not include those that are specific to a reference
-
-    Returns:
-    - Tabs to be displayed in the lift-over results (except those specific to a reference)
-    """
-    return ['All Genes', 'Common Genes', 'Nipponbare']
-
-
-def get_tab_id(tab):
-    """
-    Returns the index of the given tab with respect to the tabs to be displayed in the lift-over results
-
-    Parameters:
-    - tab: Tab whose index is to be returned
-
-    Returns:
-    - Index of the given tab with respect to the tabs to be displayed in the lift-over results
-    """
-    return f'tab-{get_tabs().index(tab)}'
-
-
-def sanitize_other_refs(other_refs):
-    """
-    Returns the references (other than Nipponbare) selected by the user
-
-    This function is needed because, when the user chooses only one reference,
-    the data type of the chosen value is a string (not a list)
-
-    Parameters:
-    - other_refs: References (other than Nipponbare) selected by the user
-
-    Returns:
-    - List of references (other than Nipponbare) selected by the user
-    """
-    if other_refs:
-        if isinstance(other_refs, str):
-            return [other_refs]
-        else:
-            return other_refs
-
-    return []
-
-
-def sanitize_gene_id(gene_id):
-    """
-    Removes "gene:" prefix in
given accession - - Parameters: - - gene_id: Accession - - Returns: - - Accession without the "gene:" prefix - """ - if gene_id[:len('gene:')] == 'gene:': - return gene_id[len('gene:'):] - - return gene_id - - -# =============================================== -# Utility functions for OGI-to-reference mapping -# =============================================== - - -def get_ogi_list(accession_ids, ogi_mapping): - """ - Returns the list of equivalent OGIs of given accessions - - Parameters: - - accession_ids: Accessions - - ogi_mapping: OGI-to-accession mapping dictionary - - Returns: - - list of equivalent OGIs of given accessions - """ - ogi_list = [ogi_mapping[accession_id] for accession_id in accession_ids] - - return ogi_list - - -def get_ogi_nb(nb_intervals): - """ - Maps Nipponbare accessions (obtained from a list of Genomic_interval tuples) to their respective OGIs - - Parameters: - - nb_intervals: List of Genomic_interval tuples - - Returns: - - Set containing all unique OGIs after performing OGI-to-Nipponbare mapping - - OGI-to-Nipponbare mapping dictionary - """ - - # All unique OGIs - final_ogi_set = set() - - # OGI-to-NB mapping dictionary (one OGI can map to multiple NB accessions) - final_ogi_dict = defaultdict(set) - - for nb_interval in nb_intervals: - # Load and search GFF_DB of Nipponbare - db = gffutils.FeatureDB( - f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True) - genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop), - completely_within=False, featuretype='gene')) - - # Map Nipponbare accessions to OGIs - ogi_mapping_path = f'{const.OGI_MAPPING}/Nb_to_ogi.pickle' - with open(ogi_mapping_path, 'rb') as f: - ogi_mapping = pickle.load(f) - for gene in genes_in_interval: - gene_id = sanitize_gene_id(gene.id) - ogi = ogi_mapping[gene_id] - - final_ogi_set.add(ogi) - final_ogi_dict[ogi].add(gene_id) - - return final_ogi_set, final_ogi_dict - - -def get_ogi_other_ref(ref, nb_intervals): - """ - Maps reference-specific accessions (obtained from a list of Genomic_interval tuples) to their respective OGIs - "Reference" refers to a reference other than Nipponbare - Nipponbare reference is handled by get_ogi_nb() - - Parameters: - - ref: Reference - - nb_intervals: List of Genomic_interval tuples - - Returns: - - Set containing all unique OGIs after performing OGI-to-reference mapping - - OGI-to-reference mapping dictionary - """ - - # All unique OGIs - final_ogi_set = set() - - # OGI-to-NB mapping dictionary (one OGI can map to multiple NB accessions) - final_ogi_dict = defaultdict(set) - - # Get intervals from other refs that align to (parts) of the input loci - db_align = gffutils.FeatureDB( - f'{const.ALIGNMENTS}/{"Nb_"+str(ref)}/{"Nb_"+str(ref)}.gff.db') - - # Get corresponding intervals on ref - db_annotation = gffutils.FeatureDB( - f"{const.ANNOTATIONS}/{ref}/{ref}.gff.db".format(ref)) - - for nb_interval in nb_intervals: - gff_intersections = list(db_align.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop), - completely_within=False)) - for intersection in gff_intersections: - ref_interval = to_genomic_interval( - intersection.attributes['Name'][0]) - genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop), - completely_within=False, featuretype='gene')) - - # Map reference-specific accessions to OGIs - ogi_mapping_path = f'{const.OGI_MAPPING}/{ref}_to_ogi.pickle' - with open(ogi_mapping_path, 'rb') as f: - ogi_mapping = pickle.load(f) - for 
gene in genes_in_interval:
-                    gene_id = sanitize_gene_id(gene.id)
-                    ogi = ogi_mapping[gene_id]
-
-                    final_ogi_set.add(ogi)
-                    final_ogi_dict[ogi].add(gene_id)
-
-    return final_ogi_set, final_ogi_dict
-
-# ==================================================
-# Utility function related to QTARO and Text Mining
-# ==================================================
-
-
-def get_qtaro_entry(mapping, gene):
-    try:
-        qtaro_str = '\n    '
-        pub_idx = 1
-        for character_major in mapping[gene]:
-            qtaro_str += '\n  • ' + character_major + '\n      '
-            for character_minor in mapping[gene][character_major]:
-                pubs = []
-                for pub in mapping[gene][character_major][character_minor]:
-                    pubs.append(
-                        '\n    • ' + get_doi_link_single_str(pub, pub_idx) + '\n    • ')
-                    pub_idx += 1
-
-                qtaro_str += '\n    • ' + character_minor + \
-                    '\n        ' + ''.join(pubs) + '\n    • '
-            qtaro_str += '\n\n  • '
-
-        # Remove the line break after the last character major
-        return qtaro_str[:-len("\n    ")] + '\n'
-    except KeyError:
-        return NULL_PLACEHOLDER
-
-
-def get_qtaro_entries(mapping, genes):
-    return [get_qtaro_entry(mapping, gene) for gene in genes]
-
-
-def get_pubmed_entry(gene):
-    try:
-        with open(f'{const.TEXT_MINING_PUBMED}/{gene}.pickle', 'rb') as f:
-            mapping = pickle.load(f)
-
-        pubmed_ids = [get_pubmed_link_single_str(pubmed_id[0]) for pubmed_id in sorted(
-            mapping.items(), key=lambda x: x[1], reverse=True)]
-    except FileNotFoundError:
-        return NULL_PLACEHOLDER
-
-    pubmed_str = ''
-    for idx, pubmed in enumerate(pubmed_ids):
-        if idx % 2 == 0:
-            pubmed_str += f'{pubmed}   '
-        else:
-            pubmed_str += f'{pubmed}\n'
-
-    if pubmed_str.endswith('\n'):  # Ends in a newline
-        return pubmed_str[:-len('\n')]
-
-    return pubmed_str[:-len('   ')]
-
-
-# ========================
-# Functions for lift-over
-# ========================
-
-
-def get_genes_in_Nb(nb_intervals):
-    """
-    Returns a data frame containing the genes in Nipponbare
-
-    Parameters:
-    - nb_intervals: List of Genomic_interval tuples
-
-    Returns:
-    - Data frame containing the genes in Nipponbare
-    """
-    dfs = []
-
-    for nb_interval in nb_intervals:
-        # Load and search GFF_DB of Nipponbare
-        db = gffutils.FeatureDB(
-            f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True)
-        genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop),
-                                           completely_within=False, featuretype='gene'))
-
-        # Map accessions to their respective OGIs
-        ogi_mapping_path = f'{const.OGI_MAPPING}/Nb_to_ogi.pickle'
-        ogi_list = []
-        with open(ogi_mapping_path, 'rb') as f:
-            ogi_mapping = pickle.load(f)
-            ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
-                                     for gene in genes_in_interval], ogi_mapping)
-
-        # Get QTARO annotations
-        with open(const.QTARO_DICTIONARY, 'rb') as f:
-            qtaro_dict = pickle.load(f)
-            qtaro_list = get_qtaro_entries(
-                qtaro_dict, [gene.id for gene in genes_in_interval])
-
-        pubmed_ids = [get_pubmed_entry(gene.id) for gene in genes_in_interval]
-
-        # Construct the data frame
-        df = pd.DataFrame({
-            'OGI': ogi_list,
-            'Name': [gene.id for gene in genes_in_interval],
-            'Chromosome': [gene.chrom for gene in genes_in_interval],
-            'Start': [gene.start for gene in genes_in_interval],
-            'End': [gene.end for gene in genes_in_interval],
-            'Strand': [gene.strand for gene in genes_in_interval],
-            'QTL Analyses': qtaro_list,
-            'PubMed Article IDs': pubmed_ids
-        })
-
-        dfs.append(df)
-
-    try:
-        table_gene_ids = pd.concat(dfs, ignore_index=True)
-        # Read in dataframe containing gene descriptions
-        gene_description_df = pd.read_csv(
-            f'{const.GENE_DESCRIPTIONS}/Nb/Nb_gene_descriptions.csv')
-        # Right merge because some genes do not have descriptions or UniProtKB/Swiss-Prot IDs
-        table = pd.merge(gene_description_df, table_gene_ids,
-                         left_on='Gene_ID', right_on='Name', how='right')
-
-        # Reorder columns
-        table = table[NB_COLUMNS]
-
-        table['UniProtKB/Swiss-Prot'] = get_uniprot_link(
-            table, 'UniProtKB/Swiss-Prot')
-
-        table = table.fillna(NULL_PLACEHOLDER)
-
-        if table.shape[0] == 0:
-            return create_empty_df_nb(), table['Name'].values.tolist()
-
-        return table, table['Name'].values.tolist()
-
-    except ValueError:  # No results to concatenate; no table (and, hence, no gene IDs) to return
-        return create_empty_df_nb(), []
-
-
-def get_genes_in_other_ref(ref, nb_intervals):
-    """
-    Returns a data frame containing the genes in references other than Nipponbare
-    Nipponbare is handled by get_genes_in_Nb()
-
-    Parameters:
-    - ref: Reference
-    - nb_intervals: List of Genomic_interval tuples
-
-    Returns:
-    - Data frame containing the
genes in references other than Nipponbare - """ - - # Get intervals from other refs that align to (parts) of the input loci - db_align = gffutils.FeatureDB( - f'{const.ALIGNMENTS}/{"Nb_"+str(ref)}/{"Nb_"+str(ref)}.gff.db') - - # Get corresponding intervals on ref - db_annotation = gffutils.FeatureDB( - f"{const.ANNOTATIONS}/{ref}/{ref}.gff.db") - - dfs = [] - - for nb_interval in nb_intervals: - gff_intersections = list(db_align.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop), - completely_within=False)) - for intersection in gff_intersections: - ref_interval = to_genomic_interval( - intersection.attributes['Name'][0]) - genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop), - completely_within=False, featuretype='gene')) - - # Map accessions to their respective OGIs - ogi_mapping_path = f'{const.OGI_MAPPING}/{ref}_to_ogi.pickle' - ogi_list = [] - with open(ogi_mapping_path, 'rb') as f: - ogi_mapping = pickle.load(f) - ogi_list = get_ogi_list([sanitize_gene_id(gene.id) - for gene in genes_in_interval], ogi_mapping) - - # Construct the data frame - df = pd.DataFrame({ - 'OGI': ogi_list, - 'Name': [sanitize_gene_id(gene.id) for gene in genes_in_interval], - 'Chromosome': [gene.chrom for gene in genes_in_interval], - 'Start': [gene.start for gene in genes_in_interval], - 'End': [gene.end for gene in genes_in_interval], - 'Strand': [gene.strand for gene in genes_in_interval] - }) - dfs.append(df) - - try: - table = pd.concat(dfs, ignore_index=True) - if table.shape[0] == 0: - return create_empty_df_other_refs() - - return table - - except ValueError: # No results to concatenate - return create_empty_df_other_refs() - - -def get_common_genes(refs, nb_intervals): - """ - Returns a data frame containing the genes common to the given references - - Parameters: - - ref: References - - nb_intervals: List of Genomic_interval tuples - - Returns: - - Data frame containing the genes common to the given references - """ - # No cultivars selected - if not refs: - return create_empty_no_refs_df() - - common_genes = None - for ref in refs: - if ref != 'Nipponbare': - genes_in_ref = get_genes_in_other_ref(ref, nb_intervals) - else: - genes_in_ref = get_genes_in_Nb(nb_intervals)[0] - - genes_in_ref = genes_in_ref[['OGI', 'Name']] - - try: - common_genes = pd.merge( - common_genes, genes_in_ref, on='OGI') - # First instance of merging (that is, common_genes is still None) - except TypeError: - common_genes = genes_in_ref - - common_genes = common_genes.rename( - columns={'Name_x': 'Nipponbare', 'Name_y': ref, 'Name': ref}) - - common_genes = common_genes.rename( - columns={'Name': 'Nipponbare'}).dropna().drop_duplicates() - - return common_genes - - -def get_all_genes(refs, nb_intervals): - """ - Returns a data frame containing all the genes (i.e., the set-theoretic union of all the genes) - in Nipponbare, as well as orthologous genes in the given references - - Parameters: - - ref: References (other than Nipponbare) - - nb_intervals: List of Genomic_interval tuples - - Returns: - - Data frame containing all the genes - """ - genes_in_nb = get_genes_in_Nb(nb_intervals)[0] - genes_in_nb = genes_in_nb[['OGI', 'Name']] - - common_genes = genes_in_nb - for ref in refs: - if ref != 'Nipponbare': - genes_in_other_ref = get_genes_in_other_ref(ref, nb_intervals) - genes_in_other_ref = genes_in_other_ref[['OGI', 'Name']] - common_genes = pd.merge( - common_genes, genes_in_other_ref, on='OGI', how='outer') - - common_genes = 
common_genes.rename( - columns={'Name_x': 'Nipponbare', 'Name_y': ref, 'Name': ref}) - - common_genes = common_genes.rename( - columns={'Name': 'Nipponbare'}).fillna(NULL_PLACEHOLDER).drop_duplicates() - - return common_genes - - -def get_unique_genes_in_other_ref(ref, nb_intervals): - """ - Returns a data frame containing the genes in a reference that are not present in Nipponbare - - Parameters: - - ref: References - - nb_intervals: List of Genomic_interval tuples - - Returns: - - Data frame containing the genes in a reference that are not present in Nipponbare - """ - genes_in_nb = get_genes_in_Nb(nb_intervals)[0] - genes_in_other_ref = get_genes_in_other_ref(ref, nb_intervals) - - genes_in_nb = genes_in_nb[['OGI']] - - # Get set difference - unique_genes = pd.concat([genes_in_other_ref, genes_in_nb, genes_in_nb]).drop_duplicates( - subset=['OGI'], keep=False) - - gene_description_df = pd.read_csv( - f'{const.GENE_DESCRIPTIONS}/{ref}/{ref}_gene_descriptions.csv') - # Right merge because some genes do not have descriptions or UniProtKB/Swiss-Prot IDs - unique_genes = pd.merge(gene_description_df, unique_genes, - left_on='Gene_ID', right_on='Name', how='right') - - unique_genes = unique_genes[FRONT_FACING_COLUMNS] - - unique_genes['UniProtKB/Swiss-Prot'] = get_uniprot_link( - unique_genes, 'UniProtKB/Swiss-Prot') - - unique_genes = unique_genes.fillna(NULL_PLACEHOLDER) - - if unique_genes.shape[0] == 0: - return create_empty_front_facing_df() - - return unique_genes +import pickle +from collections import defaultdict, namedtuple + +import gffutils +import pandas as pd + +from ..constants import Constants +from ..general_util import * +from ..links_util import * + + +const = Constants() +Genomic_interval = namedtuple('Genomic_interval', ['chrom', 'start', 'stop']) + +# Error codes and messages triggered by a malformed genomic interval entered by the user +Error_message = namedtuple('Error_message', ['code', 'message']) +errors = { + 'NO_CHROM_INTERVAL_SEP': Error_message(1, 'A genomic interval should be entered as chrom:start-end. 
Use a semicolon (;) to separate multiple intervals'), + 'NO_START_STOP_SEP': Error_message(2, 'Specify a valid start and end for the genomic interval'), + 'START_STOP_NOT_INT': Error_message(3, 'The start and end of a genomic interval should be integers'), + 'START_GREATER_THAN_STOP': Error_message(4, 'The start of a genomic interval should not be past the end') +} + +other_ref_genomes = {'N22': 'aus Nagina-22', + 'MH63': 'indica Minghui-63', + 'Azu': 'japonica Azucena', + 'ARC': 'basmati ARC', + 'IR64': 'indica IR64', + 'CMeo': 'japonica CHAO MEO'} + +NB_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', + 'OGI', 'Chromosome', 'Start', 'End', 'Strand', 'QTL Analyses', 'PubMed Article IDs'] +OTHER_REF_COLUMNS = ['OGI', 'Name', 'Chromosome', 'Start', 'End', 'Strand'] +FRONT_FACING_COLUMNS = ['Name', 'Description', 'UniProtKB/Swiss-Prot', 'OGI'] +NO_REFS_COLUMNS = ['OGI'] + + +def construct_options_other_ref_genomes(): + return [ + {'value': symbol, 'label': f'{symbol} ({name})'} for symbol, name in other_ref_genomes.items()] + + +def create_empty_df_nb(): + """ + Returns an empty data frame if there are no results + + Returns: + - Empty data frame + """ + return create_empty_df_with_cols(NB_COLUMNS) + + +def create_empty_no_refs_df(): + return create_empty_df_with_cols(NO_REFS_COLUMNS) + + +def create_empty_df_other_refs(): + return create_empty_df_with_cols(OTHER_REF_COLUMNS) + + +def create_empty_front_facing_df(): + return create_empty_df_with_cols(FRONT_FACING_COLUMNS) + +# ===================================================== +# Utility functions for parsing input genomic interval +# ===================================================== + + +def is_error(genomic_interval): + """ + Returns True if given genomic interval is malformed; False, otherwise + + This function assumes that genomic_interval is the return value of to_genomic_interval() + + Parameters: + - genomic_interval: If its first element is an integer (i.e., the error code), + then the given genomic interval is malformed + + Returns: + - True if given genomic interval is malformed; False, otherwise + """ + return isinstance(genomic_interval[0], int) + + +def get_error_message(error_code): + """ + Returns the message associated with the error code if the user inputs a malformed genomic interval + + Parameters: + - error_code: Error code triggered by the malformed genomic interval + + Returns: + - Message associated with the given error code + """ + for _, code_message in errors.items(): + if code_message.code == error_code: + return code_message.message + + +def is_one_digit_chromosome(chromosome): + """ + Checks if given chromosome only has a single digit (e.g., Chr1, Chr2) + + Parameters: + - chromosome: Chromosome to be checked + + Returns: + - True if given chromosome only has a single digit; False, otherwise + """ + # Examples: Chr1, Chr2 + return len(chromosome) == len('Chr') + 1 + + +def pad_one_digit_chromosome(chromosome): + """ + Prepends a 0 to the chromosome number if it only has a single digit + For example, if the input is 'Chr1', it returns 'Chr01' + + This function assumes that the given chromosome only has a single digit + + Parameters: + - chromosome: Chromosome to be padded + + Returns: + - Chromosome with a leading 0 prepended + """ + return chromosome[:-1] + '0' + chromosome[-1] + + +def to_genomic_interval(genomic_interval_str): + """ + Converts a genomic interval extracted from the user input into a Genomic_interval tuple + If the genomic interval is malformed, it returns the error code, 
alongside the genomic interval
+
+    Parameters:
+    - genomic_interval_str: Genomic interval extracted from the user input
+
+    Returns:
+    - If the genomic interval is valid: Genomic_interval tuple
+    - Otherwise: Tuple containing the triggered error code and the genomic interval
+    """
+    try:
+        chrom, interval = genomic_interval_str.split(":")
+        if is_one_digit_chromosome(chrom):
+            chrom = pad_one_digit_chromosome(chrom)
+
+    except ValueError:
+        return errors['NO_CHROM_INTERVAL_SEP'].code, genomic_interval_str
+
+    try:
+        start, stop = interval.split("-")
+    except ValueError:
+        return errors['NO_START_STOP_SEP'].code, genomic_interval_str
+
+    try:
+        start = int(start)
+        stop = int(stop)
+    except ValueError:
+        return errors['START_STOP_NOT_INT'].code, genomic_interval_str
+
+    if start > stop:
+        return errors['START_GREATER_THAN_STOP'].code, genomic_interval_str
+
+    return Genomic_interval(chrom, start, stop)
+
+
+def sanitize_nb_intervals_str(nb_intervals_str):
+    """
+    Sanitizes the genomic intervals entered by the user by removing spaces and trailing semicolons
+
+    Parameters:
+    - nb_intervals_str: Genomic intervals entered by the user
+
+    Returns:
+    - Sanitized genomic intervals
+    """
+    nb_intervals_str = nb_intervals_str.replace(' ', '')
+    nb_intervals_str = nb_intervals_str.rstrip(';')
+
+    return nb_intervals_str
+
+
+def get_genomic_intervals_from_input(nb_intervals_str):
+    """
+    Extracts the Genomic_interval tuples from the genomic intervals entered by the user
+
+    Parameters:
+    - nb_intervals_str: Genomic intervals entered by the user
+
+    Returns:
+    - List of Genomic_interval tuples
+    """
+    nb_intervals_str = sanitize_nb_intervals_str(nb_intervals_str)
+    nb_intervals = []
+
+    nb_intervals_split = nb_intervals_str.split(";")
+
+    for interval_str in nb_intervals_split:
+        interval = to_genomic_interval(interval_str)
+
+        # Trap if at least one of the genomic intervals is malformed
+        if is_error(interval):
+            return interval
+        else:
+            nb_intervals.append(interval)
+
+    return nb_intervals
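For reference, a quick sketch of how the parsing helpers above behave on typical input (the interval values are illustrative only; the helpers are assumed to be imported from this module):

```python
# Usage sketch of the interval-parsing helpers defined above.
# 'Chr1' is padded to 'Chr01'; spaces and a trailing semicolon are tolerated.
intervals = get_genomic_intervals_from_input('Chr1:100-200; Chr05:1000-2000;')
# -> [Genomic_interval(chrom='Chr01', start=100, stop=200),
#     Genomic_interval(chrom='Chr05', start=1000, stop=2000)]

# A malformed interval short-circuits parsing and returns (error code, offending string)
malformed = get_genomic_intervals_from_input('Chr01:200-100')
assert is_error(malformed)
print(get_error_message(malformed[0]))
# 'The start of a genomic interval should not be past the end'
```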
+# ============================================================================
+# Utility functions for displaying lift-over results and sanitizing accessions
+# ============================================================================
+
+
+def get_tabs():
+    """
+    Returns the tabs to be displayed in the lift-over results
+    The tabs do not include those that are specific to a reference
+
+    Returns:
+    - Tabs to be displayed in the lift-over results (except those specific to a reference)
+    """
+    return ['All Genes', 'Common Genes', 'Nipponbare']
+
+
+def get_tab_id(tab):
+    """
+    Returns the index of given tab with respect to the tabs to be displayed in the lift-over results
+
+    Parameters:
+    - tab: Tab whose index is to be returned
+
+    Returns:
+    - Index of given tab with respect to the tabs to be displayed in the lift-over results
+    """
+    return f'tab-{get_tabs().index(tab)}'
+
+
+def sanitize_other_refs(other_refs):
+    """
+    Returns the references (other than Nipponbare) selected by the user
+
+    The need for this function is motivated by the fact that, when the user only chooses one reference,
+    the data type of this chosen value is string (not list)
+
+    Parameters:
+    - other_refs: References (other than Nipponbare) selected by the user
+
+    Returns:
+    - List of references (other than Nipponbare) selected by the user
+    """
+    if other_refs:
+        if isinstance(other_refs, str):
+            return [other_refs]
+        else:
+            return other_refs
+
+    return []
+
+
+def sanitize_gene_id(gene_id):
+    """
+    Removes the "gene:" prefix in given accession
+
+    Parameters:
+    - gene_id: Accession
+
+    Returns:
+    - Accession without the "gene:" prefix
+    """
+    if gene_id[:len('gene:')] == 'gene:':
+        return gene_id[len('gene:'):]
+
+    return gene_id
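A few toy checks for the helpers above (the accession ID is illustrative, not tied to any real annotation):

```python
# sanitize_gene_id strips only the 'gene:' prefix, if present
assert sanitize_gene_id('gene:LOC_Os01g01010') == 'LOC_Os01g01010'
assert sanitize_gene_id('LOC_Os01g01010') == 'LOC_Os01g01010'

# A single selected reference arrives as a plain string, not a list
assert sanitize_other_refs('IR64') == ['IR64']
assert sanitize_other_refs(['IR64', 'N22']) == ['IR64', 'N22']
assert sanitize_other_refs(None) == []

# Tab IDs are positional with respect to get_tabs()
assert get_tab_id('Common Genes') == 'tab-1'
```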
+# ===============================================
+# Utility functions for OGI-to-reference mapping
+# ===============================================
+
+
+def get_ogi_list(accession_ids, ogi_mapping):
+    """
+    Returns the list of equivalent OGIs of given accessions
+
+    Parameters:
+    - accession_ids: Accessions
+    - ogi_mapping: OGI-to-accession mapping dictionary
+
+    Returns:
+    - List of equivalent OGIs of given accessions
+    """
+    ogi_list = [ogi_mapping[accession_id] for accession_id in accession_ids]
+
+    return ogi_list
+
+
+def get_ogi_nb(nb_intervals):
+    """
+    Maps Nipponbare accessions (obtained from a list of Genomic_interval tuples) to their respective OGIs
+
+    Parameters:
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Set containing all unique OGIs after performing OGI-to-Nipponbare mapping
+    - OGI-to-Nipponbare mapping dictionary
+    """
+
+    # All unique OGIs
+    final_ogi_set = set()
+
+    # OGI-to-NB mapping dictionary (one OGI can map to multiple NB accessions)
+    final_ogi_dict = defaultdict(set)
+
+    for nb_interval in nb_intervals:
+        # Load and search GFF_DB of Nipponbare
+        db = gffutils.FeatureDB(
+            f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True)
+        genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop),
+                                           completely_within=False, featuretype='gene'))
+
+        # Map Nipponbare accessions to OGIs
+        ogi_mapping_path = f'{const.OGI_MAPPING}/Nb_to_ogi.pickle'
+        with open(ogi_mapping_path, 'rb') as f:
+            ogi_mapping = pickle.load(f)
+            for gene in genes_in_interval:
+                gene_id = sanitize_gene_id(gene.id)
+                ogi = ogi_mapping[gene_id]
+
+                final_ogi_set.add(ogi)
+                final_ogi_dict[ogi].add(gene_id)
+
+    return final_ogi_set, final_ogi_dict
+
+
+def get_ogi_other_ref(ref, nb_intervals):
+    """
+    Maps reference-specific accessions (obtained from a list of Genomic_interval tuples) to their respective OGIs
+    "Reference" refers to a reference other than Nipponbare
+    Nipponbare reference is handled by get_ogi_nb()
+
+    Parameters:
+    - ref: Reference
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Set containing all unique OGIs after performing OGI-to-reference mapping
+    - OGI-to-reference mapping dictionary
+    """
+
+    # All unique OGIs
+    final_ogi_set = set()
+
+    # OGI-to-reference mapping dictionary (one OGI can map to multiple accessions)
+    final_ogi_dict = defaultdict(set)
+
+    # Get intervals from other refs that align to (parts) of the input loci
+    db_align = gffutils.FeatureDB(
+        f'{const.ALIGNMENTS}/{"Nb_"+str(ref)}/{"Nb_"+str(ref)}.gff.db')
+
+    # Get corresponding intervals on ref
+    db_annotation = gffutils.FeatureDB(
+        f"{const.ANNOTATIONS}/{ref}/{ref}.gff.db")
+
+    for nb_interval in nb_intervals:
+        gff_intersections = list(db_align.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop),
+                                                 completely_within=False))
+        for intersection in gff_intersections:
+            ref_interval = to_genomic_interval(
+                intersection.attributes['Name'][0])
+            genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop),
+                                                          completely_within=False, featuretype='gene'))
+
+            # Map reference-specific accessions to OGIs
+            ogi_mapping_path = f'{const.OGI_MAPPING}/{ref}_to_ogi.pickle'
+            with open(ogi_mapping_path, 'rb') as f:
+                ogi_mapping = pickle.load(f)
+                for gene in genes_in_interval:
+                    gene_id = sanitize_gene_id(gene.id)
+                    ogi = ogi_mapping[gene_id]
+
+                    final_ogi_set.add(ogi)
+                    final_ogi_dict[ogi].add(gene_id)
+
+    return final_ogi_set, final_ogi_dict
+
+# ==================================================
+# Utility function related to QTARO and Text Mining
+# ==================================================
+
+
+def get_qtaro_entry(mapping, gene):
+    try:
+        qtaro_str = ''
+        pub_idx = 1
+        for character_major in mapping[gene]:
+            qtaro_str += '• ' + character_major + '\n'
+            for character_minor in mapping[gene][character_major]:
+                pubs = []
+                for pub in mapping[gene][character_major][character_minor]:
+                    pubs.append(
+                        '        • ' + get_doi_link_single_str(pub, pub_idx) + '\n')
+                    pub_idx += 1
+
+                qtaro_str += '    • ' + character_minor + '\n' + ''.join(pubs)
+            qtaro_str += '\n'
+
+        # Remove the line break after the last character major
+        return qtaro_str[:-len('\n')]
+    except KeyError:
+        return NULL_PLACEHOLDER
+
+
+def get_qtaro_entries(mapping, genes):
+    return [get_qtaro_entry(mapping, gene) for gene in genes]
+
+
+def get_pubmed_entry(gene):
+    try:
+        with open(f'{const.TEXT_MINING_PUBMED}/{gene}.pickle', 'rb') as f:
+            mapping = pickle.load(f)
+
+        pubmed_ids = [get_pubmed_link_single_str(pubmed_id[0]) for pubmed_id in sorted(
+            mapping.items(), key=lambda x: x[1], reverse=True)]
+    except FileNotFoundError:
+        return NULL_PLACEHOLDER
+
+    pubmed_str = ''
+    for idx, pubmed in enumerate(pubmed_ids):
+        if idx % 2 == 0:
+            pubmed_str += f'{pubmed}   '
+        else:
+            pubmed_str += f'{pubmed}\n'
+
+    if pubmed_str[-1] == '\n':  # Ends in a newline
+        return pubmed_str[:-len('\n')]
+
+    return pubmed_str[:-len('   ')]
+
+
+# ========================
+# Functions for lift-over
+# ========================
+
+
+def get_genes_in_Nb(nb_intervals):
+    """
+    Returns a data frame containing the genes in Nipponbare
+
+    Parameters:
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Data frame containing the genes in Nipponbare
+    """
+    dfs = []
+
+    for nb_interval in nb_intervals:
+        # Load and search GFF_DB of Nipponbare
+        db = gffutils.FeatureDB(
+            f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True)
+        genes_in_interval = list(db.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop),
+                                           completely_within=False, featuretype='gene'))
+
+        # Map accessions to their respective OGIs
+        ogi_mapping_path = f'{const.OGI_MAPPING}/Nb_to_ogi.pickle'
+        ogi_list = []
+        with open(ogi_mapping_path, 'rb') as f:
+            ogi_mapping = pickle.load(f)
+            ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
+                                     for gene in genes_in_interval], ogi_mapping)
+
+        # Get QTARO annotations
+        with open(const.QTARO_DICTIONARY, 'rb') as f:
+            qtaro_dict = pickle.load(f)
+            qtaro_list = get_qtaro_entries(
+                qtaro_dict, [gene.id for gene in genes_in_interval])
+
+        pubmed_ids = [get_pubmed_entry(gene.id) for gene in genes_in_interval]
+
+        # Construct the data frame
+        df = pd.DataFrame({
+            'OGI': ogi_list,
+            'Name': [gene.id for gene in genes_in_interval],
+            'Chromosome': [gene.chrom for gene in genes_in_interval],
+            'Start': [gene.start for gene in genes_in_interval],
+            'End': [gene.end for gene in genes_in_interval],
+            'Strand': [gene.strand for gene in genes_in_interval],
+            'QTL Analyses': qtaro_list,
+            'PubMed Article IDs': pubmed_ids
+        })
+
+        dfs.append(df)
+
+    try:
+        table_gene_ids = pd.concat(dfs, ignore_index=True)
+        # Read in dataframe containing gene descriptions
+        gene_description_df = pd.read_csv(
+            f'{const.GENE_DESCRIPTIONS}/Nb/Nb_gene_descriptions.csv')
+        # Right merge because some genes do not have descriptions or UniProtKB/Swiss-Prot IDs
+        table = pd.merge(gene_description_df, table_gene_ids,
+                         left_on='Gene_ID', right_on='Name', how='right')
+
+        # Reorder columns
+        table = table[NB_COLUMNS]
+
+        table['UniProtKB/Swiss-Prot'] = get_uniprot_link(
+            table, 'UniProtKB/Swiss-Prot')
+
+        table = table.fillna(NULL_PLACEHOLDER)
+
+        if table.shape[0] == 0:
+            return create_empty_df_nb(), table['Name'].values.tolist()
+
+        return table, table['Name'].values.tolist()
+
+    except ValueError:  # No results to concatenate, so there are no gene names to return
+        return create_empty_df_nb(), []
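The right merge in get_genes_in_Nb above matters: genes without a description row must still appear in the output. A minimal pandas sketch with toy frames (the gene names are made up):

```python
import pandas as pd

# Toy frames: 'g2' has no description entry but must survive the merge
descriptions = pd.DataFrame({'Gene_ID': ['g1'], 'Description': ['kinase']})
genes = pd.DataFrame({'Name': ['g1', 'g2']})

merged = pd.merge(descriptions, genes,
                  left_on='Gene_ID', right_on='Name', how='right')
# 'g2' is kept with NaN in 'Description', later replaced by NULL_PLACEHOLDER
assert merged.shape[0] == 2
assert merged['Description'].isna().sum() == 1
```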
+def get_genes_in_other_ref(ref, nb_intervals):
+    """
+    Returns a data frame containing the genes in references other than Nipponbare
+    Nipponbare is handled by get_genes_in_Nb()
+
+    Parameters:
+    - ref: Reference
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Data frame containing the genes in references other than Nipponbare
+    """
+
+    # Get intervals from other refs that align to (parts) of the input loci
+    db_align = gffutils.FeatureDB(
+        f'{const.ALIGNMENTS}/{"Nb_"+str(ref)}/{"Nb_"+str(ref)}.gff.db')
+
+    # Get corresponding intervals on ref
+    db_annotation = gffutils.FeatureDB(
+        f"{const.ANNOTATIONS}/{ref}/{ref}.gff.db")
+
+    dfs = []
+
+    for nb_interval in nb_intervals:
+        gff_intersections = list(db_align.region(region=(nb_interval.chrom, nb_interval.start, nb_interval.stop),
+                                                 completely_within=False))
+        for intersection in gff_intersections:
+            ref_interval = to_genomic_interval(
+                intersection.attributes['Name'][0])
+            genes_in_interval = list(db_annotation.region(region=(ref_interval.chrom, ref_interval.start, ref_interval.stop),
+                                                          completely_within=False, featuretype='gene'))
+
+            # Map accessions to their respective OGIs
+            ogi_mapping_path = f'{const.OGI_MAPPING}/{ref}_to_ogi.pickle'
+            ogi_list = []
+            with open(ogi_mapping_path, 'rb') as f:
+                ogi_mapping = pickle.load(f)
+                ogi_list = get_ogi_list([sanitize_gene_id(gene.id)
+                                         for gene in genes_in_interval], ogi_mapping)
+
+            # Construct the data frame
+            df = pd.DataFrame({
+                'OGI': ogi_list,
+                'Name': [sanitize_gene_id(gene.id) for gene in genes_in_interval],
+                'Chromosome': [gene.chrom for gene in genes_in_interval],
+                'Start': [gene.start for gene in genes_in_interval],
+                'End': [gene.end for gene in genes_in_interval],
+                'Strand': [gene.strand for gene in genes_in_interval]
+            })
+            dfs.append(df)
+
+    try:
+        table = pd.concat(dfs, ignore_index=True)
+        if table.shape[0] == 0:
+            return create_empty_df_other_refs()
+
+        return table
+
+    except ValueError:  # No results to concatenate
+        return create_empty_df_other_refs()
+
+
+def get_common_genes(refs, nb_intervals):
+    """
+    Returns a data frame containing the genes common to the given references
+
+    Parameters:
+    - refs: References
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Data frame containing the genes common to the given references
+    """
+    # No cultivars selected
+    if not refs:
+        return create_empty_no_refs_df()
+
+    common_genes = None
+    for ref in refs:
+        if ref != 'Nipponbare':
+            genes_in_ref = get_genes_in_other_ref(ref, nb_intervals)
+        else:
+            genes_in_ref = get_genes_in_Nb(nb_intervals)[0]
+
+        genes_in_ref = genes_in_ref[['OGI', 'Name']]
+
+        try:
+            common_genes = pd.merge(
+                common_genes, genes_in_ref, on='OGI')
+        # First instance of merging (that is, common_genes is still None)
+        except TypeError:
+            common_genes = genes_in_ref
+
+        common_genes = common_genes.rename(
+            columns={'Name_x': 'Nipponbare', 'Name_y': ref, 'Name': ref})
+
+    common_genes = common_genes.rename(
+        columns={'Name': 'Nipponbare'}).dropna().drop_duplicates()
+
+    return common_genes
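For intuition, here is the merge-and-rename chain of get_common_genes above on toy data: successive inner merges on 'OGI' keep only genes present in every reference, and pandas' automatic Name_x/Name_y suffixes are renamed to the reference names (all values below are made up):

```python
import pandas as pd

# Toy data: only OGI 'o1' is present in both references
nb = pd.DataFrame({'OGI': ['o1', 'o2'], 'Name': ['nb1', 'nb2']})
ir64 = pd.DataFrame({'OGI': ['o1'], 'Name': ['ir1']})

common = pd.merge(nb, ir64, on='OGI').rename(
    columns={'Name_x': 'Nipponbare', 'Name_y': 'IR64'})
assert list(common.columns) == ['OGI', 'Nipponbare', 'IR64']
assert len(common) == 1  # only 'o1' is common to both
```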
+def get_all_genes(refs, nb_intervals):
+    """
+    Returns a data frame containing all the genes (i.e., the set-theoretic union of all the genes)
+    in Nipponbare, as well as orthologous genes in the given references
+
+    Parameters:
+    - refs: References (other than Nipponbare)
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Data frame containing all the genes
+    """
+    genes_in_nb = get_genes_in_Nb(nb_intervals)[0]
+    genes_in_nb = genes_in_nb[['OGI', 'Name']]
+
+    common_genes = genes_in_nb
+    for ref in refs:
+        if ref != 'Nipponbare':
+            genes_in_other_ref = get_genes_in_other_ref(ref, nb_intervals)
+            genes_in_other_ref = genes_in_other_ref[['OGI', 'Name']]
+            common_genes = pd.merge(
+                common_genes, genes_in_other_ref, on='OGI', how='outer')
+
+            common_genes = common_genes.rename(
+                columns={'Name_x': 'Nipponbare', 'Name_y': ref, 'Name': ref})
+
+    common_genes = common_genes.rename(
+        columns={'Name': 'Nipponbare'}).fillna(NULL_PLACEHOLDER).drop_duplicates()
+
+    return common_genes
+
+
+def get_unique_genes_in_other_ref(ref, nb_intervals):
+    """
+    Returns a data frame containing the genes in a reference that are not present in Nipponbare
+
+    Parameters:
+    - ref: Reference
+    - nb_intervals: List of Genomic_interval tuples
+
+    Returns:
+    - Data frame containing the genes in a reference that are not present in Nipponbare
+    """
+    genes_in_nb = get_genes_in_Nb(nb_intervals)[0]
+    genes_in_other_ref = get_genes_in_other_ref(ref, nb_intervals)
+
+    genes_in_nb = genes_in_nb[['OGI']]
+
+    # Get set difference
+    unique_genes = pd.concat([genes_in_other_ref, genes_in_nb, genes_in_nb]).drop_duplicates(
+        subset=['OGI'], keep=False)
+
+    gene_description_df = pd.read_csv(
+        f'{const.GENE_DESCRIPTIONS}/{ref}/{ref}_gene_descriptions.csv')
+    # Right merge because some genes do not have descriptions or UniProtKB/Swiss-Prot IDs
+    unique_genes = pd.merge(gene_description_df, unique_genes,
+                            left_on='Gene_ID', right_on='Name', how='right')
+
+    unique_genes = unique_genes[FRONT_FACING_COLUMNS]
+
+    unique_genes['UniProtKB/Swiss-Prot'] = get_uniprot_link(
+        unique_genes, 'UniProtKB/Swiss-Prot')
+
+    unique_genes = unique_genes.fillna(NULL_PLACEHOLDER)
+
+    if unique_genes.shape[0] == 0:
+        return create_empty_front_facing_df()
+
+    return unique_genes
diff --git a/callbacks/links_util.py b/callbacks/links_util.py
index 8f4e19fc..41a4e8c8 100644
--- a/callbacks/links_util.py
+++ b/callbacks/links_util.py
@@ -1,58 +1,58 @@
-A_HREF = '<a target="_blank" href="'
-CLOSE_A_HREF = '">'
-LINK_ICON = '&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></a>'
-
-
-def get_genes_from_kegg_link(link):
-    idx = link.find('?')
-    query = link[idx:].split('+')
-
-    return '\n'.join(query[1:])
-
-
-def get_kegg_link(result, id_col, genes_col):
-    return A_HREF + 'http://www.genome.jp/kegg-bin/show_pathway?' + \
-        result[id_col] + '+' + result[genes_col].str.split('\n').str.join('+') + \
-        CLOSE_A_HREF + result[id_col] + LINK_ICON
-
-
-def get_go_link(result, id_col):
-    return A_HREF + 'https://amigo.geneontology.org/amigo/term/' + \
-        result[id_col] + \
-        CLOSE_A_HREF + result[id_col] + LINK_ICON
-
-
-def get_to_po_link(result, id_col):
-    return A_HREF + 'https://www.ebi.ac.uk/ols4/ontologies/to/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F' + \
-        result[id_col].str.replace(':', '_') + \
-        CLOSE_A_HREF + result[id_col] + LINK_ICON
-
-
-def get_uniprot_link(result, id_col):
-    return A_HREF + 'https://www.uniprot.org/uniprotkb/' + \
-        result[id_col] + '/entry' + CLOSE_A_HREF + \
-        result[id_col] + LINK_ICON
-
-
-def get_pubmed_link(result, id_col):
-    return A_HREF + 'https://pubmed.ncbi.nlm.nih.gov/' + \
-        result[id_col] + '/entry' + CLOSE_A_HREF + \
-        result[id_col] + LINK_ICON
-
-
-def get_doi_link_single_str(doi, pub_idx):
-    return A_HREF + 'https://doi.org/' + doi + CLOSE_A_HREF + 'Publication ' + str(pub_idx) + LINK_ICON
-
-
-def get_pubmed_link_single_str(pubmed):
-    return A_HREF + 'https://pubmed.ncbi.nlm.nih.gov/' + \
-        pubmed + '/entry' + CLOSE_A_HREF + \
-        pubmed + LINK_ICON
-
-
-def get_rgi_genecard_link(result, id_col):
-    return A_HREF + 'https://riceome.hzau.edu.cn/genecard/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON
-
-
-def get_rgi_orthogroup_link(result, id_col):
-    return A_HREF + 'https://riceome.hzau.edu.cn/orthogroup/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON
+A_HREF = '<a target="_blank" href="'
+CLOSE_A_HREF = '">'
+LINK_ICON = '&nbsp;&nbsp;<i class="fa-solid fa-up-right-from-square fa-2xs"></i></a>'
+
+
+def get_genes_from_kegg_link(link):
+    idx = link.find('?')
+    query = link[idx:].split('+')
+
+    return '\n'.join(query[1:])
+
+
+def get_kegg_link(result, id_col, genes_col):
+    return A_HREF + 'http://www.genome.jp/kegg-bin/show_pathway?'
+ \ + result[id_col] + '+' + result[genes_col].str.split('\n').str.join('+') + \ + CLOSE_A_HREF + result[id_col] + LINK_ICON + + +def get_go_link(result, id_col): + return A_HREF + 'https://amigo.geneontology.org/amigo/term/' + \ + result[id_col] + \ + CLOSE_A_HREF + result[id_col] + LINK_ICON + + +def get_to_po_link(result, id_col): + return A_HREF + 'https://www.ebi.ac.uk/ols4/ontologies/to/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F' + \ + result[id_col].str.replace(':', '_') + \ + CLOSE_A_HREF + result[id_col] + LINK_ICON + + +def get_uniprot_link(result, id_col): + return A_HREF + 'https://www.uniprot.org/uniprotkb/' + \ + result[id_col] + '/entry' + CLOSE_A_HREF + \ + result[id_col] + LINK_ICON + + +def get_pubmed_link(result, id_col): + return A_HREF + 'https://pubmed.ncbi.nlm.nih.gov/' + \ + result[id_col] + '/entry' + CLOSE_A_HREF + \ + result[id_col] + LINK_ICON + + +def get_doi_link_single_str(doi, pub_idx): + return A_HREF + 'https://doi.org/' + doi + CLOSE_A_HREF + 'Publication ' + str(pub_idx) + LINK_ICON + + +def get_pubmed_link_single_str(pubmed): + return A_HREF + 'https://pubmed.ncbi.nlm.nih.gov/' + \ + pubmed + '/entry' + CLOSE_A_HREF + \ + pubmed + LINK_ICON + + +def get_rgi_genecard_link(result, id_col): + return A_HREF + 'https://riceome.hzau.edu.cn/genecard/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON + + +def get_rgi_orthogroup_link(result, id_col): + return A_HREF + 'https://riceome.hzau.edu.cn/orthogroup/' + result[id_col] + CLOSE_A_HREF + result[id_col] + LINK_ICON diff --git a/callbacks/style_util.py b/callbacks/style_util.py index cc9a5957..43974b14 100644 --- a/callbacks/style_util.py +++ b/callbacks/style_util.py @@ -1,11 +1,11 @@ -def add_class_name(class_name, current_class_name): - current_classes = current_class_name.split(' ') - current_classes.append(class_name) - - return ' '.join(current_classes) - - -def remove_class_name(class_name, current_class_name): - current_classes = current_class_name.split(' ') - - return ' '.join([current_class for current_class in current_classes if current_class != class_name]) +def add_class_name(class_name, current_class_name): + current_classes = current_class_name.split(' ') + current_classes.append(class_name) + + return ' '.join(current_classes) + + +def remove_class_name(class_name, current_class_name): + current_classes = current_class_name.split(' ') + + return ' '.join([current_class for current_class in current_classes if current_class != class_name]) diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py index 8943bc86..bb522c65 100644 --- a/callbacks/text_mining/callbacks.py +++ b/callbacks/text_mining/callbacks.py @@ -1,135 +1,151 @@ -from dash import Input, Output, State, ctx, ALL, html -from dash.exceptions import PreventUpdate -from collections import namedtuple - -from .util import * -from ..lift_over import util as lift_over_util - -def init_callback(app): - - # to display user input interval in the top nav - @app.callback( - Output('text-mining-genomic-intervals-input', 'children'), - State('homepage-genomic-intervals-submitted-input', 'data'), - Input('homepage-is-submitted', 'data'), - Input('text-mining-submit', 'n_clicks') - ) - def display_input(nb_intervals_str, homepage_is_submitted, *_): - if homepage_is_submitted: - if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): - return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] - else: - return None - - 
raise PreventUpdate - - @app.callback( - Output('text-mining-query-saved-input', 'data', allow_duplicate=True), - Input({'type': 'example-text-mining', - 'description': ALL}, 'n_clicks'), - prevent_initial_call=True - ) - def set_input_fields_with_preset_input(example_text_mining_n_clicks): - if ctx.triggered_id and not all(val == 0 for val in example_text_mining_n_clicks): - return ctx.triggered_id['description'] - - raise PreventUpdate - - - @app.callback( - Output('text-mining-query-saved-input', 'data', allow_duplicate=True), - Input('text-mining-query', 'value'), - prevent_initial_call=True - ) - def set_input_fields(query_string): - return query_string - - - @app.callback( - Output('text-mining-query', 'value'), - Input('text-mining-query-saved-input', 'data'), - ) - def get_input_homepage_session_state(query): - return query - - @app.callback( - Output('text-mining-input-error', 'style'), - Output('text-mining-input-error', 'children'), - - Output('text-mining-is-submitted', 'data', allow_duplicate=True), - Output('text-mining-query-submitted-input', - 'data', allow_duplicate=True), - Input('text-mining-submit', 'n_clicks'), - Input('text-mining-query', 'n_submit'), - State('homepage-is-submitted', 'data'), - State('text-mining-query', 'value'), - prevent_initial_call=True - ) - def submit_text_mining_input(text_mining_submitted_n_clicks, text_mining_query_n_submit, homepage_is_submitted, text_mining_query): - if homepage_is_submitted and (text_mining_submitted_n_clicks >= 1 or text_mining_query_n_submit >= 1): - is_there_error, message = is_error(text_mining_query) - - if not is_there_error: - return {'display': 'none'}, message, True, text_mining_query - else: - return {'display': 'block'}, message, False, None - - raise PreventUpdate - - - @app.callback( - Output('text-mining-results-container', 'style'), - Input('text-mining-is-submitted', 'data') - ) - def display_coexpression_output(text_mining_is_submitted): - if text_mining_is_submitted: - return {'display': 'block'} - - else: - return {'display': 'none'} - - - @app.callback( - Output('text-mining-result-table', 'data'), - Output('text-mining-result-table', 'columns'), - Output('text-mining-results-stats', 'children'), - - Input('text-mining-is-submitted', 'data'), - State('homepage-is-submitted', 'data'), - State('text-mining-query-submitted-input', 'data') - ) - def display_text_mining_results(text_mining_is_submitted, homepage_submitted, text_mining_query_submitted_input): - if homepage_submitted and text_mining_is_submitted: - query_string = text_mining_query_submitted_input - - is_there_error, _ = is_error(query_string) - if not is_there_error: - text_mining_results_df = text_mining_query_search(query_string) - - columns = [{'id': x, 'name': x, 'presentation': 'markdown'} - for x in text_mining_results_df.columns] - - num_entries = get_num_entries(text_mining_results_df, "PMID") - num_unique_entries = get_num_unique_entries( - text_mining_results_df, "PMID") - - if num_entries == 1: - stats = f'Found {num_entries} match ' - else: - stats = f'Found {num_entries} matches ' - - if num_unique_entries == 1: - stats += f'across {num_unique_entries} publication' - else: - stats += f'across {num_unique_entries} publications' - - return text_mining_results_df.to_dict('records'), columns, stats - - raise PreventUpdate - - @app.callback( - Output('text-mining-result-table', 'filter_query'), - Input('text-mining-reset-table', 'n_clicks') - ) - def reset_table_filters(*_): - return '' +from dash import Input, Output, State, ctx, 
ALL, html, no_update
+from dash.exceptions import PreventUpdate
+from collections import namedtuple
+
+from .util import *
+from ..lift_over import util as lift_over_util
+
+def init_callback(app):
+
+    # to display user input interval in the top nav
+    @app.callback(
+        Output('text-mining-genomic-intervals-input', 'children'),
+        State('homepage-genomic-intervals-submitted-input', 'data'),
+        Input('homepage-is-submitted', 'data'),
+        Input('text-mining-submit', 'n_clicks')
+    )
+    def display_input(nb_intervals_str, homepage_is_submitted, *_):
+        if homepage_is_submitted:
+            if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)):
+                return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)]
+            else:
+                return None
+
+        raise PreventUpdate
+
+    @app.callback(
+        Output('text-mining-query-saved-input', 'data', allow_duplicate=True),
+        Input({'type': 'example-text-mining',
+               'description': ALL}, 'n_clicks'),
+        prevent_initial_call=True
+    )
+    def set_input_fields_with_preset_input(example_text_mining_n_clicks):
+        if ctx.triggered_id and not all(val == 0 for val in example_text_mining_n_clicks):
+            return ctx.triggered_id['description']
+
+        raise PreventUpdate
+
+    @app.callback(
+        Output('text-mining-query-saved-input', 'data', allow_duplicate=True),
+        Input('text-mining-query', 'value'),
+        prevent_initial_call=True
+    )
+    def set_input_fields(query_string):
+        return query_string
+
+    @app.callback(
+        Output('text-mining-query', 'value'),
+        Input('text-mining-query-saved-input', 'data'),
+    )
+    def get_input_homepage_session_state(query):
+        return query
+
+    @app.callback(
+        Output('text-mining-input-error', 'style'),
+        Output('text-mining-input-error', 'children'),
+
+        Output('text-mining-is-submitted', 'data', allow_duplicate=True),
+        Output('text-mining-query-submitted-input',
+               'data', allow_duplicate=True),
+        Input('text-mining-submit', 'n_clicks'),
+        Input('text-mining-query', 'n_submit'),
+        State('homepage-is-submitted', 'data'),
+        State('text-mining-query', 'value'),
+        prevent_initial_call=True
+    )
+    def submit_text_mining_input(text_mining_submitted_n_clicks, text_mining_query_n_submit, homepage_is_submitted, text_mining_query):
+        if homepage_is_submitted and (text_mining_submitted_n_clicks >= 1 or text_mining_query_n_submit >= 1):
+            is_there_error, message = is_error(text_mining_query)
+
+            if not is_there_error:
+                return {'display': 'none'}, message, True, text_mining_query
+            else:
+                return {'display': 'block'}, message, no_update, no_update
+
+        raise PreventUpdate
+
+    @app.callback(
+        Output('text-mining-results-container', 'style'),
+        Input('text-mining-is-submitted', 'data')
+    )
+    def display_text_mining_output(text_mining_is_submitted):
+        if text_mining_is_submitted:
+            return {'display': 'block'}
+
+        return {'display': 'none'}
+
+    @app.callback(
+        Output('text-mining-submit', 'disabled'),
+        Input('text-mining-submit', 'n_clicks'),
+        Input('text-mining-result-table', 'data'),
+    )
+    def disable_submit_button_during_computation(n_clicks, _table_data):
+        # Disable the button right after it is clicked; once the result table
+        # is updated (i.e., the computation has finished), re-enable it
+        return ctx.triggered_id == 'text-mining-submit' and n_clicks > 0
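The disabled-button wiring above boils down to a small, reusable Dash pattern. A self-contained sketch follows (a standalone toy app with illustrative component IDs, not this project's real layout):

```python
# Minimal sketch of the disable-while-computing pattern used above.
from dash import Dash, Input, Output, ctx, html

app = Dash(__name__)
app.layout = html.Div([
    html.Button('Submit', id='submit', n_clicks=0),
    html.Div(id='result'),
])

@app.callback(Output('submit', 'disabled'),
              Input('submit', 'n_clicks'),
              Input('result', 'children'))
def toggle_submit(n_clicks, _result):
    # A click disables the button; any update to the result re-enables it
    return ctx.triggered_id == 'submit' and n_clicks > 0

@app.callback(Output('result', 'children'),
              Input('submit', 'n_clicks'),
              prevent_initial_call=True)
def long_computation(n_clicks):
    return f'Done (run #{n_clicks})'  # placeholder for the real work

if __name__ == '__main__':
    app.run(debug=True)
```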
+    @app.callback(
+        Output('text-mining-result-table', 'data'),
+        Output('text-mining-result-table', 'columns'),
+        Output('text-mining-results-stats', 'children'),
+
+        State('text-mining-is-submitted', 'data'),
+        State('homepage-is-submitted', 'data'),
+        Input('text-mining-query-submitted-input', 'data')
+    )
+    def display_text_mining_results(text_mining_is_submitted, homepage_submitted, text_mining_query_submitted_input):
+        if homepage_submitted and text_mining_is_submitted:
+            query_string = text_mining_query_submitted_input
+
+            is_there_error, _ = is_error(query_string)
+            if not is_there_error:
+                text_mining_results_df = text_mining_query_search(query_string)
+
+                columns = [{'id': x, 'name': x, 'presentation': 'markdown'}
+                           for x in text_mining_results_df.columns]
+
+                num_entries = get_num_entries(text_mining_results_df, "PMID")
+                num_unique_entries = get_num_unique_entries(
+                    text_mining_results_df, "PMID")
+
+                if num_entries == 1:
+                    stats = f'Found {num_entries} match '
+                else:
+                    stats = f'Found {num_entries} matches '
+
+                if num_unique_entries == 1:
+                    stats += f'across {num_unique_entries} publication'
+                else:
+                    stats += f'across {num_unique_entries} publications'
+
+                return text_mining_results_df.to_dict('records'), columns, stats
+
+        raise PreventUpdate
+
+    @app.callback(
+        Output('text-mining-result-table', 'filter_query'),
+        Input('text-mining-reset-table', 'n_clicks')
+    )
+    def reset_table_filters(*_):
+        return ''
diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py
index cbfdb487..2bf3f6fc 100644
--- a/callbacks/text_mining/util.py
+++ b/callbacks/text_mining/util.py
@@ -1,94 +1,94 @@
-import pandas as pd
-from ..constants import Constants
-from ..general_util import *
-from ..links_util import *
-import regex as re
-import ftfy
-from ..file_util import *
-
-const = Constants()
-COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
-
-
-def sanitize_text(text):
-    # Sanitization of HTML tags should come first
-    text = re.sub(r'<\s+', '<', text)
-    text = re.sub(r'\s+>', '>', text)
-    text = re.sub(r'</\s+', '</', text)
-
-    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'</\s+', '</', text)
+
+    text = re.sub(r'\s+', ' ', text)
-        if homepage_is_submitted and tfbs_submitted_n_clicks >= 1:
-            submitted_input = Tfbs_input(
-                tfbs_set, tfbs_prediction_technique, tfbs_fdr)._asdict()
-
-            return True, submitted_input
-
-        raise PreventUpdate
-
-    @app.callback(
-        Output('tfbs-results-container', 'style'),
-        Input('tfbs-is-submitted', 'data'),
-    )
-    def display_tfbs_output(tfbs_is_submitted):
-        if tfbs_is_submitted:
-            return {'display': 'block'}
-
-        else:
-            return {'display': 'none'}
-
-    @app.callback(
-        Output('tf-enrichment-result-table', 'data'),
-        Output('tf-enrichment-result-table', 'columns'),
-
-        Input('tfbs-is-submitted', 'data'),
-        State('lift-over-nb-entire-table', 'data'),
-        State('tfbs-addl-genes', 'value'),
-
-        State('homepage-genomic-intervals-submitted-input', 'data'),
-
-        State('homepage-is-submitted', 'data'),
-        State('tfbs-submitted-input', 'data')
-    )
-    def display_enrichment_results(tfbs_is_submitted, lift_over_nb_entire_table, submitted_addl_genes,
-                                   nb_interval_str, homepage_submitted, tfbs_submitted_input):
-        if homepage_submitted and tfbs_is_submitted:
-            tfbs_set = tfbs_submitted_input['tfbs_set']
-            tfbs_prediction_technique = tfbs_submitted_input['tfbs_prediction_technique']
-            tfbs_fdr = tfbs_submitted_input['tfbs_fdr']
-
-            if submitted_addl_genes:
-                submitted_addl_genes = submitted_addl_genes.strip()
-            else:
-                submitted_addl_genes = ''
-
-            list_addl_genes = list(
-                filter(None, [gene.strip() for gene in submitted_addl_genes.split(';')]))
-
-            combined_genes = lift_over_nb_entire_table + \
-                get_annotations_addl_gene(list_addl_genes)
-
-            enrichment_results_df = perform_enrichment_all_tf(combined_genes, submitted_addl_genes,
-                                                              tfbs_set, tfbs_prediction_technique, float(tfbs_fdr), nb_interval_str)
-
-            columns = [{'id': x, 'name':
x, 'presentation': 'markdown'} - for x in enrichment_results_df.columns] - - return enrichment_results_df.to_dict('records'), columns - - raise PreventUpdate - - @app.callback( - Output('tfbs-saved-input', 'data', allow_duplicate=True), - Input('tfbs-set', 'value'), - Input('tfbs-prediction-technique', 'value'), - Input('tfbs-fdr', 'value'), - State('homepage-is-submitted', 'data'), - prevent_initial_call=True - ) - def set_input_tfbs_session_state(tfbs_set, tfbs_prediction_technique, tfbs_fdr, homepage_is_submitted): - if homepage_is_submitted: - tfbs_saved_input = Tfbs_input( - tfbs_set, tfbs_prediction_technique, tfbs_fdr)._asdict() - - return tfbs_saved_input - - raise PreventUpdate - - @app.callback( - Output('tfbs-set', 'value'), - Output('tfbs-prediction-technique', 'value'), - State('homepage-is-submitted', 'data'), - State('tfbs-saved-input', 'data'), - Input('homepage-genomic-intervals-submitted-input', 'data') - ) - def get_input_tfbs_session_state(homepage_is_submitted, tfbs_saved_input, *_): - if homepage_is_submitted: - if not tfbs_saved_input: - return 'promoters', 'FunTFBS' - - return tfbs_saved_input['tfbs-set'], tfbs_saved_input['tfbs-prediction-technique'] - - raise PreventUpdate - - @app.callback( - Output('tf-enrichment-result-table', 'filter_query'), - Input('tfbs-reset-table', 'n_clicks') - ) - def reset_table_filters(*_): - return '' - - @app.callback( - Output('tfbs-download-df-to-csv', 'data'), - Input('tfbs-export-table', 'n_clicks'), - State('tf-enrichment-result-table', 'data'), - State('homepage-genomic-intervals-submitted-input', 'data') - ) - def download_tfbs_table_to_csv(download_n_clicks, tfbs_df, genomic_intervals): - if download_n_clicks >= 1: - df = pd.DataFrame(tfbs_df) - return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Regulatory Feature Enrichment.csv', index=False) - - raise PreventUpdate +from dash import Input, Output, State, html, dcc +from dash.exceptions import PreventUpdate +from collections import namedtuple + +from .util import * +from ..lift_over import util as lift_over_util + +Tfbs_input = namedtuple( + 'Tfbs_input', ['tfbs_set', 'tfbs_prediction_technique', 'tfbs_fdr']) + + +def init_callback(app): + @app.callback( + Output('tf-enrichment-genomic-intervals-input', 'children'), + State('homepage-genomic-intervals-submitted-input', 'data'), + Input('homepage-is-submitted', 'data'), + Input('tfbs-submit', 'n_clicks') + ) + def display_input(nb_intervals_str, homepage_is_submitted, *_): + if homepage_is_submitted: + if nb_intervals_str and not lift_over_util.is_error(lift_over_util.get_genomic_intervals_from_input(nb_intervals_str)): + return [html.B('Your Input Intervals: '), html.Span(nb_intervals_str)] + else: + return None + + raise PreventUpdate + + @app.callback( + Output('tfbs-is-submitted', 'data', allow_duplicate=True), + Output('tfbs-submitted-input', 'data', allow_duplicate=True), + + Input('tfbs-submit', 'n_clicks'), + State('homepage-is-submitted', 'data'), + + State('tfbs-addl-genes', 'value'), + State('tfbs-set', 'value'), + State('tfbs-prediction-technique', 'value'), + State('tfbs-fdr', 'value'), + prevent_initial_call=True + ) + def submit_tfbs_input(tfbs_submitted_n_clicks, homepage_is_submitted, addl_genes, tfbs_set, tfbs_prediction_technique, tfbs_fdr): + if homepage_is_submitted and tfbs_submitted_n_clicks >= 1: + submitted_input = Tfbs_input( + tfbs_set, tfbs_prediction_technique, tfbs_fdr)._asdict() + + return True, submitted_input + + raise PreventUpdate + + @app.callback( + 
Output('tfbs-results-container', 'style'), + Input('tfbs-is-submitted', 'data'), + ) + def display_tfbs_output(tfbs_is_submitted): + if tfbs_is_submitted: + return {'display': 'block'} + + else: + return {'display': 'none'} + + @app.callback( + Output('tf-enrichment-result-table', 'data'), + Output('tf-enrichment-result-table', 'columns'), + + Input('tfbs-is-submitted', 'data'), + State('lift-over-nb-entire-table', 'data'), + State('tfbs-addl-genes', 'value'), + + State('homepage-genomic-intervals-submitted-input', 'data'), + + State('homepage-is-submitted', 'data'), + State('tfbs-submitted-input', 'data') + ) + def display_enrichment_results(tfbs_is_submitted, lift_over_nb_entire_table, submitted_addl_genes, + nb_interval_str, homepage_submitted, tfbs_submitted_input): + if homepage_submitted and tfbs_is_submitted: + tfbs_set = tfbs_submitted_input['tfbs_set'] + tfbs_prediction_technique = tfbs_submitted_input['tfbs_prediction_technique'] + tfbs_fdr = tfbs_submitted_input['tfbs_fdr'] + + if submitted_addl_genes: + submitted_addl_genes = submitted_addl_genes.strip() + else: + submitted_addl_genes = '' + + list_addl_genes = list( + filter(None, [gene.strip() for gene in submitted_addl_genes.split(';')])) + + combined_genes = lift_over_nb_entire_table + \ + get_annotations_addl_gene(list_addl_genes) + + enrichment_results_df = perform_enrichment_all_tf(combined_genes, submitted_addl_genes, + tfbs_set, tfbs_prediction_technique, float(tfbs_fdr), nb_interval_str) + + columns = [{'id': x, 'name': x, 'presentation': 'markdown'} + for x in enrichment_results_df.columns] + + return enrichment_results_df.to_dict('records'), columns + + raise PreventUpdate + + @app.callback( + Output('tfbs-saved-input', 'data', allow_duplicate=True), + Input('tfbs-set', 'value'), + Input('tfbs-prediction-technique', 'value'), + Input('tfbs-fdr', 'value'), + State('homepage-is-submitted', 'data'), + prevent_initial_call=True + ) + def set_input_tfbs_session_state(tfbs_set, tfbs_prediction_technique, tfbs_fdr, homepage_is_submitted): + if homepage_is_submitted: + tfbs_saved_input = Tfbs_input( + tfbs_set, tfbs_prediction_technique, tfbs_fdr)._asdict() + + return tfbs_saved_input + + raise PreventUpdate + + @app.callback( + Output('tfbs-set', 'value'), + Output('tfbs-prediction-technique', 'value'), + State('homepage-is-submitted', 'data'), + State('tfbs-saved-input', 'data'), + Input('homepage-genomic-intervals-submitted-input', 'data') + ) + def get_input_tfbs_session_state(homepage_is_submitted, tfbs_saved_input, *_): + if homepage_is_submitted: + if not tfbs_saved_input: + return 'promoters', 'FunTFBS' + + return tfbs_saved_input['tfbs-set'], tfbs_saved_input['tfbs-prediction-technique'] + + raise PreventUpdate + + @app.callback( + Output('tf-enrichment-result-table', 'filter_query'), + Input('tfbs-reset-table', 'n_clicks') + ) + def reset_table_filters(*_): + return '' + + @app.callback( + Output('tfbs-download-df-to-csv', 'data'), + Input('tfbs-export-table', 'n_clicks'), + State('tf-enrichment-result-table', 'data'), + State('homepage-genomic-intervals-submitted-input', 'data') + ) + def download_tfbs_table_to_csv(download_n_clicks, tfbs_df, genomic_intervals): + if download_n_clicks >= 1: + df = pd.DataFrame(tfbs_df) + return dcc.send_data_frame(df.to_csv, f'[{genomic_intervals}] Regulatory Feature Enrichment.csv', index=False) + + raise PreventUpdate diff --git a/callbacks/tf_enrich/util.py b/callbacks/tf_enrich/util.py index c000d871..294d6628 100644 --- a/callbacks/tf_enrich/util.py +++ 
b/callbacks/tf_enrich/util.py @@ -1,180 +1,180 @@ -import pandas as pd -import os -import subprocess -import statsmodels.stats.multitest as sm -import pickle -from ..file_util import * -from ..constants import Constants -from ..general_util import * - -import gffutils - -const = Constants() - -COLUMNS = ['Transcription Factor', 'Family', - 'p-value', 'Adj. p-value', 'Significant?'] - - -def create_empty_df(): - return create_empty_df_with_cols(['Transcription Factor', 'p-value', 'adj. p-value']) - - -def get_annotations_addl_gene(addl_genes): - db = gffutils.FeatureDB( - f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True) - - return [{'ogi': None, - 'name': addl_gene, - 'Chromosome': db[addl_gene].chrom, - 'Start': db[addl_gene].start, - 'End': db[addl_gene].end, - 'Strand': db[addl_gene].strand} for addl_gene in addl_genes] - -# gene_table is a list of dictionaries, each dictionary of this kind: {'ogi': 'OGI:01005230', 'name': 'LOC_Os01g03710', 'chrom': 'Chr01', 'start': 1534135, 'end': 1539627, 'strand': '+'} - - -def write_query_promoter_intervals_to_file(gene_table, nb_interval_str, addl_genes, upstream_win_len=500, downstream_win_len=100): - make_dir(get_path_to_temp(nb_interval_str, const.TEMP_TFBS)) - filepath = get_path_to_temp( - nb_interval_str, const.TEMP_TFBS, addl_genes, const.PROMOTER_BED) - with open(filepath, "w") as f: - for gene in gene_table: - if gene['Strand'] == '+': - promoter_start = gene['Start'] - upstream_win_len - assert promoter_start >= 0 - promoter_end = gene['Start'] + downstream_win_len - 1 - f.write("{}\t{}\t{}\n".format( - gene['Chromosome'], promoter_start, promoter_end)) - elif gene['Strand'] == '-': - promoter_start = gene['End'] + upstream_win_len - promoter_end = gene['End'] + 1 - downstream_win_len - assert promoter_end >= 0 - f.write("{}\t{}\t{}\n".format( - gene['Chromosome'], promoter_end, promoter_start)) - return filepath - - -def write_query_genome_intervals_to_file(nb_interval_str, addl_genes): - make_dir(get_path_to_temp(nb_interval_str, const.TEMP_TFBS, addl_genes)) - filepath = get_path_to_temp( - nb_interval_str, const.TEMP_TFBS, const.GENOME_WIDE_BED) - with open(filepath, "w") as f: - for interval in nb_interval_str.split(";"): - chrom, range = interval.split(":") - beg, end = range.split("-") - f.write("{}\t{}\t{}\n".format(chrom, beg, end)) - return filepath - - -def perform_enrichment_all_tf(lift_over_nb_entire_table, addl_genes, tfbs_set, tfbs_prediction_technique, tfbs_fdr, nb_interval_str): - out_dir = get_path_to_temp( - nb_interval_str, const.TEMP_TFBS, addl_genes, tfbs_set, tfbs_prediction_technique) - # if previously computed - if path_exists(f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv'): - results_df = pd.read_csv( - f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', dtype=object) - - results_df['Family'] = results_df['Transcription Factor'].apply( - get_family) - - results_df = results_df[COLUMNS] - - return results_df - - # single-TF p-values already computed, but not BH_corrected, possibly FDR value changed - elif path_exists(f'{out_dir}/results_before_multiple_corrections.csv'): - results_before_multiple_corrections = pd.read_csv( - f'{out_dir}/results_before_multiple_corrections.csv') - results_df = multiple_testing_correction(results_before_multiple_corrections, - float(tfbs_fdr)) - results_df.to_csv( - f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', index=False) - - results_df['Family'] = results_df['Transcription Factor'].apply( - get_family) - - results_df = results_df[COLUMNS] - - return results_df - - 
make_dir(out_dir) - - # construct query BED file - out_dir_tf_enrich = get_path_to_temp( - nb_interval_str, const.TEMP_TFBS, addl_genes) - if tfbs_set == 'promoters': - query_bed = write_query_promoter_intervals_to_file( - lift_over_nb_entire_table, nb_interval_str, addl_genes) - sizes = f'{const.TFBS_BEDS}/sizes/{tfbs_set}' - elif tfbs_set == 'genome': - query_bed = write_query_genome_intervals_to_file( - nb_interval_str, addl_genes) - sizes = f'{const.TFBS_BEDS}/sizes/{tfbs_set}' - - TF_list = [] - # keep together using a dict? but BH correction needs a separate list of p_values - pvalue_list = [] - - # perform annotation overlap statistical significance tests - for tf in os.listdir(os.path.join(const.TFBS_BEDS, tfbs_set, tfbs_prediction_technique, "intervals")): - # print("computing overlaps for: {}".format(tf)) - ref_bed = f'{const.TFBS_BEDS}/{tfbs_set}/{tfbs_prediction_technique}/intervals/{tf}' - out_dir_tf = f'{out_dir}/{tf}' - make_dir(out_dir_tf) - - p_value = perform_enrichment_specific_tf( - ref_bed, query_bed, sizes, out_dir_tf) - - TF_list.append(tf) - pvalue_list.append(p_value) - - results_no_adj_df = pd.DataFrame(list((zip(TF_list, pvalue_list))), columns=[ - "Transcription Factor", "p-value"]) - results_no_adj_df.to_csv( - f'{out_dir}/results_before_multiple_corrections.csv', index=False) - - results_df = multiple_testing_correction(results_no_adj_df, tfbs_fdr) - display_cols_in_sci_notation(results_df, ['p-value', 'Adj. p-value']) - - results_df.to_csv( - f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', index=False) - - results_df['Family'] = results_df['Transcription Factor'].apply( - get_family) - - results_df = results_df[COLUMNS] - - return results_df - - -def perform_enrichment_specific_tf(ref_bed, query_bed, sizes, out_dir): - summary_file = f'{out_dir}/summary.txt' - - if not path_exists(summary_file): - subprocess.run(["mcdp2", "single", ref_bed, query_bed, sizes, "-o", out_dir], - shell=False, capture_output=True, text=True) # TODO exception handling - - with open(f'{out_dir}/summary.txt') as f: - content = f.readlines() - p_value = float(content[3].rstrip().split(":")[1]) - return p_value - - -def multiple_testing_correction(single_tf_results, fdr): - pvalues = single_tf_results['p-value'].tolist() - sig, adj_pvalue, _, _ = sm.multipletests( - pvalues, alpha=fdr, method='fdr_bh', is_sorted=False, returnsorted=False) - sig = sig.tolist() - sig = list(map(str, sig)) - adj_pvalue = adj_pvalue.tolist() - single_tf_results['Adj. p-value'] = adj_pvalue - single_tf_results['Significant?'] = sig - single_tf_results.sort_values(by=['p-value'], inplace=True) - return single_tf_results - - -def get_family(transcription_factor): - with open(f'{const.TFBS_ANNOTATION}/family_mapping.pickle', 'rb') as f: - mapping = pickle.load(f) - - return ', '.join(mapping[transcription_factor]) +import pandas as pd +import os +import subprocess +import statsmodels.stats.multitest as sm +import pickle +from ..file_util import * +from ..constants import Constants +from ..general_util import * + +import gffutils + +const = Constants() + +COLUMNS = ['Transcription Factor', 'Family', + 'p-value', 'Adj. p-value', 'Significant?'] + + +def create_empty_df(): + return create_empty_df_with_cols(['Transcription Factor', 'p-value', 'adj. 
p-value']) + + +def get_annotations_addl_gene(addl_genes): + db = gffutils.FeatureDB( + f'{const.ANNOTATIONS}/Nb/IRGSPMSU.gff.db', keep_order=True) + + return [{'ogi': None, + 'name': addl_gene, + 'Chromosome': db[addl_gene].chrom, + 'Start': db[addl_gene].start, + 'End': db[addl_gene].end, + 'Strand': db[addl_gene].strand} for addl_gene in addl_genes] + +# gene_table is a list of dictionaries, each dictionary of this kind: {'ogi': 'OGI:01005230', 'name': 'LOC_Os01g03710', 'chrom': 'Chr01', 'start': 1534135, 'end': 1539627, 'strand': '+'} + + +def write_query_promoter_intervals_to_file(gene_table, nb_interval_str, addl_genes, upstream_win_len=500, downstream_win_len=100): + make_dir(get_path_to_temp(nb_interval_str, const.TEMP_TFBS)) + filepath = get_path_to_temp( + nb_interval_str, const.TEMP_TFBS, addl_genes, const.PROMOTER_BED) + with open(filepath, "w") as f: + for gene in gene_table: + if gene['Strand'] == '+': + promoter_start = gene['Start'] - upstream_win_len + assert promoter_start >= 0 + promoter_end = gene['Start'] + downstream_win_len - 1 + f.write("{}\t{}\t{}\n".format( + gene['Chromosome'], promoter_start, promoter_end)) + elif gene['Strand'] == '-': + promoter_start = gene['End'] + upstream_win_len + promoter_end = gene['End'] + 1 - downstream_win_len + assert promoter_end >= 0 + f.write("{}\t{}\t{}\n".format( + gene['Chromosome'], promoter_end, promoter_start)) + return filepath + + +def write_query_genome_intervals_to_file(nb_interval_str, addl_genes): + make_dir(get_path_to_temp(nb_interval_str, const.TEMP_TFBS, addl_genes)) + filepath = get_path_to_temp( + nb_interval_str, const.TEMP_TFBS, const.GENOME_WIDE_BED) + with open(filepath, "w") as f: + for interval in nb_interval_str.split(";"): + chrom, range = interval.split(":") + beg, end = range.split("-") + f.write("{}\t{}\t{}\n".format(chrom, beg, end)) + return filepath + + +def perform_enrichment_all_tf(lift_over_nb_entire_table, addl_genes, tfbs_set, tfbs_prediction_technique, tfbs_fdr, nb_interval_str): + out_dir = get_path_to_temp( + nb_interval_str, const.TEMP_TFBS, addl_genes, tfbs_set, tfbs_prediction_technique) + # if previously computed + if path_exists(f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv'): + results_df = pd.read_csv( + f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', dtype=object) + + results_df['Family'] = results_df['Transcription Factor'].apply( + get_family) + + results_df = results_df[COLUMNS] + + return results_df + + # single-TF p-values already computed, but not BH_corrected, possibly FDR value changed + elif path_exists(f'{out_dir}/results_before_multiple_corrections.csv'): + results_before_multiple_corrections = pd.read_csv( + f'{out_dir}/results_before_multiple_corrections.csv') + results_df = multiple_testing_correction(results_before_multiple_corrections, + float(tfbs_fdr)) + results_df.to_csv( + f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', index=False) + + results_df['Family'] = results_df['Transcription Factor'].apply( + get_family) + + results_df = results_df[COLUMNS] + + return results_df + + make_dir(out_dir) + + # construct query BED file + out_dir_tf_enrich = get_path_to_temp( + nb_interval_str, const.TEMP_TFBS, addl_genes) + if tfbs_set == 'promoters': + query_bed = write_query_promoter_intervals_to_file( + lift_over_nb_entire_table, nb_interval_str, addl_genes) + sizes = f'{const.TFBS_BEDS}/sizes/{tfbs_set}' + elif tfbs_set == 'genome': + query_bed = write_query_genome_intervals_to_file( + nb_interval_str, addl_genes) + sizes = f'{const.TFBS_BEDS}/sizes/{tfbs_set}' + + TF_list = 
+    # BH correction needs the p-values as a flat list, so they are kept separate from TF_list
+    pvalue_list = []
+
+    # perform annotation overlap statistical significance tests
+    for tf in os.listdir(os.path.join(const.TFBS_BEDS, tfbs_set, tfbs_prediction_technique, "intervals")):
+        ref_bed = f'{const.TFBS_BEDS}/{tfbs_set}/{tfbs_prediction_technique}/intervals/{tf}'
+        out_dir_tf = f'{out_dir}/{tf}'
+        make_dir(out_dir_tf)
+
+        p_value = perform_enrichment_specific_tf(
+            ref_bed, query_bed, sizes, out_dir_tf)
+
+        TF_list.append(tf)
+        pvalue_list.append(p_value)
+
+    results_no_adj_df = pd.DataFrame(list(zip(TF_list, pvalue_list)), columns=[
+        "Transcription Factor", "p-value"])
+    results_no_adj_df.to_csv(
+        f'{out_dir}/results_before_multiple_corrections.csv', index=False)
+
+    results_df = multiple_testing_correction(results_no_adj_df, tfbs_fdr)
+    display_cols_in_sci_notation(results_df, ['p-value', 'Adj. p-value'])
+
+    results_df.to_csv(
+        f'{out_dir}/BH_corrected_fdr_{tfbs_fdr}.csv', index=False)
+
+    results_df['Family'] = results_df['Transcription Factor'].apply(
+        get_family)
+
+    results_df = results_df[COLUMNS]
+
+    return results_df
+
+
+def perform_enrichment_specific_tf(ref_bed, query_bed, sizes, out_dir):
+    summary_file = f'{out_dir}/summary.txt'
+
+    if not path_exists(summary_file):
+        subprocess.run(["mcdp2", "single", ref_bed, query_bed, sizes, "-o", out_dir],
+                       shell=False, capture_output=True, text=True)  # TODO exception handling
+
+    # assumes that mcdp2 reports the p-value as '<label>: <value>' on the fourth line of summary.txt
+    with open(summary_file) as f:
+        content = f.readlines()
+        p_value = float(content[3].rstrip().split(":")[1])
+    return p_value
+
+
+def multiple_testing_correction(single_tf_results, fdr):
+    pvalues = single_tf_results['p-value'].tolist()
+    # Benjamini-Hochberg correction across all tested TFs
+    sig, adj_pvalue, _, _ = sm.multipletests(
+        pvalues, alpha=fdr, method='fdr_bh', is_sorted=False, returnsorted=False)
+    sig = sig.tolist()
+    sig = list(map(str, sig))
+    adj_pvalue = adj_pvalue.tolist()
+    single_tf_results['Adj. 
p-value'] = adj_pvalue + single_tf_results['Significant?'] = sig + single_tf_results.sort_values(by=['p-value'], inplace=True) + return single_tf_results + + +def get_family(transcription_factor): + with open(f'{const.TFBS_ANNOTATION}/family_mapping.pickle', 'rb') as f: + mapping = pickle.load(f) + + return ', '.join(mapping[transcription_factor]) diff --git a/dependencies/install-libraries-workflow.r b/dependencies/install-libraries-workflow.r index 318cf77b..c52df4ed 100644 --- a/dependencies/install-libraries-workflow.r +++ b/dependencies/install-libraries-workflow.r @@ -1,11 +1,11 @@ -install.packages("optparse", repos = "http://cran.us.r-project.org") -install.packages("ggplot2", repos = "http://cran.us.r-project.org") - -install.packages("BiocManager", repos = "http://cran.us.r-project.org") -library(BiocManager) -BiocManager::install("clusterProfiler", version = "3.17", ask = FALSE) -BiocManager::install("GO.db", version = "3.17", ask = FALSE) - -BiocManager::install("graphite", version = "3.17", ask = FALSE) -BiocManager::install("ROntoTools", version = "3.17", ask = FALSE) -BiocManager::install("SPIA", version = "3.17", ask = FALSE) +install.packages("optparse", repos = "http://cran.us.r-project.org") +install.packages("ggplot2", repos = "http://cran.us.r-project.org") + +install.packages("BiocManager", repos = "http://cran.us.r-project.org") +library(BiocManager) +BiocManager::install("clusterProfiler", version = "3.17", ask = FALSE) +BiocManager::install("GO.db", version = "3.17", ask = FALSE) + +BiocManager::install("graphite", version = "3.17", ask = FALSE) +BiocManager::install("ROntoTools", version = "3.17", ask = FALSE) +BiocManager::install("SPIA", version = "3.17", ask = FALSE) diff --git a/dependencies/requirements-app.txt b/dependencies/requirements-app.txt index 3db5ef2d..0b934d4b 100644 --- a/dependencies/requirements-app.txt +++ b/dependencies/requirements-app.txt @@ -1,12 +1,12 @@ -dash==2.9.1 -dash_bootstrap_components==1.4.1 -dash_cytoscape==0.3.0 -gffutils==0.11.1 -networkx>=2.8.7 -pandas==1.5.1 -dash_bio==1.0.2 -pybind11==2.10.4 -statsmodels==0.14.0 -regex==2023.8.8 -ftfy==6.1.1 -scipy==1.11.2 +dash==2.9.1 +dash_bootstrap_components==1.4.1 +dash_cytoscape==0.3.0 +gffutils==0.11.1 +networkx>=2.8.7 +pandas==1.5.1 +dash_bio==1.0.2 +pybind11==2.10.4 +statsmodels==0.14.0 +regex==2023.8.8 +ftfy==6.1.1 +scipy==1.11.2 diff --git a/dependencies/requirements-workflow.txt b/dependencies/requirements-workflow.txt index 56b41d9a..ae81be63 100644 --- a/dependencies/requirements-workflow.txt +++ b/dependencies/requirements-workflow.txt @@ -1,14 +1,14 @@ -dash==2.9.1 -dash_bootstrap_components==1.4.1 -dash_cytoscape==0.3.0 -gffutils==0.11.1 -networkx>=2.8.7 -pandas==1.5.1 -dash_bio==1.0.2 -pybind11==2.10.4 -statsmodels==0.14.0 -regex==2023.8.8 -ftfy==6.1.1 -scipy==1.11.2 -cdlib[all]==0.3.0 -nltk==3.8.1 +dash==2.9.1 +dash_bootstrap_components==1.4.1 +dash_cytoscape==0.3.0 +gffutils==0.11.1 +networkx>=2.8.7 +pandas==1.5.1 +dash_bio==1.0.2 +pybind11==2.10.4 +statsmodels==0.14.0 +regex==2023.8.8 +ftfy==6.1.1 +scipy==1.11.2 +cdlib[all]==0.3.0 +nltk==3.8.1 diff --git a/pages/analysis/browse_loci.py b/pages/analysis/browse_loci.py index 64d3c4b4..1e96dbcf 100644 --- a/pages/analysis/browse_loci.py +++ b/pages/analysis/browse_loci.py @@ -1,61 +1,61 @@ -from dash import dcc, html -import dash_bootstrap_components as dbc -from callbacks.constants import Constants -const = Constants() - -layout = html.Div( - id={ - 'type': 'analysis-layout', - 'label': const.IGV - }, - hidden=True, - 
children=[ - html.Div([ - html.P('WRITE ME') - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), - html.Span(id='igv-genomic-intervals-input'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - dbc.Label('Select an interval: ', - className='mb-2'), - - dcc.Dropdown( - id='igv-genomic-intervals', - ), - - html.Br(), - - dbc.Button('Submit', - id='igv-submit', - n_clicks=0, - className='page-button'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div( - id='igv-results-container', - style={'display': 'none'}, - children=[ - html.Hr(className='mt-3 mb-4'), - dbc.Label(id='igv-track-intro'), - - dcc.Loading(dbc.Checklist(id='igv-track-filter', - inline=True, - className='ms-3')), - - html.Br(), - - dcc.Loading(id='igv-display') - ] - ) - ], className='mt-2' -) +from dash import dcc, html +import dash_bootstrap_components as dbc +from callbacks.constants import Constants +const = Constants() + +layout = html.Div( + id={ + 'type': 'analysis-layout', + 'label': const.IGV + }, + hidden=True, + children=[ + html.Div([ + html.P('WRITE ME') + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), + html.Span(id='igv-genomic-intervals-input'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + dbc.Label('Select an interval: ', + className='mb-2'), + + dcc.Dropdown( + id='igv-genomic-intervals', + ), + + html.Br(), + + dbc.Button('Submit', + id='igv-submit', + n_clicks=0, + className='page-button'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div( + id='igv-results-container', + style={'display': 'none'}, + children=[ + html.Hr(className='mt-3 mb-4'), + dbc.Label(id='igv-track-intro'), + + dcc.Loading(dbc.Checklist(id='igv-track-filter', + inline=True, + className='ms-3')), + + html.Br(), + + dcc.Loading(id='igv-display') + ] + ) + ], className='mt-2' +) diff --git a/pages/analysis/co_expr.py b/pages/analysis/co_expr.py index 1dcdc12c..93bfffb6 100644 --- a/pages/analysis/co_expr.py +++ b/pages/analysis/co_expr.py @@ -1,401 +1,401 @@ -import dash_bootstrap_components as dbc -import dash_cytoscape as cyto -from dash import dash_table, dcc, html -from callbacks.constants import Constants -from callbacks.coexpression.util import * - - -const = Constants() - -coach = html.Li( - [html.B('COACH'), - html.Span( - ' Detects highly connected gene subnetworks (referred to as "cores") and expands them by including closely associated genes', - className='algo-desc'), - html.Div([ - html.Span( - 'Wu, M., Li, X., Kwoh, C. K., & Ng, S. K. (2009). A core-attachment based method to detect protein complexes in PPI networks. '), - html.I('BMC Bioinformatics, 10'), - html.Span('(169). '), - html.A('https://doi.org/10.1186/1471-2105-10-169', - href='https://doi.org/10.1186/1471-2105-10-169', - target='_blank')], - className='reference' - )] -) - -demon = html.Li( - [html.B('DEMON'), - html.Span( - ' Adopts a bottom-up approach where genes "vote" to determine the subnetwork to which connected genes belong', - className='algo-desc'), - html.Div([ - html.Span( - 'Coscia, M., Rossetti, G., Giannotti, F., & Pedreschi, D. (2012). DEMON: A local-first discovery method for overlapping communities. In '), - html.I('KDD\'12: Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining '), - html.Span('(pp. 615–623). Association for Computing Machinery. 
'), - html.A('https://doi.org/10.1145/2339530.2339630', - href='https://doi.org/10.1145/2339530.2339630', - target='_blank')], - className='reference' - )] -) - -clusterone = html.Li( - [html.B('ClusterONE'), - html.Span( - ' Forms cohesive gene subnetworks from an initial set of seed genes. ', - className='algo-desc'), - html.Div([ - html.Span( - 'Nepusz, T., Yu, H., & Paccanaro, A. (2012). Detecting overlapping protein complexes in protein-protein interaction networks. '), - html.I('Nature Methods, 9, '), - html.Span('471–472. '), - html.A('https://doi.org/10.1038/nmeth.1938', - href='https://doi.org/10.1038/nmeth.1938', - target='_blank')], - className='reference' - )], -) - -fox = html.Li( - [html.B('FOX'), - html.Span( - ' Determines the membership of a gene to a subnetwork by counting the number of triangles formed by the gene with other genes in the subnetwork', - className='algo-desc'), - html.Div([ - html.Span( - 'Lyu, T., Bing, L., Zhang, Z., & Zhang, Y. (2020). FOX: Fast overlapping community detection algorithm in big weighted networks. '), - html.I('ACM Transactions on Social Computing, 3'), - html.Span('(3), 1–23. '), - html.A('https://doi.org/10.1145/3404970', - href='https://doi.org/10.1145/3404970', - target='_blank')], - className='reference' - )], -) - -module_detection_algo_modal = dbc.Modal([ - dbc.ModalHeader( - dbc.ModalTitle('Module Detection Algorithms') - ), - dbc.ModalBody([ - html.P( - 'Since genes can possibly be involved in multiple biological functions or processes, the algorithms supported by RicePilaf allow for overlapping modules (that is, a given gene may belong to multiple modules):'), - html.Ul([ - clusterone, html.Br(), coach, html.Br(), demon, html.Br(), fox - ]) - ])], - id='coexpression-clustering-algo-modal', - is_open=False, - size='xl' -) - - -# ============ -# Main Layout -# ============ - -layout = html.Div( - id={ - 'type': 'analysis-layout', - 'label': const.COEXPRESSION - }, - hidden=True, - - children=[ - - html.Div([ - html.P('In this page, you can search for modules (a.k.a. communities, clusters) in rice co-expression networks, which are significantly enriched in the genes implicated by your GWAS. 
' - 'Likely functions of the modules are inferred by enrichment analysis against several ontologies and pathway databases.') - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), - html.Span(id='coexpression-genomic-intervals-input'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - dbc.Label( - 'Include additional genes from the pan-genome lift-over or the text mining results'), - html.Br(), - dbc.Label( - 'Enter their MSU accession IDs, separated by a semicolon (e.g., LOC_Os01g03680;LOC_Os01g03690;LOC_Os01g04110)', - className='small text-muted'), - - dbc.Textarea(id='coexpression-addl-genes'), - - html.Br(), - - dbc.Label(['Select the co-expression network', - html.I( - className='bi bi-info-circle', id='coexpression-network-tooltip')]), - - html.Br(), - - dbc.RadioItems( - id='coexpression-network', - options=COEXPRESSION_NETWORKS_VALUE_LABEL, - value='OS-CX', - inline=True, - className='ms-3 mt-1' - ), - - html.Br(), - - dbc.Label(['Select a module detection algorithm ', - html.I( - className='bi bi-info-circle', - id='coexpression-clustering-algo-tooltip', - n_clicks=0 - )]), - - module_detection_algo_modal, - - html.Br(), - - dbc.RadioItems( - id='coexpression-clustering-algo', - options=MODULE_DETECTION_ALGOS_VALUE_LABEL, - value='clusterone', - inline=True, - className='ms-3 mt-1' - ), - - html.Br(), - - dbc.Label(['Select the ', - html.Span('parameter for running the algorithm', - id='coexpression-parameter-name'), - html.I( - className='bi bi-info-circle', id='coexpression-parameter-tooltip')], - className='mb-4'), - - # Should also be changed if parameter space is changed - html.Div([dcc.Slider(id='coexpression-parameter-slider', step=None, - marks={0: '1 (Loose Modules)', 30: '2', 60: '3', - 90: '4 (Dense Modules)'}, - value=30)], - id='coexpression-parameter-slider-container'), - - html.Br(), - - dbc.Button('Run Analysis', - id='coexpression-submit', - className='page-button', - n_clicks=0), - ], className='analysis-intro p-3'), - - - html.Br(), - - html.Div( - id='coexpression-results-container', - style={'display': 'none'}, - children=[ - dcc.Loading( - id='coexpression-loading', - children=[ - html.Hr(className='mt-3 mb-3'), - - html.Br(), - - html.Div( - id='coexpression-input', - className='analysis-intro p-3' - ), - - html.Br(), - - html.Div([ - html.Div([ - html.Span(id='coexpression-module-stats') - ], className='stats'), - - html.Div( - id='coexpression-results-module-tabs-container', - children=[ - dbc.Label('Select an enriched module'), - dcc.Dropdown(id='coexpression-modules'), - ], - className='pt-3 pb-2' - ) - ], className='analysis-intro p-3') - ] - ), - - html.Div( - id='coexpression-graph-container', - children=[ - html.Div( - id='coexpression-table-container', - children=[ - html.Br(), - - html.Div([ - html.Span(id='coexpression-graph-stats') - ], className='mb-3 stats'), - - html.P( - 'Click on the tabs below to see the ontology terms and pathways in which the module is enriched.', - className='mb-4'), - - dbc.Tabs( - id='coexpression-modules-pathway', - active_tab='tab-0', - children=[ - dcc.Tab(label='Gene Ontology', - value='Gene Ontology'), - dcc.Tab(label='Trait Ontology', - value='Trait Ontology'), - dcc.Tab(label='Plant Ontology', - value='Plant Ontology'), - dcc.Tab(label='Pathways (Over-Representation)', - value='Pathways (Over-Representation)'), - dcc.Tab(label='Pathway-Express', - value='Pathway-Express'), - dcc.Tab(label='SPIA', - value='SPIA') - 
] - ), - - html.Br(), - - dcc.Loading([ - html.P( - html.Div([ - html.Div([ - html.Span( - id='coexpression-table-stats') - ], className='mb-3 stats'), - dbc.Button([html.I( - className='bi bi-download me-2'), - 'Export to CSV'], - id='coexpression-export-table', - n_clicks=0, - color='light', size='sm', className='table-button'), - dcc.Download( - id='coexpression-download-df-to-csv'), - dbc.Button([html.I( - className='bi bi-arrow-clockwise me-2'), - 'Reset Table'], - id='coexpression-reset-table', - color='light', size='sm', className='ms-3 table-button') - ], style={'textAlign': 'right'}) - ), - - dash_table.DataTable( - id='coexpression-pathways', - style_cell={ - 'whiteSpace': 'pre-line' - }, - markdown_options={'html': True}, - sort_action='native', - filter_action='native', - filter_options={'case': 'insensitive', - 'placeholder_text': '🔎︎ Search Column'}, - page_action='native', - page_size=15, - cell_selectable=False - ) - ]), - - html.Br(), - html.Br(), - - html.Div([ - html.P( - 'The graph below shows the selected module. The connections indicate that the genes are co-expressed. The shaded nodes refer to the genes implicated by your GWAS/QTL, including those that you manually added.'), - - dbc.Label( - 'Select the graph display layout'), - - dbc.RadioItems( - id='coexpression-graph-layout', - options=[ - {'value': 'circle', 'label': 'Circle', - 'label_id': 'circle'}, - {'value': 'grid', 'label': 'Grid', - 'label_id': 'grid'} - ], - value='circle', - inline=True, - className='ms-3 mb-3', - ), - - html.P( - 'Click "Reset Graph Display" to position the graph at the center and restore default zoom settings.') - ], className='analysis-intro p-3'), - - html.Br(), - - html.P( - html.Div([ - dbc.Button([html.I( - className='bi bi-download me-2'), - 'Export Edge List'], - id='coexpression-export-graph', - color='light', size='sm', - n_clicks=0, - className='table-button'), - dcc.Download( - id='coexpression-download-graph-to-json'), - dbc.Button([html.I( - className='bi bi-arrow-clockwise me-2'), - 'Reset Graph Display'], - id='coexpression-reset-graph', - n_clicks=0, - color='light', size='sm', - className='ms-3 table-button') - ], style={'textAlign': 'right'}) - ), - ] - ), - - dcc.Loading([ - cyto.Cytoscape( - id='coexpression-module-graph', - className='mb-3', - layout={'name': 'circle'}, - style={'width': '100%', - 'height': '100vh'}, # Should be here (otherwise, initial loading does not consume entire width and height) - stylesheet=[ - { - 'selector': 'node', - 'style': { - 'content': 'data(id)', - 'height': '5px', - 'width': '5px', - 'font-size': '10px' - } - }, - { - 'selector': 'edge', - 'style': { - 'width': '1px', - } - }, - { - 'selector': '.shaded', - 'style': { - 'background-color': '#254b5d', - 'line-color': '#254b5d', - 'height': '20px', - 'width': '20px' - } - } - ] - ), - ]), - - html.Div(id='coexpression-extra-bottom-div', - className='mb-4') - ] - ) - ] - ) - ], className='mt-2') +import dash_bootstrap_components as dbc +import dash_cytoscape as cyto +from dash import dash_table, dcc, html +from callbacks.constants import Constants +from callbacks.coexpression.util import * + + +const = Constants() + +coach = html.Li( + [html.B('COACH'), + html.Span( + ' Detects highly connected gene subnetworks (referred to as "cores") and expands them by including closely associated genes', + className='algo-desc'), + html.Div([ + html.Span( + 'Wu, M., Li, X., Kwoh, C. K., & Ng, S. K. (2009). A core-attachment based method to detect protein complexes in PPI networks. 
'), + html.I('BMC Bioinformatics, 10'), + html.Span('(169). '), + html.A('https://doi.org/10.1186/1471-2105-10-169', + href='https://doi.org/10.1186/1471-2105-10-169', + target='_blank')], + className='reference' + )] +) + +demon = html.Li( + [html.B('DEMON'), + html.Span( + ' Adopts a bottom-up approach where genes "vote" to determine the subnetwork to which connected genes belong', + className='algo-desc'), + html.Div([ + html.Span( + 'Coscia, M., Rossetti, G., Giannotti, F., & Pedreschi, D. (2012). DEMON: A local-first discovery method for overlapping communities. In '), + html.I('KDD\'12: Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining '), + html.Span('(pp. 615–623). Association for Computing Machinery. '), + html.A('https://doi.org/10.1145/2339530.2339630', + href='https://doi.org/10.1145/2339530.2339630', + target='_blank')], + className='reference' + )] +) + +clusterone = html.Li( + [html.B('ClusterONE'), + html.Span( + ' Forms cohesive gene subnetworks from an initial set of seed genes. ', + className='algo-desc'), + html.Div([ + html.Span( + 'Nepusz, T., Yu, H., & Paccanaro, A. (2012). Detecting overlapping protein complexes in protein-protein interaction networks. '), + html.I('Nature Methods, 9, '), + html.Span('471–472. '), + html.A('https://doi.org/10.1038/nmeth.1938', + href='https://doi.org/10.1038/nmeth.1938', + target='_blank')], + className='reference' + )], +) + +fox = html.Li( + [html.B('FOX'), + html.Span( + ' Determines the membership of a gene to a subnetwork by counting the number of triangles formed by the gene with other genes in the subnetwork', + className='algo-desc'), + html.Div([ + html.Span( + 'Lyu, T., Bing, L., Zhang, Z., & Zhang, Y. (2020). FOX: Fast overlapping community detection algorithm in big weighted networks. '), + html.I('ACM Transactions on Social Computing, 3'), + html.Span('(3), 1–23. '), + html.A('https://doi.org/10.1145/3404970', + href='https://doi.org/10.1145/3404970', + target='_blank')], + className='reference' + )], +) + +module_detection_algo_modal = dbc.Modal([ + dbc.ModalHeader( + dbc.ModalTitle('Module Detection Algorithms') + ), + dbc.ModalBody([ + html.P( + 'Since genes can possibly be involved in multiple biological functions or processes, the algorithms supported by RicePilaf allow for overlapping modules (that is, a given gene may belong to multiple modules):'), + html.Ul([ + clusterone, html.Br(), coach, html.Br(), demon, html.Br(), fox + ]) + ])], + id='coexpression-clustering-algo-modal', + is_open=False, + size='xl' +) + + +# ============ +# Main Layout +# ============ + +layout = html.Div( + id={ + 'type': 'analysis-layout', + 'label': const.COEXPRESSION + }, + hidden=True, + + children=[ + + html.Div([ + html.P('In this page, you can search for modules (a.k.a. communities, clusters) in rice co-expression networks, which are significantly enriched in the genes implicated by your GWAS. 
' + 'Likely functions of the modules are inferred by enrichment analysis against several ontologies and pathway databases.') + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), + html.Span(id='coexpression-genomic-intervals-input'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + dbc.Label( + 'Include additional genes from the pan-genome lift-over or the text mining results'), + html.Br(), + dbc.Label( + 'Enter their MSU accession IDs, separated by a semicolon (e.g., LOC_Os01g03680;LOC_Os01g03690;LOC_Os01g04110)', + className='small text-muted'), + + dbc.Textarea(id='coexpression-addl-genes'), + + html.Br(), + + dbc.Label(['Select the co-expression network', + html.I( + className='bi bi-info-circle', id='coexpression-network-tooltip')]), + + html.Br(), + + dbc.RadioItems( + id='coexpression-network', + options=COEXPRESSION_NETWORKS_VALUE_LABEL, + value='OS-CX', + inline=True, + className='ms-3 mt-1' + ), + + html.Br(), + + dbc.Label(['Select a module detection algorithm ', + html.I( + className='bi bi-info-circle', + id='coexpression-clustering-algo-tooltip', + n_clicks=0 + )]), + + module_detection_algo_modal, + + html.Br(), + + dbc.RadioItems( + id='coexpression-clustering-algo', + options=MODULE_DETECTION_ALGOS_VALUE_LABEL, + value='clusterone', + inline=True, + className='ms-3 mt-1' + ), + + html.Br(), + + dbc.Label(['Select the ', + html.Span('parameter for running the algorithm', + id='coexpression-parameter-name'), + html.I( + className='bi bi-info-circle', id='coexpression-parameter-tooltip')], + className='mb-4'), + + # Should also be changed if parameter space is changed + html.Div([dcc.Slider(id='coexpression-parameter-slider', step=None, + marks={0: '1 (Loose Modules)', 30: '2', 60: '3', + 90: '4 (Dense Modules)'}, + value=30)], + id='coexpression-parameter-slider-container'), + + html.Br(), + + dbc.Button('Run Analysis', + id='coexpression-submit', + className='page-button', + n_clicks=0), + ], className='analysis-intro p-3'), + + + html.Br(), + + html.Div( + id='coexpression-results-container', + style={'display': 'none'}, + children=[ + dcc.Loading( + id='coexpression-loading', + children=[ + html.Hr(className='mt-3 mb-3'), + + html.Br(), + + html.Div( + id='coexpression-input', + className='analysis-intro p-3' + ), + + html.Br(), + + html.Div([ + html.Div([ + html.Span(id='coexpression-module-stats') + ], className='stats'), + + html.Div( + id='coexpression-results-module-tabs-container', + children=[ + dbc.Label('Select an enriched module'), + dcc.Dropdown(id='coexpression-modules'), + ], + className='pt-3 pb-2' + ) + ], className='analysis-intro p-3') + ] + ), + + html.Div( + id='coexpression-graph-container', + children=[ + html.Div( + id='coexpression-table-container', + children=[ + html.Br(), + + html.Div([ + html.Span(id='coexpression-graph-stats') + ], className='mb-3 stats'), + + html.P( + 'Click on the tabs below to see the ontology terms and pathways in which the module is enriched.', + className='mb-4'), + + dbc.Tabs( + id='coexpression-modules-pathway', + active_tab='tab-0', + children=[ + dcc.Tab(label='Gene Ontology', + value='Gene Ontology'), + dcc.Tab(label='Trait Ontology', + value='Trait Ontology'), + dcc.Tab(label='Plant Ontology', + value='Plant Ontology'), + dcc.Tab(label='Pathways (Over-Representation)', + value='Pathways (Over-Representation)'), + dcc.Tab(label='Pathway-Express', + value='Pathway-Express'), + dcc.Tab(label='SPIA', + value='SPIA') + 
] + ), + + html.Br(), + + dcc.Loading([ + html.P( + html.Div([ + html.Div([ + html.Span( + id='coexpression-table-stats') + ], className='mb-3 stats'), + dbc.Button([html.I( + className='bi bi-download me-2'), + 'Export to CSV'], + id='coexpression-export-table', + n_clicks=0, + color='light', size='sm', className='table-button'), + dcc.Download( + id='coexpression-download-df-to-csv'), + dbc.Button([html.I( + className='bi bi-arrow-clockwise me-2'), + 'Reset Table'], + id='coexpression-reset-table', + color='light', size='sm', className='ms-3 table-button') + ], style={'textAlign': 'right'}) + ), + + dash_table.DataTable( + id='coexpression-pathways', + style_cell={ + 'whiteSpace': 'pre-line' + }, + markdown_options={'html': True}, + sort_action='native', + filter_action='native', + filter_options={'case': 'insensitive', + 'placeholder_text': '🔎︎ Search Column'}, + page_action='native', + page_size=15, + cell_selectable=False + ) + ]), + + html.Br(), + html.Br(), + + html.Div([ + html.P( + 'The graph below shows the selected module. The connections indicate that the genes are co-expressed. The shaded nodes refer to the genes implicated by your GWAS/QTL, including those that you manually added.'), + + dbc.Label( + 'Select the graph display layout'), + + dbc.RadioItems( + id='coexpression-graph-layout', + options=[ + {'value': 'circle', 'label': 'Circle', + 'label_id': 'circle'}, + {'value': 'grid', 'label': 'Grid', + 'label_id': 'grid'} + ], + value='circle', + inline=True, + className='ms-3 mb-3', + ), + + html.P( + 'Click "Reset Graph Display" to position the graph at the center and restore default zoom settings.') + ], className='analysis-intro p-3'), + + html.Br(), + + html.P( + html.Div([ + dbc.Button([html.I( + className='bi bi-download me-2'), + 'Export Edge List'], + id='coexpression-export-graph', + color='light', size='sm', + n_clicks=0, + className='table-button'), + dcc.Download( + id='coexpression-download-graph-to-json'), + dbc.Button([html.I( + className='bi bi-arrow-clockwise me-2'), + 'Reset Graph Display'], + id='coexpression-reset-graph', + n_clicks=0, + color='light', size='sm', + className='ms-3 table-button') + ], style={'textAlign': 'right'}) + ), + ] + ), + + dcc.Loading([ + cyto.Cytoscape( + id='coexpression-module-graph', + className='mb-3', + layout={'name': 'circle'}, + style={'width': '100%', + 'height': '100vh'}, # Should be here (otherwise, initial loading does not consume entire width and height) + stylesheet=[ + { + 'selector': 'node', + 'style': { + 'content': 'data(id)', + 'height': '5px', + 'width': '5px', + 'font-size': '10px' + } + }, + { + 'selector': 'edge', + 'style': { + 'width': '1px', + } + }, + { + 'selector': '.shaded', + 'style': { + 'background-color': '#254b5d', + 'line-color': '#254b5d', + 'height': '20px', + 'width': '20px' + } + } + ] + ), + ]), + + html.Div(id='coexpression-extra-bottom-div', + className='mb-4') + ] + ) + ] + ) + ], className='mt-2') diff --git a/pages/analysis/lift_over.py b/pages/analysis/lift_over.py index a381ffbf..29e58188 100644 --- a/pages/analysis/lift_over.py +++ b/pages/analysis/lift_over.py @@ -1,127 +1,127 @@ -import dash_bootstrap_components as dbc -from dash import dash_table, dcc, html -from callbacks.lift_over.util import * -from callbacks.constants import Constants -const = Constants() - - -layout = html.Div( - id={ - 'type': 'analysis-layout', - 'label': const.LIFT_OVER - }, - hidden=True, - children=[ - html.Div([ - html.P( - 'In this page, you can obtain the list of genes overlapping your input 
intervals. ' - 'Optionally, you can choose genomes to lift-over your Nipponbare coordinates to.'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - html.I( - className='bi bi-chevron-bar-right me-2 non-clickable' - ), - html.Span(id='lift-over-genomic-intervals-input'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - dbc.Label( - 'Select genome(s) for lift-over (ignore if lift-over is not needed)', - className='mb-2' - ), - - dcc.Dropdown( - construct_options_other_ref_genomes(), - id='lift-over-other-refs', - multi=True, - className='dash-bootstrap' - ), - - html.Br(), - - dbc.Button( - 'Show gene list', - id='lift-over-submit', - className='page-button', - n_clicks=0 - ), - - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div( - id='lift-over-results-container', - style={'display': 'none'}, - children=[ - html.Hr(className='mt-3 mb-4'), - html.P(id='lift-over-results-intro'), - - dcc.Loading([ - html.Ul(id='lift-over-results-statistics'), - html.Br() - ]), - - dbc.Tabs(id='lift-over-results-tabs', - active_tab='tab-0', - className='mt-3'), - html.Br(), - - dbc.Label( - id='lift-over-results-gene-intro' - ), - - dbc.Checklist( - id='lift-over-overlap-table-filter', - inline=True, - className='ms-3' - ), - - html.Br(), - - dcc.Loading([ - html.P( - html.Div([ - dbc.Button([html.I( - className='bi bi-download me-2'), - 'Export to CSV'], - id='lift-over-export-table', - n_clicks=0, - color='light', size='sm', className='table-button' - ), - dcc.Download( - id='lift-over-download-df-to-csv' - ), - dbc.Button([html.I( - className='bi bi-arrow-clockwise me-2'), - 'Reset Table'], - id='lift-over-reset-table', - color='light', size='sm', className='ms-3 table-button' - ) - ], style={'textAlign': 'right'}) - ), - - dash_table.DataTable( - id='lift-over-results-table', - style_cell={ - 'whiteSpace': 'pre-line', - 'height': 'auto' - }, - markdown_options={'html': True}, - sort_action='native', - filter_action='native', - filter_options={'case': 'insensitive', - 'placeholder_text': '🔎︎ Search Column'}, - page_action='native', - page_size=15, - cell_selectable=False - ) - ]) - ] - ) - ], className='mt-2') +import dash_bootstrap_components as dbc +from dash import dash_table, dcc, html +from callbacks.lift_over.util import * +from callbacks.constants import Constants +const = Constants() + + +layout = html.Div( + id={ + 'type': 'analysis-layout', + 'label': const.LIFT_OVER + }, + hidden=True, + children=[ + html.Div([ + html.P( + 'In this page, you can obtain the list of genes overlapping your input intervals. 
' + 'Optionally, you can choose genomes to lift-over your Nipponbare coordinates to.'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + html.I( + className='bi bi-chevron-bar-right me-2 non-clickable' + ), + html.Span(id='lift-over-genomic-intervals-input'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + dbc.Label( + 'Select genome(s) for lift-over (ignore if lift-over is not needed)', + className='mb-2' + ), + + dcc.Dropdown( + construct_options_other_ref_genomes(), + id='lift-over-other-refs', + multi=True, + className='dash-bootstrap' + ), + + html.Br(), + + dbc.Button( + 'Show gene list', + id='lift-over-submit', + className='page-button', + n_clicks=0 + ), + + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div( + id='lift-over-results-container', + style={'display': 'none'}, + children=[ + html.Hr(className='mt-3 mb-4'), + html.P(id='lift-over-results-intro'), + + dcc.Loading([ + html.Ul(id='lift-over-results-statistics'), + html.Br() + ]), + + dbc.Tabs(id='lift-over-results-tabs', + active_tab='tab-0', + className='mt-3'), + html.Br(), + + dbc.Label( + id='lift-over-results-gene-intro' + ), + + dbc.Checklist( + id='lift-over-overlap-table-filter', + inline=True, + className='ms-3' + ), + + html.Br(), + + dcc.Loading([ + html.P( + html.Div([ + dbc.Button([html.I( + className='bi bi-download me-2'), + 'Export to CSV'], + id='lift-over-export-table', + n_clicks=0, + color='light', size='sm', className='table-button' + ), + dcc.Download( + id='lift-over-download-df-to-csv' + ), + dbc.Button([html.I( + className='bi bi-arrow-clockwise me-2'), + 'Reset Table'], + id='lift-over-reset-table', + color='light', size='sm', className='ms-3 table-button' + ) + ], style={'textAlign': 'right'}) + ), + + dash_table.DataTable( + id='lift-over-results-table', + style_cell={ + 'whiteSpace': 'pre-line', + 'height': 'auto' + }, + markdown_options={'html': True}, + sort_action='native', + filter_action='native', + filter_options={'case': 'insensitive', + 'placeholder_text': '🔎︎ Search Column'}, + page_action='native', + page_size=15, + cell_selectable=False + ) + ]) + ] + ) + ], className='mt-2') diff --git a/pages/analysis/text_mining.py b/pages/analysis/text_mining.py index 3bbd801e..16c71faa 100644 --- a/pages/analysis/text_mining.py +++ b/pages/analysis/text_mining.py @@ -1,115 +1,115 @@ -import dash_bootstrap_components as dbc -from dash import dash_table, dcc, html -from callbacks.constants import Constants -const = Constants() - -layout = html.Div( - id={ - 'type': 'analysis-layout', - 'label': const.TEXT_MINING - }, - hidden=True, - children=[ - - html.Div([ - html.P('In this page, you can retrieve gene names associated with traits, diseases, chemicals, etc. from a database constructed from text-mined PubMed abstracts. ' - # 'Conversely, you can retrieve literature that associates your gene of interest to some phenotype.' 
- ), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), - html.Span(id='text-mining-genomic-intervals-input'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - dbc.Label('Enter your query trait/phenotype', className='mb-2'), - - dbc.Alert( - id='text-mining-input-error', - color='danger', - style={'display': 'none'} - ), - dbc.Input( - id='text-mining-query', - type='text', - value='', - debounce=True, - n_submit=0 - ), - - html.Div([html.Span('Examples:', className='pe-3'), - html.Span('pre-harvest sprouting', - id={'type': 'example-text-mining', - 'description': 'pre-harvest sprouting'}, - className='sample-genomic-interval', - n_clicks=0), - html.Span(',', className='sample-genomic-interval'), - html.Span('anaerobic germination', - id={'type': 'example-text-mining', - 'description': 'anaerobic germination'}, - className='sample-genomic-interval ms-3', - n_clicks=0)], - className='pt-3'), - html.Br(), - - dbc.Button('Search', - id='text-mining-submit', - className='page-button', - n_clicks=0), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div( - id='text-mining-results-container', - style={'display': 'none'}, - children=[ - html.Hr(className='mt-3 mb-4'), - dcc.Loading([ - html.Div([ - html.Span(id='text-mining-results-stats') - ], className='mb-3 stats'), - - html.P( - html.Div([ - dbc.Button([html.I( - className='bi bi-download me-2'), - 'Export to CSV'], - id='text-mining-export-table', - n_clicks=0, - color='light', size='sm', className='table-button'), - dcc.Download( - id='text-mining-download-df-to-csv'), - dbc.Button([html.I( - className='bi bi-arrow-clockwise me-2'), - 'Reset Table'], - id='text-mining-reset-table', - color='light', size='sm', className='ms-3 table-button') - ], style={'textAlign': 'right'}) - ), - - dash_table.DataTable( - id='text-mining-result-table', - style_data={ - 'whiteSpace': 'normal', - 'height': 'auto', - 'textAlign': 'left' - }, - markdown_options={'html': True}, - sort_action='native', - filter_action='native', - filter_options={'case': 'insensitive', - 'placeholder_text': '🔎︎ Search Column'}, - page_action='native', - page_size=10, - cell_selectable=False - ) - ]) - ], className='mt-2') - - ], className='mt-2') +import dash_bootstrap_components as dbc +from dash import dash_table, dcc, html +from callbacks.constants import Constants +const = Constants() + +layout = html.Div( + id={ + 'type': 'analysis-layout', + 'label': const.TEXT_MINING + }, + hidden=True, + children=[ + + html.Div([ + html.P('In this page, you can retrieve gene names associated with traits, diseases, chemicals, etc. from a database constructed from text-mined PubMed abstracts. ' + # 'Conversely, you can retrieve literature that associates your gene of interest to some phenotype.' 
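+                   # (reverse search, i.e., retrieving literature for a given gene, is not yet supported)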
+ ), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), + html.Span(id='text-mining-genomic-intervals-input'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + dbc.Label('Enter your query trait/phenotype', className='mb-2'), + + dbc.Alert( + id='text-mining-input-error', + color='danger', + style={'display': 'none'} + ), + dbc.Input( + id='text-mining-query', + type='text', + value='', + debounce=True, + n_submit=0 + ), + + html.Div([html.Span('Examples:', className='pe-3'), + html.Span('pre-harvest sprouting', + id={'type': 'example-text-mining', + 'description': 'pre-harvest sprouting'}, + className='sample-genomic-interval', + n_clicks=0), + html.Span(',', className='sample-genomic-interval'), + html.Span('anaerobic germination', + id={'type': 'example-text-mining', + 'description': 'anaerobic germination'}, + className='sample-genomic-interval ms-3', + n_clicks=0)], + className='pt-3'), + html.Br(), + + dbc.Button('Search', + id='text-mining-submit', + className='page-button', + n_clicks=0), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div( + id='text-mining-results-container', + style={'display': 'none'}, + children=[ + html.Hr(className='mt-3 mb-4'), + dcc.Loading(id='text-mining-loading', children=[ + html.Div([ + html.Span(id='text-mining-results-stats') + ], className='mb-3 stats'), + + html.P( + html.Div([ + dbc.Button([html.I( + className='bi bi-download me-2'), + 'Export to CSV'], + id='text-mining-export-table', + n_clicks=0, + color='light', size='sm', className='table-button'), + dcc.Download( + id='text-mining-download-df-to-csv'), + dbc.Button([html.I( + className='bi bi-arrow-clockwise me-2'), + 'Reset Table'], + id='text-mining-reset-table', + color='light', size='sm', className='ms-3 table-button') + ], style={'textAlign': 'right'}) + ), + + dash_table.DataTable( + id='text-mining-result-table', + style_data={ + 'whiteSpace': 'normal', + 'height': 'auto', + 'textAlign': 'left' + }, + markdown_options={'html': True}, + sort_action='native', + filter_action='native', + filter_options={'case': 'insensitive', + 'placeholder_text': '🔎︎ Search Column'}, + page_action='native', + page_size=10, + cell_selectable=False + ) + ]) + ], className='mt-2') + + ], className='mt-2') diff --git a/pages/analysis/tf_enrich.py b/pages/analysis/tf_enrich.py index f772c46f..f1e5ed6f 100644 --- a/pages/analysis/tf_enrich.py +++ b/pages/analysis/tf_enrich.py @@ -1,134 +1,134 @@ -import dash_bootstrap_components as dbc -from dash import dash_table, dcc, html -from callbacks.constants import Constants -const = Constants() - -layout = html.Div( - id={ - 'type': 'analysis-layout', - 'label': const.TFBS - }, - hidden=True, - children=[ - html.Div([ - html.P( - 'Perhaps your intervals contain variants that influence regulatory elements, for example by affecting binding affinity.'), - html.P( - 'In this page, you can search for transcription factors whose binding sites overlap significantly with your intervals.') - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), - html.Span(id='tf-enrichment-genomic-intervals-input'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div([ - dbc.Label( - 'Include additional genes from the pan-genome lift-over or the text mining results'), - html.Br(), - dbc.Label( - 'Enter their MSU accession IDs, separated by a semicolon (e.g., 
LOC_Os01g03680;LOC_Os01g03690;LOC_Os01g04110)', - className='small text-muted'), - - dbc.Textarea(id='tfbs-addl-genes'), - - html.Br(), - - dbc.Label(['Choose TF binding site prediction technique', - html.I( - className='bi bi-info-circle', - id='tf-enrichment-technique-tooltip', - n_clicks=0 - )]), - dbc.RadioItems( - id='tfbs-prediction-technique', - options=[ - {'value': 'FunTFBS', 'label': 'FunTFBS', 'label_id': 'FunTFBS'}, - {'value': 'CE', 'label': 'motif conservation', - 'label_id': 'motif conservation'}, - {'value': 'motif', 'label': 'motif scan', - 'label_id': 'motif scan'} - ], - value='FunTFBS', - inline=True - ), - - html.Br(), - dbc.Label(['Consider TF binding sites in the following regions', - html.I( - className='bi bi-info-circle', - id='tf-enrichment-binding-site-tooltip', - n_clicks=0 - )]), - dbc.RadioItems( - id='tfbs-set', - options=[ - {'value': 'promoters', 'label': 'promoters', - 'label_id': 'promoters'}, - {'value': 'genome', 'label': 'genome', - 'label_id': 'genome'} - ], - value='promoters', - inline=True - ), - - html.Br(), - - dbc.Label("Input threshold for False-Discovery Rate:"), - dbc.Input(id='tfbs-fdr', type='number', - value=0.25, min=0, max=1, step=0.05), - html.Br(), - - dbc.Button('Run Analysis', - id='tfbs-submit', - n_clicks=0, - className='page-button'), - ], className='analysis-intro p-3'), - - html.Br(), - - html.Div( - id='tfbs-results-container', - style={'display': 'none'}, - children=[ - html.Hr(className='mt-3 mb-4'), - dcc.Loading([ - html.P( - html.Div([ - dbc.Button([html.I( - className='bi bi-download me-2'), - 'Export to CSV'], - id='tfbs-export-table', - n_clicks=0, - color='light', size='sm', className='table-button'), - dcc.Download(id='tfbs-download-df-to-csv'), - dbc.Button([html.I( - className='bi bi-arrow-clockwise me-2'), - 'Reset Table'], - id='tfbs-reset-table', - color='light', size='sm', className='ms-3 table-button') - ], style={'textAlign': 'right'}) - ), - - dash_table.DataTable( - id='tf-enrichment-result-table', - style_cell={ - 'whiteSpace': 'pre-line' - }, - markdown_options={'html': True}, - sort_action='native', - filter_action='native', - filter_options={'case': 'insensitive', - 'placeholder_text': '🔎︎ Search Column'}, - page_action='native', - page_size=15, - cell_selectable=False - ) - ]) - ]) - ], className='mt-2' -) +import dash_bootstrap_components as dbc +from dash import dash_table, dcc, html +from callbacks.constants import Constants +const = Constants() + +layout = html.Div( + id={ + 'type': 'analysis-layout', + 'label': const.TFBS + }, + hidden=True, + children=[ + html.Div([ + html.P( + 'Perhaps your intervals contain variants that influence regulatory elements, for example by affecting binding affinity.'), + html.P( + 'In this page, you can search for transcription factors whose binding sites overlap significantly with your intervals.') + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + html.I(className='bi bi-chevron-bar-right me-2 non-clickable'), + html.Span(id='tf-enrichment-genomic-intervals-input'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div([ + dbc.Label( + 'Include additional genes from the pan-genome lift-over or the text mining results'), + html.Br(), + dbc.Label( + 'Enter their MSU accession IDs, separated by a semicolon (e.g., LOC_Os01g03680;LOC_Os01g03690;LOC_Os01g04110)', + className='small text-muted'), + + dbc.Textarea(id='tfbs-addl-genes'), + + html.Br(), + + dbc.Label(['Choose TF binding site prediction technique', + html.I( + className='bi 
bi-info-circle', + id='tf-enrichment-technique-tooltip', + n_clicks=0 + )]), + dbc.RadioItems( + id='tfbs-prediction-technique', + options=[ + {'value': 'FunTFBS', 'label': 'FunTFBS', 'label_id': 'FunTFBS'}, + {'value': 'CE', 'label': 'motif conservation', + 'label_id': 'motif conservation'}, + {'value': 'motif', 'label': 'motif scan', + 'label_id': 'motif scan'} + ], + value='FunTFBS', + inline=True + ), + + html.Br(), + dbc.Label(['Consider TF binding sites in the following regions', + html.I( + className='bi bi-info-circle', + id='tf-enrichment-binding-site-tooltip', + n_clicks=0 + )]), + dbc.RadioItems( + id='tfbs-set', + options=[ + {'value': 'promoters', 'label': 'promoters', + 'label_id': 'promoters'}, + {'value': 'genome', 'label': 'genome', + 'label_id': 'genome'} + ], + value='promoters', + inline=True + ), + + html.Br(), + + dbc.Label("Input threshold for False-Discovery Rate:"), + dbc.Input(id='tfbs-fdr', type='number', + value=0.25, min=0, max=1, step=0.05), + html.Br(), + + dbc.Button('Run Analysis', + id='tfbs-submit', + n_clicks=0, + className='page-button'), + ], className='analysis-intro p-3'), + + html.Br(), + + html.Div( + id='tfbs-results-container', + style={'display': 'none'}, + children=[ + html.Hr(className='mt-3 mb-4'), + dcc.Loading([ + html.P( + html.Div([ + dbc.Button([html.I( + className='bi bi-download me-2'), + 'Export to CSV'], + id='tfbs-export-table', + n_clicks=0, + color='light', size='sm', className='table-button'), + dcc.Download(id='tfbs-download-df-to-csv'), + dbc.Button([html.I( + className='bi bi-arrow-clockwise me-2'), + 'Reset Table'], + id='tfbs-reset-table', + color='light', size='sm', className='ms-3 table-button') + ], style={'textAlign': 'right'}) + ), + + dash_table.DataTable( + id='tf-enrichment-result-table', + style_cell={ + 'whiteSpace': 'pre-line' + }, + markdown_options={'html': True}, + sort_action='native', + filter_action='native', + filter_options={'case': 'insensitive', + 'placeholder_text': '🔎︎ Search Column'}, + page_action='native', + page_size=15, + cell_selectable=False + ) + ]) + ]) + ], className='mt-2' +) diff --git a/pages/analysis_layout.py b/pages/analysis_layout.py index 0d80cba5..f50b5204 100644 --- a/pages/analysis_layout.py +++ b/pages/analysis_layout.py @@ -1,31 +1,31 @@ -from dash import html - -import pages.analysis.lift_over as lift_over -import pages.analysis.co_expr as co_expr -import pages.analysis.tf_enrich as tf_enrich -import pages.analysis.browse_loci as browse_loci -import pages.analysis.text_mining as text_mining - -from collections import OrderedDict - -from callbacks.constants import Constants -const = Constants() - - -def get_analaysis_layout_dictionary(): - return OrderedDict({ - const.LIFT_OVER: 'Gene List and Lift-Over', - const.TEXT_MINING: 'Gene Retrieval by Text Mining', - const.COEXPRESSION: 'Co-Expression Network Analysis', - const.TFBS: 'Regulatory Feature Enrichment', - const.IGV: 'Browse Loci' - }) - - -layout = html.Div(children=[ - lift_over.layout, - text_mining.layout, - co_expr.layout, - tf_enrich.layout, - browse_loci.layout -]) +from dash import html + +import pages.analysis.lift_over as lift_over +import pages.analysis.co_expr as co_expr +import pages.analysis.tf_enrich as tf_enrich +import pages.analysis.browse_loci as browse_loci +import pages.analysis.text_mining as text_mining + +from collections import OrderedDict + +from callbacks.constants import Constants +const = Constants() + + +def get_analaysis_layout_dictionary(): + return OrderedDict({ + const.LIFT_OVER: 'Gene List 
and Lift-Over', + const.TEXT_MINING: 'Gene Retrieval by Text Mining', + const.COEXPRESSION: 'Co-Expression Network Analysis', + const.TFBS: 'Regulatory Feature Enrichment', + const.IGV: 'Browse Loci' + }) + + +layout = html.Div(children=[ + lift_over.layout, + text_mining.layout, + co_expr.layout, + tf_enrich.layout, + browse_loci.layout +]) diff --git a/pages/homepage.py b/pages/homepage.py index d70ef634..ad89026a 100644 --- a/pages/homepage.py +++ b/pages/homepage.py @@ -1,174 +1,174 @@ -import dash -import dash_bootstrap_components as dbc -from dash import html - -import pages.navigation.analysis_nav as analysis_nav -import pages.analysis_layout as analysis_layout - -dash.register_page(__name__, path='/', name='RicePilaf', location='app-topbar') - - -# ====== -# Modal -# ====== - -genomic_interval_modal = dbc.Modal([ - dbc.ModalHeader( - dbc.ModalTitle('Genomic Intervals from GWAS/QTL') - ), - dbc.ModalBody([ - html.Span('Enter genomic intervals like so: '), html.Span( - 'Chr01:100000-200000', className='text-muted'), - html.Br(), - html.Span( - 'Multiple intervals should be separated by a semicolon like so: '), - html.Span('Chr01:100000-200000;Chr02:300000-400000', - className='text-muted'), - html.Br(), - html.Span( - 'These intervals are obtained from LD-based clumping of significant GWAS SNPs or from QTL mapping studies.'), - html.Br(), - html.Br(), - - html.P( - 'We also provide some sample genomic intervals, taken from the following GWAS/QTL analyses:'), - html.Ul([ - html.Li([ - html.Div([ - html.Span( - 'Lee, J. S., Chebotarov, D., McNally, K. L., Pede, V., Setiyono, T. D., Raquid, R., Hyun, W. J., Leung, J. U., Kohli, A., & Mo, Y. (2021). Novel sources of pre-harvest sprouting resistance for Japanoica rice improvement. '), - html.I( - 'Plants, 10'), - html.Span( - '(8), 1709. '), - html.A('https://doi.org/10.3390/plants10081709', - href='https://doi.org/10.3390/plants10081709', - target='_blank')], - ) - ]) - ]), - html.Ul([ - html.Li([ - html.Div([ - html.Span( - 'Tnani, H., Chebotarov, D., Thapa, R., Ignacio, J. C. I., Israel, W. K., Quilloy, F. A., Dixit, S., & Septiningsih, E. M., & Kretzschmar, T. (2021). Enriched-GWAS and transcriptome analysis to refine and characterize a major QTL for anaerobic germination tolerance in rice. '), - html.I( - 'International Journal of Molecular Sciences, 22'), - html.Span( - '(9), 4445. 
'), - html.A('https://doi.org/10.3390/ijms22094445', - href='https://doi.org/10.3390/ijms22094445', - target='_blank')], - ) - ]) - ]) - ])], - id='genomic-interval-modal', - is_open=False, - size='xl' -) - -# ====== -# Input -# ====== - -submit_clear_buttons = dbc.Row([ - dbc.Col(dbc.Button('Proceed to Analyses Menu', - id='homepage-submit', - n_clicks=0, - className='home-button'), - xs=4, sm=4, md=2, lg=2, xl=2, xxl=2), - dbc.Col(dbc.Button('Reset All Analyses', - color='danger', - outline=True, - id='homepage-reset', - n_clicks=0, - className='home-button'), - xs=4, sm=4, md=2, lg=2, xl=2, xxl=2, - id='reset-analyses-container'), - dbc.Col(dbc.Button('Clear Cache', - id='homepage-clear-cache', - color='danger', - outline=True, - n_clicks=0, - className='home-button'), - xs=4, sm=4, md=2, lg=2, xl=2, xxl=2), -], className='pt-2') - -genome_ref_input = dbc.Col([ - html.Div([ - html.H5('Enter your GWAS/QTL intervals', - id='genomic-interval-hdr'), - html.I(className='bi bi-info-circle', - id='genomic-interval-tooltip', - n_clicks=0) - ], id='genomic-interval-container'), - - genomic_interval_modal, - - dbc.Alert( - id='input-error', - color='danger', - style={'display': 'none'} - ), - dbc.Input( - id='homepage-genomic-intervals', - type='text', - value='', - debounce=True, - n_submit=0 - ), - - html.Div([ - html.Span('Or select from these examples:', className='pe-3'), - html.Span('Pre-Harvest Sprouting (Lee et al., 2021)', - id={'type': 'example-genomic-interval', - 'description': 'pre-harvest'}, - className='sample-genomic-interval', - n_clicks=0), - html.Span(',', className='sample-genomic-interval'), - html.Span('Anaerobic Germination (Tnani et al., 2021)', - id={'type': 'example-genomic-interval', - 'description': 'anaerobic-germination'}, - className='sample-genomic-interval ms-3', - n_clicks=0)], - className='pt-3'), - html.Br(), - - submit_clear_buttons -]) - -# ============ -# Main Layout -# ============ - -layout = html.Div([ - dbc.Row( - genome_ref_input, - className='px-5 pt-4 pb-5', - id='genome-ref-input-container' - ), - - html.Br(), - - html.Div( - id='homepage-results-container', - style={'display': 'none'}, - children=[ - html.Div( - id='post-gwas-analysis-container', - children=[dbc.Row([ - dbc.Col( - [html.H5('Select an analysis', id='post-gwas-hdr'), - analysis_nav.navbar()], - xs=4, sm=4, md=2, lg=2, xl=2, xxl=2), - dbc.Col( - children=analysis_layout.layout, - xs=7, sm=7, md=9, lg=9, xl=9, xxl=9, - id='page', - ) - ], className='ps-5 py-2')] - ) - ] - ) -]) +import dash +import dash_bootstrap_components as dbc +from dash import html + +import pages.navigation.analysis_nav as analysis_nav +import pages.analysis_layout as analysis_layout + +dash.register_page(__name__, path='/', name='RicePilaf', location='app-topbar') + + +# ====== +# Modal +# ====== + +genomic_interval_modal = dbc.Modal([ + dbc.ModalHeader( + dbc.ModalTitle('Genomic Intervals from GWAS/QTL') + ), + dbc.ModalBody([ + html.Span('Enter genomic intervals like so: '), html.Span( + 'Chr01:100000-200000', className='text-muted'), + html.Br(), + html.Span( + 'Multiple intervals should be separated by a semicolon like so: '), + html.Span('Chr01:100000-200000;Chr02:300000-400000', + className='text-muted'), + html.Br(), + html.Span( + 'These intervals are obtained from LD-based clumping of significant GWAS SNPs or from QTL mapping studies.'), + html.Br(), + html.Br(), + + html.P( + 'We also provide some sample genomic intervals, taken from the following GWAS/QTL analyses:'), + html.Ul([ + html.Li([ + 
html.Div([
+                    html.Span(
+                        'Lee, J. S., Chebotarov, D., McNally, K. L., Pede, V., Setiyono, T. D., Raquid, R., Hyun, W. J., Leung, J. U., Kohli, A., & Mo, Y. (2021). Novel sources of pre-harvest sprouting resistance for Japonica rice improvement. '),
+                    html.I(
+                        'Plants, 10'),
+                    html.Span(
+                        '(8), 1709. '),
+                    html.A('https://doi.org/10.3390/plants10081709',
+                           href='https://doi.org/10.3390/plants10081709',
+                           target='_blank')],
+                )
+            ])
+        ]),
+        html.Ul([
+            html.Li([
+                html.Div([
+                    html.Span(
+                        'Tnani, H., Chebotarov, D., Thapa, R., Ignacio, J. C. I., Israel, W. K., Quilloy, F. A., Dixit, S., Septiningsih, E. M., & Kretzschmar, T. (2021). Enriched-GWAS and transcriptome analysis to refine and characterize a major QTL for anaerobic germination tolerance in rice. '),
+                    html.I(
+                        'International Journal of Molecular Sciences, 22'),
+                    html.Span(
+                        '(9), 4445. '),
+                    html.A('https://doi.org/10.3390/ijms22094445',
+                           href='https://doi.org/10.3390/ijms22094445',
+                           target='_blank')],
+                )
+            ])
+        ])
+    ])],
+    id='genomic-interval-modal',
+    is_open=False,
+    size='xl'
+)
+
+# ======
+# Input
+# ======
+
+submit_clear_buttons = dbc.Row([
+    dbc.Col(dbc.Button('Proceed to Analyses Menu',
+                       id='homepage-submit',
+                       n_clicks=0,
+                       className='home-button'),
+            xs=4, sm=4, md=2, lg=2, xl=2, xxl=2),
+    dbc.Col(dbc.Button('Reset All Analyses',
+                       color='danger',
+                       outline=True,
+                       id='homepage-reset',
+                       n_clicks=0,
+                       className='home-button'),
+            xs=4, sm=4, md=2, lg=2, xl=2, xxl=2,
+            id='reset-analyses-container'),
+    dbc.Col(dbc.Button('Clear Cache',
+                       id='homepage-clear-cache',
+                       color='danger',
+                       outline=True,
+                       n_clicks=0,
+                       className='home-button'),
+            xs=4, sm=4, md=2, lg=2, xl=2, xxl=2),
+], className='pt-2')
+
+genome_ref_input = dbc.Col([
+    html.Div([
+        html.H5('Enter your GWAS/QTL intervals',
+                id='genomic-interval-hdr'),
+        html.I(className='bi bi-info-circle',
+               id='genomic-interval-tooltip',
+               n_clicks=0)
+    ], id='genomic-interval-container'),
+
+    genomic_interval_modal,
+
+    dbc.Alert(
+        id='input-error',
+        color='danger',
+        style={'display': 'none'}
+    ),
+    dbc.Input(
+        id='homepage-genomic-intervals',
+        type='text',
+        value='',
+        debounce=True,
+        n_submit=0
+    ),
+
+    html.Div([
+        html.Span('Or select from these examples:', className='pe-3'),
+        html.Span('Pre-Harvest Sprouting (Lee et al., 2021)',
+                  id={'type': 'example-genomic-interval',
+                      'description': 'pre-harvest'},
+                  className='sample-genomic-interval',
+                  n_clicks=0),
+        html.Span(',', className='sample-genomic-interval'),
+        html.Span('Anaerobic Germination (Tnani et al., 2021)',
+                  id={'type': 'example-genomic-interval',
+                      'description': 'anaerobic-germination'},
+                  className='sample-genomic-interval ms-3',
+                  n_clicks=0)],
+        className='pt-3'),
+    html.Br(),
+
+    submit_clear_buttons
+])
+
+# ============
+# Main Layout
+# ============
+
+layout = html.Div([
+    dbc.Row(
+        genome_ref_input,
+        className='px-5 pt-4 pb-5',
+        id='genome-ref-input-container'
+    ),
+
+    html.Br(),
+
+    html.Div(
+        id='homepage-results-container',
+        style={'display': 'none'},
+        children=[
+            html.Div(
+                id='post-gwas-analysis-container',
+                children=[dbc.Row([
+                    dbc.Col(
+                        [html.H5('Select an analysis', id='post-gwas-hdr'),
+                         analysis_nav.navbar()],
+                        xs=4, sm=4, md=2, lg=2, xl=2, xxl=2),
+                    dbc.Col(
+                        children=analysis_layout.layout,
+                        xs=7, sm=7, md=9, lg=9, xl=9, xxl=9,
+                        id='page',
+                    )
+                ], className='ps-5 py-2')]
+            )
+        ]
+    )
+])
diff --git a/pages/navigation/analysis_nav.py b/pages/navigation/analysis_nav.py
index d2f53b97..6bec589a 100644
--- a/pages/navigation/analysis_nav.py
+++ 
b/pages/navigation/analysis_nav.py @@ -1,28 +1,28 @@ -import dash_bootstrap_components as dbc -import pages.analysis_layout as analysis_layout - - -def navbar(): - analysis_layout_dict = analysis_layout.get_analaysis_layout_dictionary() - - nav_list = [dbc.NavItem( - dbc.NavLink( - analysis_layout_dict[key], - className='ps-4', - id={ - 'type': 'analysis-nav', - 'label': key - }, - n_clicks=0 - ) - ) for key in analysis_layout_dict.keys()] - - return dbc.Nav( - [ - item for item in nav_list - ], - vertical=True, - pills=True, - className='bg-light', - id='homepage-dash-nav' - ) +import dash_bootstrap_components as dbc +import pages.analysis_layout as analysis_layout + + +def navbar(): + analysis_layout_dict = analysis_layout.get_analaysis_layout_dictionary() + + nav_list = [dbc.NavItem( + dbc.NavLink( + analysis_layout_dict[key], + className='ps-4', + id={ + 'type': 'analysis-nav', + 'label': key + }, + n_clicks=0 + ) + ) for key in analysis_layout_dict.keys()] + + return dbc.Nav( + [ + item for item in nav_list + ], + vertical=True, + pills=True, + className='bg-light', + id='homepage-dash-nav' + ) diff --git a/pages/navigation/main_nav.py b/pages/navigation/main_nav.py index 8c973b16..5ef5348b 100644 --- a/pages/navigation/main_nav.py +++ b/pages/navigation/main_nav.py @@ -1,50 +1,50 @@ -from dash import html -import dash_bootstrap_components as dbc - - -def navbar(): - return dbc.NavbarSimple( - children=[ - # dbc.NavItem(dbc.NavLink('Home', active='exact', - # href='/', className='top-navbar-item')), - dbc.NavItem(dbc.NavLink('User Guide', href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.-User-Guide', - target='_blank', className='top-navbar-item')), - ], - id='top-navbar', - brand=[dbc.Row( - [ - dbc.Col(html.Img(src='assets/logo.png', height='20px', - className='mx-auto'), className='d-flex align-items-center'), - dbc.Col(dbc.NavbarBrand('RicePilaf', className='ms-3')), - ], - align="center", - className="g-0", - )], - brand_href='/', - color='#4d987d', - dark=True, - fluid=True, - className='px-5', - ) - - """ - return dbc.NavbarSimple( - children=[ - dbc.NavItem( - dbc.NavLink([ - page["name"] - ], - href=page["path"], - active="exact", - className='top-navbar-item' - )) - for page in dash.page_registry.values() - if page["location"] == 'app-topbar' - ], - id='top-navbar', - brand=['RicePilaf'], - brand_href='/', - color='#4d987d', - dark=True - ) - """ +from dash import html +import dash_bootstrap_components as dbc + + +def navbar(): + return dbc.NavbarSimple( + children=[ + # dbc.NavItem(dbc.NavLink('Home', active='exact', + # href='/', className='top-navbar-item')), + dbc.NavItem(dbc.NavLink('User Guide', href='https://github.com/bioinfodlsu/rice-pilaf/wiki/2.-User-Guide', + target='_blank', className='top-navbar-item')), + ], + id='top-navbar', + brand=[dbc.Row( + [ + dbc.Col(html.Img(src='assets/logo.png', height='20px', + className='mx-auto'), className='d-flex align-items-center'), + dbc.Col(dbc.NavbarBrand('RicePilaf', className='ms-3')), + ], + align="center", + className="g-0", + )], + brand_href='/', + color='#4d987d', + dark=True, + fluid=True, + className='px-5', + ) + + """ + return dbc.NavbarSimple( + children=[ + dbc.NavItem( + dbc.NavLink([ + page["name"] + ], + href=page["path"], + active="exact", + className='top-navbar-item' + )) + for page in dash.page_registry.values() + if page["location"] == 'app-topbar' + ], + id='top-navbar', + brand=['RicePilaf'], + brand_href='/', + color='#4d987d', + dark=True + ) + """ diff --git a/prepare_data/workflow/Snakefile 
b/prepare_data/workflow/Snakefile index a80b00a8..2cbf7e61 100644 --- a/prepare_data/workflow/Snakefile +++ b/prepare_data/workflow/Snakefile @@ -1,20 +1,20 @@ -include: "rules/last_whole_genome_alignment.smk" -include: "rules/prepare_annotation.smk" -include: "rules/transcription_factor_binding_sites.smk" - -#rule download_ref_genomes: - #wget and then make symbolic links - - -# rule all_download_annotation: -# input: -# expand("{0}/annotations/{{other_ref}}/{{other_ref}}.gff.db".format(config["processed_data_dir"]),other_ref = config["other_refs"]) -# -# rule whole_genome_alignment: -# #input: "{0}/last_index/index.done".format(config["data_dir"]) -# input: -# expand("{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff.db".format(config["processed_data_dir"]), other_ref = config["other_refs"]) - -rule tfbs: - input: - "{0}/promoter_seq.fasta".format(config["tfbs_dir"]) +include: "rules/last_whole_genome_alignment.smk" +include: "rules/prepare_annotation.smk" +include: "rules/transcription_factor_binding_sites.smk" + +#rule download_ref_genomes: + #wget and then make symbolic links + + +# rule all_download_annotation: +# input: +# expand("{0}/annotations/{{other_ref}}/{{other_ref}}.gff.db".format(config["processed_data_dir"]),other_ref = config["other_refs"]) +# +# rule whole_genome_alignment: +# #input: "{0}/last_index/index.done".format(config["data_dir"]) +# input: +# expand("{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff.db".format(config["processed_data_dir"]), other_ref = config["other_refs"]) + +rule tfbs: + input: + "{0}/promoter_seq.fasta".format(config["tfbs_dir"]) diff --git a/prepare_data/workflow/configfile.yaml b/prepare_data/workflow/configfile.yaml index 7c0e9c08..40d68684 100644 --- a/prepare_data/workflow/configfile.yaml +++ b/prepare_data/workflow/configfile.yaml @@ -1,37 +1,37 @@ -#Input and output path -input_data_dir: "../../static" -processed_data_dir: "../../Azu_IR64_ARC_CHAO" -tfbs_dir: "../../static/tfbs" -#Data -#compute whole-genome alignment vs. 
Nb, for the following genomes -other_refs: - ["Azu","IR64","ARC","CHAO"] -threads: 1 - -#Data URLs -assembly_links: - Nb: "https://riceome.hzau.edu.cn/download/Npb.fasta" - ARC: "https://riceome.hzau.edu.cn/download/117425.fasta" - Azu: "https://riceome.hzau.edu.cn/download/Azucena.fasta" - CHAO: "https://riceome.hzau.edu.cn/download/132278.fasta" - IR64: "https://riceome.hzau.edu.cn/download/IR64.fasta" - LIU: "https://riceome.hzau.edu.cn/download/125827.fasta" - MH63: "https://riceome.hzau.edu.cn/download/MH63RS3.fasta" - N22: "https://riceome.hzau.edu.cn/download/117534.fasta" -annotation_links: - Nb: "https://riceome.hzau.edu.cn/download/IRGSPMSU.gff" - ARC: "https://riceome.hzau.edu.cn/download/117425.gff" - Azu: "https://riceome.hzau.edu.cn/download/Azucena.gff" - CHAO: "https://riceome.hzau.edu.cn/download/132278.gff" - IR64: "https://riceome.hzau.edu.cn/download/IR64.gff" - LIU: "https://riceome.hzau.edu.cn/download/125827.gff" - MH63: "https://riceome.hzau.edu.cn/download/MH63RS3.gff" - N22: "https://riceome.hzau.edu.cn/download/117534.gff" -tfbs_links: - tf_list: "http://planttfdb.gao-lab.org/download/TF_list/Osj_TF_list.txt.gz" - promoter_motif: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_inProm_Osj.gff" - promoter_motif_CE: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_CE_inProm_Osj.gff" - promoter_FunTBFS: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_FunTFBS_inProm_Osj.gff" - genome_motif: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_genome-wide_Osj.gff" - genome_motif_CE: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_CE_genome-wide_Osj.gff" - genome_FunTBFS: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_FunTFBS_genome-wide_Osj.gff" +#Input and output path +input_data_dir: "../../static" +processed_data_dir: "../../Azu_IR64_ARC_CHAO" +tfbs_dir: "../../static/tfbs" +#Data +#compute whole-genome alignment vs. 
Nb, for the following genomes +other_refs: + ["Azu","IR64","ARC","CHAO"] +threads: 1 + +#Data URLs +assembly_links: + Nb: "https://riceome.hzau.edu.cn/download/Npb.fasta" + ARC: "https://riceome.hzau.edu.cn/download/117425.fasta" + Azu: "https://riceome.hzau.edu.cn/download/Azucena.fasta" + CHAO: "https://riceome.hzau.edu.cn/download/132278.fasta" + IR64: "https://riceome.hzau.edu.cn/download/IR64.fasta" + LIU: "https://riceome.hzau.edu.cn/download/125827.fasta" + MH63: "https://riceome.hzau.edu.cn/download/MH63RS3.fasta" + N22: "https://riceome.hzau.edu.cn/download/117534.fasta" +annotation_links: + Nb: "https://riceome.hzau.edu.cn/download/IRGSPMSU.gff" + ARC: "https://riceome.hzau.edu.cn/download/117425.gff" + Azu: "https://riceome.hzau.edu.cn/download/Azucena.gff" + CHAO: "https://riceome.hzau.edu.cn/download/132278.gff" + IR64: "https://riceome.hzau.edu.cn/download/IR64.gff" + LIU: "https://riceome.hzau.edu.cn/download/125827.gff" + MH63: "https://riceome.hzau.edu.cn/download/MH63RS3.gff" + N22: "https://riceome.hzau.edu.cn/download/117534.gff" +tfbs_links: + tf_list: "http://planttfdb.gao-lab.org/download/TF_list/Osj_TF_list.txt.gz" + promoter_motif: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_inProm_Osj.gff" + promoter_motif_CE: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_CE_inProm_Osj.gff" + promoter_FunTBFS: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_FunTFBS_inProm_Osj.gff" + genome_motif: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_genome-wide_Osj.gff" + genome_motif_CE: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_motif_CE_genome-wide_Osj.gff" + genome_FunTBFS: "http://plantregmap.gao-lab.org/download_ftp.php?filepath=08-download/Oryza_sativa_Japonica_Group/binding/TFBS_from_FunTFBS_genome-wide_Osj.gff" diff --git a/prepare_data/workflow/rules/last_whole_genome_alignment.smk b/prepare_data/workflow/rules/last_whole_genome_alignment.smk index 135a3682..a9a6b5b9 100644 --- a/prepare_data/workflow/rules/last_whole_genome_alignment.smk +++ b/prepare_data/workflow/rules/last_whole_genome_alignment.smk @@ -1,79 +1,79 @@ -rule last_db: #Rule for constructing LAST index - input: - reference = "{0}/genomes/Nipponbare/Npb.fasta.gz".format(config["input_data_dir"]) - output: - touch("{0}/last_index/index.done".format(config["processed_data_dir"])) - params: - index_basename = "{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) - conda: - "../env/lastal.yaml" - shell: - "lastdb -P4 -uNEAR {params.index_basename} {input.reference}" - -rule last_score_training: - input: - Nb_index_flag = "{0}/last_index/index.done".format(config["processed_data_dir"]), - query_genome= "{0}/genomes/{{other_ref}}/{{other_ref}}.fasta.gz".format(config["input_data_dir"]) - output: - train_out = "{0}/last_training/Nb_{{other_ref}}".format(config["processed_data_dir"]) - params: - index_basename="{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) - threads: config["threads"] - conda: - "../env/lastal.yaml" - shell: - """ - last-train -P {threads} --revsym -E0.05 -C2 --sample-number=500 {params.index_basename} {input.query_genome} \ - > {output.train_out} - """ - -rule last_align_one_to_many: - input: - 
Nb_index_flag = "{0}/last_index/index.done".format(config["processed_data_dir"]), - other_ref = "{0}/genomes/{{other_ref}}/{{other_ref}}.fasta.gz".format(config["input_data_dir"]), - score_training_out= "{0}/last_training/Nb_{{other_ref}}".format(config["processed_data_dir"]) - output: - one_to_many_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_many.maf".format(config["processed_data_dir"]) - params: - index_basename="{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) - threads: config["threads"] - conda: - "../env/lastal.yaml" - shell: - """ - lastal -P {threads} -D1e8 -m20 -C2 --split-f=MAF+ -p {input.score_training_out} {params.index_basename} {input.other_ref} > {output.one_to_many_alignment} - """ - -rule last_align_one_to_one: - input: - one_to_many_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_many.maf".format(config["processed_data_dir"]) - output: - one_to_one_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_one.maf".format(config["processed_data_dir"]) - conda: - "../env/lastal.yaml" - shell: - """ - last-split -r -m1e-5 {input.one_to_many_alignment} > {output.one_to_one_alignment} - """ - -rule convert_gff: - input: - one_to_one_alignment="{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_one.maf".format(config["processed_data_dir"]) - output: - gff = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff".format(config["processed_data_dir"]) - conda: - "../env/lastal.yaml" - shell: - "maf-convert gff {input.one_to_one_alignment} > {output.gff}" - -rule build_gff_db: - input: - gff = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff".format(config["processed_data_dir"]) - output: - gff_db = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff.db".format(config["processed_data_dir"]) - conda: - "../env/gffutils.yaml" - shell: - "python scripts/gff_db.py {input.gff} {output.gff_db}" - - +rule last_db: #Rule for constructing LAST index + input: + reference = "{0}/genomes/Nipponbare/Npb.fasta.gz".format(config["input_data_dir"]) + output: + touch("{0}/last_index/index.done".format(config["processed_data_dir"])) + params: + index_basename = "{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) + conda: + "../env/lastal.yaml" + shell: + "lastdb -P4 -uNEAR {params.index_basename} {input.reference}" + +rule last_score_training: + input: + Nb_index_flag = "{0}/last_index/index.done".format(config["processed_data_dir"]), + query_genome= "{0}/genomes/{{other_ref}}/{{other_ref}}.fasta.gz".format(config["input_data_dir"]) + output: + train_out = "{0}/last_training/Nb_{{other_ref}}".format(config["processed_data_dir"]) + params: + index_basename="{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) + threads: config["threads"] + conda: + "../env/lastal.yaml" + shell: + """ + last-train -P {threads} --revsym -E0.05 -C2 --sample-number=500 {params.index_basename} {input.query_genome} \ + > {output.train_out} + """ + +rule last_align_one_to_many: + input: + Nb_index_flag = "{0}/last_index/index.done".format(config["processed_data_dir"]), + other_ref = "{0}/genomes/{{other_ref}}/{{other_ref}}.fasta.gz".format(config["input_data_dir"]), + score_training_out= "{0}/last_training/Nb_{{other_ref}}".format(config["processed_data_dir"]) + output: + one_to_many_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_many.maf".format(config["processed_data_dir"]) + params: + index_basename="{0}/last_index/Nipponbare_db".format(config["processed_data_dir"]) + threads: 
config["threads"] + conda: + "../env/lastal.yaml" + shell: + """ + lastal -P {threads} -D1e8 -m20 -C2 --split-f=MAF+ -p {input.score_training_out} {params.index_basename} {input.other_ref} > {output.one_to_many_alignment} + """ + +rule last_align_one_to_one: + input: + one_to_many_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_many.maf".format(config["processed_data_dir"]) + output: + one_to_one_alignment = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_one.maf".format(config["processed_data_dir"]) + conda: + "../env/lastal.yaml" + shell: + """ + last-split -r -m1e-5 {input.one_to_many_alignment} > {output.one_to_one_alignment} + """ + +rule convert_gff: + input: + one_to_one_alignment="{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}_one_to_one.maf".format(config["processed_data_dir"]) + output: + gff = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff".format(config["processed_data_dir"]) + conda: + "../env/lastal.yaml" + shell: + "maf-convert gff {input.one_to_one_alignment} > {output.gff}" + +rule build_gff_db: + input: + gff = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff".format(config["processed_data_dir"]) + output: + gff_db = "{0}/alignments/Nb_{{other_ref}}/Nb_{{other_ref}}.gff.db".format(config["processed_data_dir"]) + conda: + "../env/gffutils.yaml" + shell: + "python scripts/gff_db.py {input.gff} {output.gff_db}" + + diff --git a/prepare_data/workflow/rules/prepare_annotation.smk b/prepare_data/workflow/rules/prepare_annotation.smk index d3f901a2..d5ef0338 100644 --- a/prepare_data/workflow/rules/prepare_annotation.smk +++ b/prepare_data/workflow/rules/prepare_annotation.smk @@ -1,20 +1,20 @@ -def get_annot_url(wildcards): - return config["annotation_links"][wildcards.other_ref] - -rule download_annotation: - output: - filename = "{0}/annotations/{{other_ref}}/{{other_ref}}.gff.gz".format(config["processed_data_dir"]) - params: - url = get_annot_url - shell: - "wget {params.url} -O {output.filename}" - -rule build_annot_gff_db: - input: - gff="{0}/annotations/{{other_ref}}/{{other_ref}}.gff.gz".format(config["processed_data_dir"]) - output: - gff_db = "{0}/annotations/{{other_ref}}/{{other_ref}}.gff.db".format(config["processed_data_dir"]) - conda: - "../env/gffutils.yaml" - shell: +def get_annot_url(wildcards): + return config["annotation_links"][wildcards.other_ref] + +rule download_annotation: + output: + filename = "{0}/annotations/{{other_ref}}/{{other_ref}}.gff.gz".format(config["processed_data_dir"]) + params: + url = get_annot_url + shell: + "wget {params.url} -O {output.filename}" + +rule build_annot_gff_db: + input: + gff="{0}/annotations/{{other_ref}}/{{other_ref}}.gff.gz".format(config["processed_data_dir"]) + output: + gff_db = "{0}/annotations/{{other_ref}}/{{other_ref}}.gff.db".format(config["processed_data_dir"]) + conda: + "../env/gffutils.yaml" + shell: "python scripts/gff_db.py {input.gff} {output.gff_db}" \ No newline at end of file diff --git a/prepare_data/workflow/rules/prepare_gene_descriptions.smk b/prepare_data/workflow/rules/prepare_gene_descriptions.smk index 9463f108..6bb149df 100644 --- a/prepare_data/workflow/rules/prepare_gene_descriptions.smk +++ b/prepare_data/workflow/rules/prepare_gene_descriptions.smk @@ -1,2 +1,2 @@ -python prepare_df_rgi_gene_description.py +python prepare_df_rgi_gene_description.py {genome}/{gene_description_file} {genome}/ {genome}_gene_descriptions.csv \ No newline at end of file diff --git a/prepare_data/workflow/rules/transcription_factor_binding_sites.smk 
b/prepare_data/workflow/rules/transcription_factor_binding_sites.smk index 08c0ce3e..96dd6abf 100644 --- a/prepare_data/workflow/rules/transcription_factor_binding_sites.smk +++ b/prepare_data/workflow/rules/transcription_factor_binding_sites.smk @@ -1,29 +1,29 @@ -rule get_promoter_sequences: - input: - genome = "{0}/genomes/Nipponbare/Npb.fasta".format(config["input_data_dir"]), - gff_db = "{0}/annotations/Nb/Nb.gff.db".format(config["input_data_dir"]) - params: - upstream_win_len = 10, - downstream_win_len = 0 - output: - out_fasta = "{0}/promoter_seq.fasta".format(config["tfbs_dir"]), - promoter_gene_map = "{0}/promoter_gene_map".format(config["tfbs_dir"]) - conda: - "../env/gff_biopython.yaml" - shell: - ''' - python scripts/get_promoter_sequences.py {input.genome} {input.gff_db} {params.upstream_win_len} \ - {params.downstream_win_len} {output.out_fasta} {output.promoter_gene_map} - ''' - -# rule fimo_search: -# input: -# promoter_seq = "{0}/promoter_seq.fasta".format(config["tfbs_dir"]) -# motifs = "{0}/PlantTFDB_TF_binding_motifs_from_experiments".format(config["tfbs_dir"]) -# output: -# fimo_out = "{0}/fimo_out".format(config["tfbs_dir"]) - -#rule download_tfbs_data: -# wget -O motif -# wget -O motif_CE -# wget -O FunTFBS +rule get_promoter_sequences: + input: + genome = "{0}/genomes/Nipponbare/Npb.fasta".format(config["input_data_dir"]), + gff_db = "{0}/annotations/Nb/Nb.gff.db".format(config["input_data_dir"]) + params: + upstream_win_len = 10, + downstream_win_len = 0 + output: + out_fasta = "{0}/promoter_seq.fasta".format(config["tfbs_dir"]), + promoter_gene_map = "{0}/promoter_gene_map".format(config["tfbs_dir"]) + conda: + "../env/gff_biopython.yaml" + shell: + ''' + python scripts/get_promoter_sequences.py {input.genome} {input.gff_db} {params.upstream_win_len} \ + {params.downstream_win_len} {output.out_fasta} {output.promoter_gene_map} + ''' + +# rule fimo_search: +# input: +# promoter_seq = "{0}/promoter_seq.fasta".format(config["tfbs_dir"]) +# motifs = "{0}/PlantTFDB_TF_binding_motifs_from_experiments".format(config["tfbs_dir"]) +# output: +# fimo_out = "{0}/fimo_out".format(config["tfbs_dir"]) + +#rule download_tfbs_data: +# wget -O motif +# wget -O motif_CE +# wget -O FunTFBS diff --git a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/go-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/go-enrichment.r index 63d9d360..b83d883a 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/go-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/go-enrichment.r @@ -1,76 +1,76 @@ -# library(ggplot2) -library(optparse) -library(GO.db) -library(clusterProfiler) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be MSU accessions)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-m", "--go_to_gene_mapping"), - type = "character", default = NULL, - help = "text file mapping the GO IDs to the genes" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data frame and plot showing the enriched GO terms" - ) -) - -opt_parser <- 
OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -go <- enricher( - gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), - universe = unlist(strsplit(readLines(opt$background_genes), "\t")), - TERM2GENE = read.table(opt$go_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), - TERM2NAME = data.frame("GOID" = names(Term(GOTERM)), "term" = Term(GOTERM)) -) - -print("Finished enrichment analysis") - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) -} - -# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { -# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) -# } - -go_df <- as.data.frame(go) -write.table(go_df, paste0(opt$output_dir, "/results/go-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = FALSE, quote = FALSE -) - -if (nrow(go_df) > 0) { - # plot <- dotplot(go, - # showCategory = nrow(go_df), - # title = "Enriched GO Terms", - # font.size = 10 - # ) - - # ggsave(plot, - # filename = paste0(opt$output_dir, "/plots/go-dotplot-", opt$module_index, ".png"), - # height = max(c(22, nrow(go_df))), width = 22, units = "cm" - # ) - - print(paste0( - "Generated data frame and dot plot showing the enriched GO terms for module #", - opt$module_index - )) -} else { - print(paste0("No GO terms enriched for module #", opt$module_index)) -} +# library(ggplot2) +library(optparse) +library(GO.db) +library(clusterProfiler) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be MSU accessions)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = "text file containing the background genes" + ), + make_option(c("-m", "--go_to_gene_mapping"), + type = "character", default = NULL, + help = "text file mapping the GO IDs to the genes" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched GO terms" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +go <- enricher( + gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), + universe = unlist(strsplit(readLines(opt$background_genes), "\t")), + TERM2GENE = read.table(opt$go_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), + TERM2NAME = data.frame("GOID" = names(Term(GOTERM)), "term" = Term(GOTERM)) +) + +print("Finished enrichment analysis") + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) +} + +# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { +# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) +# } + +go_df <- as.data.frame(go) +write.table(go_df, paste0(opt$output_dir, "/results/go-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = FALSE, quote = FALSE +) + +if (nrow(go_df) > 0) { + # plot <- dotplot(go, + # showCategory = nrow(go_df), + # title = "Enriched GO Terms", + # font.size = 10 + # ) + + # ggsave(plot, + # filename = paste0(opt$output_dir, "/plots/go-dotplot-", 
opt$module_index, ".png"), + # height = max(c(22, nrow(go_df))), width = 22, units = "cm" + # ) + + print(paste0( + "Generated data frame and dot plot showing the enriched GO terms for module #", + opt$module_index + )) +} else { + print(paste0("No GO terms enriched for module #", opt$module_index)) +} diff --git a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/po-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/po-enrichment.r index 0d27b005..50beac0d 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/po-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/po-enrichment.r @@ -1,79 +1,79 @@ -# library(ggplot2) -library(optparse) -library(clusterProfiler) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be MSU accessions)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-m", "--po_to_gene_mapping"), - type = "character", default = NULL, - help = "text file mapping the PO IDs to the genes" - ), - make_option(c("-t", "--po_to_name_mapping"), - type = "character", default = NULL, - help = "text file mapping the PO IDs to the PO names" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data frame and plot showing the enriched PO terms" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -po <- enricher( - gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), - universe = unlist(strsplit(readLines(opt$background_genes), "\t")), - TERM2GENE = read.table(opt$po_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), - TERM2NAME = read.table(opt$po_to_name_mapping, sep = "\t", stringsAsFactors = FALSE) -) - -print("Finished enrichment analysis") - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) -} - -# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { -# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) -# } - -po_df <- as.data.frame(po) -write.table(po_df, paste0(opt$output_dir, "/results/po-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = FALSE, quote = FALSE -) - -if (nrow(po_df) > 0) { - # plot <- dotplot(po, - # showCategory = nrow(po_df), - # title = "Enriched PO Terms", - # font.size = 10 - # ) - - # ggsave(plot, - # filename = paste0(opt$output_dir, "/plots/po-dotplot-", opt$module_index, ".png"), - # height = max(c(22, nrow(po_df))), width = 22, units = "cm" - # ) - - print(paste0( - "Generated data frame and dot plot showing the enriched PO terms for module #", - opt$module_index - )) -} else { - print(paste0("No PO terms enriched for module #", opt$module_index)) -} +# library(ggplot2) +library(optparse) +library(clusterProfiler) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be MSU accessions)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = 
NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = "text file containing the background genes" + ), + make_option(c("-m", "--po_to_gene_mapping"), + type = "character", default = NULL, + help = "text file mapping the PO IDs to the genes" + ), + make_option(c("-t", "--po_to_name_mapping"), + type = "character", default = NULL, + help = "text file mapping the PO IDs to the PO names" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched PO terms" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +po <- enricher( + gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), + universe = unlist(strsplit(readLines(opt$background_genes), "\t")), + TERM2GENE = read.table(opt$po_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), + TERM2NAME = read.table(opt$po_to_name_mapping, sep = "\t", stringsAsFactors = FALSE) +) + +print("Finished enrichment analysis") + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) +} + +# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { +# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) +# } + +po_df <- as.data.frame(po) +write.table(po_df, paste0(opt$output_dir, "/results/po-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = FALSE, quote = FALSE +) + +if (nrow(po_df) > 0) { + # plot <- dotplot(po, + # showCategory = nrow(po_df), + # title = "Enriched PO Terms", + # font.size = 10 + # ) + + # ggsave(plot, + # filename = paste0(opt$output_dir, "/plots/po-dotplot-", opt$module_index, ".png"), + # height = max(c(22, nrow(po_df))), width = 22, units = "cm" + # ) + + print(paste0( + "Generated data frame and dot plot showing the enriched PO terms for module #", + opt$module_index + )) +} else { + print(paste0("No PO terms enriched for module #", opt$module_index)) +} diff --git a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/to-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/to-enrichment.r index 0207095c..5fc554de 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/to-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/ontology_enrichment/to-enrichment.r @@ -1,79 +1,79 @@ -# library(ggplot2) -library(optparse) -library(clusterProfiler) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be MSU accessions)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-m", "--to_to_gene_mapping"), - type = "character", default = NULL, - help = "text file mapping the TO IDs to the genes" - ), - make_option(c("-t", "--to_to_name_mapping"), - type = "character", default = NULL, - help = "text file mapping the TO IDs to the TO names" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data 
frame and plot showing the enriched TO terms" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -to <- enricher( - gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), - universe = unlist(strsplit(readLines(opt$background_genes), "\t")), - TERM2GENE = read.table(opt$to_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), - TERM2NAME = read.table(opt$to_to_name_mapping, sep = "\t", stringsAsFactors = FALSE) -) - -print("Finished enrichment analysis") - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) -} - -# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { -# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) -# } - -to_df <- as.data.frame(to) -write.table(to_df, paste0(opt$output_dir, "/results/to-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = FALSE, quote = FALSE -) - -if (nrow(to_df) > 0) { - # plot <- dotplot(to, - # showCategory = nrow(to_df), - # title = "Enriched TO Terms", - # font.size = 10 - # ) - - # ggsave(plot, - # filename = paste0(opt$output_dir, "/plots/to-dotplot-", opt$module_index, ".png"), - # height = max(c(22, nrow(to_df))), width = 22, units = "cm" - # ) - - print(paste0( - "Generated data frame and dot plot showing the enriched TO terms for module #", - opt$module_index - )) -} else { - print(paste0("No TO terms enriched for module #", opt$module_index)) -} +# library(ggplot2) +library(optparse) +library(clusterProfiler) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be MSU accessions)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = "text file containing the background genes" + ), + make_option(c("-m", "--to_to_gene_mapping"), + type = "character", default = NULL, + help = "text file mapping the TO IDs to the genes" + ), + make_option(c("-t", "--to_to_name_mapping"), + type = "character", default = NULL, + help = "text file mapping the TO IDs to the TO names" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched TO terms" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +to <- enricher( + gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), + universe = unlist(strsplit(readLines(opt$background_genes), "\t")), + TERM2GENE = read.table(opt$to_to_gene_mapping, sep = "\t", stringsAsFactors = FALSE), + TERM2NAME = read.table(opt$to_to_name_mapping, sep = "\t", stringsAsFactors = FALSE) +) + +print("Finished enrichment analysis") + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) +} + +# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { +# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) +# } + +to_df <- as.data.frame(to) +write.table(to_df, paste0(opt$output_dir, "/results/to-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = FALSE, quote = FALSE +) + +if 
(nrow(to_df) > 0) { + # plot <- dotplot(to, + # showCategory = nrow(to_df), + # title = "Enriched TO Terms", + # font.size = 10 + # ) + + # ggsave(plot, + # filename = paste0(opt$output_dir, "/plots/to-dotplot-", opt$module_index, ".png"), + # height = max(c(22, nrow(to_df))), width = 22, units = "cm" + # ) + + print(paste0( + "Generated data frame and dot plot showing the enriched TO terms for module #", + opt$module_index + )) +} else { + print(paste0("No TO terms enriched for module #", opt$module_index)) +} diff --git a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/ora-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/ora-enrichment.r index 1f830d90..74b1fed6 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/ora-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/ora-enrichment.r @@ -1,71 +1,71 @@ -# library(ggplot2) -library(optparse) -library(clusterProfiler) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data frame and plot showing the enriched pathways" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -kegg <- enrichKEGG( - gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), - universe = unlist(strsplit(readLines(opt$background_genes), "\t")), - organism = "dosa", - keyType = "kegg", -) - -print("Finished enrichment analysis") - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) -} - -# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { -# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) -# } - -kegg_df <- as.data.frame(kegg) -write.table(kegg_df, paste0(opt$output_dir, "/results/ora-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = FALSE, quote = FALSE -) - -if (nrow(kegg_df) > 0) { - # plot <- dotplot(kegg, - # showCategory = nrow(kegg_df), - # title = "Enriched KEGG Pathways", - # font.size = 10 - # ) - - # ggsave(plot, - # filename = paste0(opt$output_dir, "/plots/ora-dotplot-", opt$module_index, ".png"), - # height = max(c(22, nrow(kegg_df))), width = 22, units = "cm" - # ) - - print(paste0( - "Generated data frame and dot plot showing the enriched KEGG pathways for module #", - opt$module_index - )) -} else { - print(paste0("No KEGG pathways enriched for module #", opt$module_index)) +# library(ggplot2) +library(optparse) +library(clusterProfiler) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = 
"text file containing the background genes" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched pathways" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +kegg <- enrichKEGG( + gene = unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]), + universe = unlist(strsplit(readLines(opt$background_genes), "\t")), + organism = "dosa", + keyType = "kegg", +) + +print("Finished enrichment analysis") + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) +} + +# if (!dir.exists(paste0(opt$output_dir, "/plots"))) { +# dir.create(paste0(opt$output_dir, "/plots"), recursive = TRUE) +# } + +kegg_df <- as.data.frame(kegg) +write.table(kegg_df, paste0(opt$output_dir, "/results/ora-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = FALSE, quote = FALSE +) + +if (nrow(kegg_df) > 0) { + # plot <- dotplot(kegg, + # showCategory = nrow(kegg_df), + # title = "Enriched KEGG Pathways", + # font.size = 10 + # ) + + # ggsave(plot, + # filename = paste0(opt$output_dir, "/plots/ora-dotplot-", opt$module_index, ".png"), + # height = max(c(22, nrow(kegg_df))), width = 22, units = "cm" + # ) + + print(paste0( + "Generated data frame and dot plot showing the enriched KEGG pathways for module #", + opt$module_index + )) +} else { + print(paste0("No KEGG pathways enriched for module #", opt$module_index)) } \ No newline at end of file diff --git a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/pe-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/pe-enrichment.r index 22275dec..66ee6217 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/pe-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/pe-enrichment.r @@ -1,62 +1,62 @@ -library(graphite) -library(optparse) -library(ROntoTools) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data frame and plot showing the enriched pathways" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -genes <- paste0("dosa:", unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index])) -dummy_val <- 20 -dummy_fc <- replicate(length(genes), dummy_val) -input_data <- setNames(dummy_fc, genes) - -kpg <- setNodeWeights(setEdgeWeights(keggPathwayGraphs("dosa"))) - -pe_results <- pe(input_data, - graphs = kpg, - ref = paste0("dosa:", unlist(strsplit(readLines(opt$background_genes), "\t"))), - nboot = 2000, verbose = TRUE -) - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) -} - -kegg_df <- summary(pe_results) 
-write.table(kegg_df, paste0(opt$output_dir, "/results/pe-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = TRUE, quote = FALSE -) - -cat("\n") - -if (nrow(kegg_df) > 0) { - print(paste0( - "Generated data frame showing the enriched KEGG pathways for module #", - opt$module_index - )) -} else { - print(paste0("No KEGG pathways enriched for module #", opt$module_index)) +library(graphite) +library(optparse) +library(ROntoTools) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = "text file containing the background genes" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched pathways" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +genes <- paste0("dosa:", unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index])) +dummy_val <- 20 +dummy_fc <- replicate(length(genes), dummy_val) +input_data <- setNames(dummy_fc, genes) + +kpg <- setNodeWeights(setEdgeWeights(keggPathwayGraphs("dosa"))) + +pe_results <- pe(input_data, + graphs = kpg, + ref = paste0("dosa:", unlist(strsplit(readLines(opt$background_genes), "\t"))), + nboot = 2000, verbose = TRUE +) + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) +} + +kegg_df <- summary(pe_results) +write.table(kegg_df, paste0(opt$output_dir, "/results/pe-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = TRUE, quote = FALSE +) + +cat("\n") + +if (nrow(kegg_df) > 0) { + print(paste0( + "Generated data frame showing the enriched KEGG pathways for module #", + opt$module_index + )) +} else { + print(paste0("No KEGG pathways enriched for module #", opt$module_index)) } \ No newline at end of file diff --git a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/spia-enrichment.r b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/spia-enrichment.r index 5da79eff..6d35f229 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/spia-enrichment.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/pathway_enrichment/spia-enrichment.r @@ -1,100 +1,100 @@ -library(graphite) -library(optparse) -library(SPIA) - -option_list <- list( - make_option(c("-g", "--modules"), - type = "character", default = NULL, - help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" - ), - make_option(c("-i", "--module_index"), - type = "integer", default = NULL, - help = "index of the module of interest (first module is index 1)" - ), - make_option(c("-b", "--background_genes"), - type = "character", default = NULL, - help = "text file containing the background genes" - ), - make_option(c("-p", "--pathways_dir"), - type = "character", default = NA, - help = "directory containing the XML files of the KEGG pathways" - ), - make_option(c("-s", "--spia_pathway_dir"), - type = "character", default = NULL, - help = "output directory for the SPIA RData file generated after processing the XML 
files of the KEGG pathways" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the data frame and plot showing the enriched pathways" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -if (!is.na(opt$pathways_dir)) { - if (!dir.exists(opt$spia_pathway_dir)) { - dir.create(opt$spia_pathway_dir, recursive = TRUE) - } - - makeSPIAdata( - kgml.path = opt$pathways_dir, - organism = "dosa", out.path = opt$spia_pathway_dir - ) -} - -genes <- unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]) -dummy_val <- 20 -dummy_fc <- replicate(length(genes), dummy_val) -input_data <- setNames(dummy_fc, genes) - -tryCatch({ - spia_results <- spia( - de = input_data, - all = unlist(strsplit(readLines(opt$background_genes), "\t")), - organism = "dosa", - data.dir = paste0(opt$spia_pathway_dir, "/") - ) - - if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) - } - - if (!dir.exists(paste0(opt$output_dir, "/results"))) { - dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) - } - - kegg_df <- as.data.frame(spia_results) - write.table(kegg_df, - paste0(opt$output_dir, "/results/spia-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = TRUE, quote = FALSE - ) - - cat("\n") - - if (nrow(kegg_df) > 0) { - print(paste0( - "Generated data frame showing the enriched KEGG pathways for module #", - opt$module_index - )) - } else { - print(paste0("No KEGG pathways enriched for module #", opt$module_index)) - } -}, error = function(err) { - kegg_df <- data.frame() - write.table(kegg_df, - paste0(opt$output_dir, "/results/spia-df-", opt$module_index, ".tsv"), - sep = "\t", row.names = TRUE, quote = FALSE - ) - - cat("\n") - - if (nrow(kegg_df) > 0) { - print(paste0( - "Generated data frame showing the enriched KEGG pathways for module #", - opt$module_index - )) - } else { - print(paste0("No KEGG pathways enriched for module #", opt$module_index)) - } +library(graphite) +library(optparse) +library(SPIA) + +option_list <- list( + make_option(c("-g", "--modules"), + type = "character", default = NULL, + help = "text file containing the modules (gene IDs should be KEGG transcript IDs)" + ), + make_option(c("-i", "--module_index"), + type = "integer", default = NULL, + help = "index of the module of interest (first module is index 1)" + ), + make_option(c("-b", "--background_genes"), + type = "character", default = NULL, + help = "text file containing the background genes" + ), + make_option(c("-p", "--pathways_dir"), + type = "character", default = NA, + help = "directory containing the XML files of the KEGG pathways" + ), + make_option(c("-s", "--spia_pathway_dir"), + type = "character", default = NULL, + help = "output directory for the SPIA RData file generated after processing the XML files of the KEGG pathways" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the data frame and plot showing the enriched pathways" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +if (!is.na(opt$pathways_dir)) { + if (!dir.exists(opt$spia_pathway_dir)) { + dir.create(opt$spia_pathway_dir, recursive = TRUE) + } + + makeSPIAdata( + kgml.path = opt$pathways_dir, + organism = "dosa", out.path = opt$spia_pathway_dir + ) +} + +genes <- unlist(strsplit(readLines(opt$modules), "\t")[opt$module_index]) +dummy_val <- 20 +dummy_fc <- replicate(length(genes), dummy_val) 
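+# Note: spia() expects a named vector of log fold changes for the DE genes.
+# A co-expression module is only a gene set with no expression contrast of its
+# own, so each gene is assigned the same placeholder value (dummy_val) and the
+# names carry the module membership.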
+input_data <- setNames(dummy_fc, genes) + +tryCatch({ + spia_results <- spia( + de = input_data, + all = unlist(strsplit(readLines(opt$background_genes), "\t")), + organism = "dosa", + data.dir = paste0(opt$spia_pathway_dir, "/") + ) + + if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) + } + + if (!dir.exists(paste0(opt$output_dir, "/results"))) { + dir.create(paste0(opt$output_dir, "/results"), recursive = TRUE) + } + + kegg_df <- as.data.frame(spia_results) + write.table(kegg_df, + paste0(opt$output_dir, "/results/spia-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = TRUE, quote = FALSE + ) + + cat("\n") + + if (nrow(kegg_df) > 0) { + print(paste0( + "Generated data frame showing the enriched KEGG pathways for module #", + opt$module_index + )) + } else { + print(paste0("No KEGG pathways enriched for module #", opt$module_index)) + } +}, error = function(err) { + kegg_df <- data.frame() + write.table(kegg_df, + paste0(opt$output_dir, "/results/spia-df-", opt$module_index, ".tsv"), + sep = "\t", row.names = TRUE, quote = FALSE + ) + + cat("\n") + + if (nrow(kegg_df) > 0) { + print(paste0( + "Generated data frame showing the enriched KEGG pathways for module #", + opt$module_index + )) + } else { + print(paste0("No KEGG pathways enriched for module #", opt$module_index)) + } }) \ No newline at end of file diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-go-annotations.py b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-go-annotations.py index d26b8216..ebe3f3e7 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-go-annotations.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-go-annotations.py @@ -1,139 +1,139 @@ -import csv -import os -import pickle -import re -from collections import defaultdict - -import pandas as pd - - -def from_agrigo(agrigo_file): - annotations = [] - with open(agrigo_file) as agrigo: - csv_reader = csv.reader(agrigo, delimiter='\t') - for line in csv_reader: - id = line[-2] - go = line[-1] - - # Handle dirty data, like "LOC_ Os07g22494" - id = id.replace(" ", "") - annotations.append([go, id]) - - return pd.DataFrame(annotations) - - -def from_oryzabase(oryzabase_file): - annotations = [] - with open(oryzabase_file, encoding='utf-8') as oryzabase: - csv_reader = csv.reader(oryzabase, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - id = line[13] - id_components = id.split('.') - id = id_components[0].strip() - - go_terms = line[-3] - go_components = go_terms.split(', ') - for go in go_components: - go = go.split(' - ') - go = go[0].strip() - - # Handle dirty data, like "LOC_ Os07g22494" - id = id.replace(" ", "") - - if id and go and re.compile('GO:\d+').match(go): - annotations.append([go, id]) - - return pd.DataFrame(annotations) - - -def construct_rap_db_mapping(rap_db_file): - rap_db_mapping = defaultdict(list) - - with open(rap_db_file) as rap_db: - csv_reader = csv.reader(rap_db, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - go_ids = re.findall(r'GO:\d+', line[9]) - rap_db_mapping[line[0].rstrip().replace(" ", "")] = go_ids - - return rap_db_mapping - - -def convert_transcript_to_msu(msu_to_transcript_mapping_file): - with open(msu_to_transcript_mapping_file, 'rb') as mapping: - mapping_dict = pickle.load(mapping) - transcript_to_msu_mapping = defaultdict(set) - - for msu_id, transcript_ids in mapping_dict.items(): - for transcript_id in transcript_ids: - 
transcript_to_msu_mapping[transcript_id].add(msu_id) - - return transcript_to_msu_mapping - - -def from_rap_db(rap_db_file, all_genes_file, msu_to_transcript_file): - annotations = [] - rap_db_mapping = construct_rap_db_mapping(rap_db_file) - transcript_to_msu_mapping = convert_transcript_to_msu( - msu_to_transcript_file) - - with open(all_genes_file) as all_genes: - csv_reader = csv.reader(all_genes, delimiter='\t') - all_genes_list = [] - for line in csv_reader: - all_genes_list += line - - for gene in all_genes_list: - for go_id in rap_db_mapping[gene]: - for msu_id in transcript_to_msu_mapping[gene]: - msu_id = msu_id.replace(" ", "") - annotations.append([go_id, msu_id]) - - return pd.DataFrame(annotations) - - -def merge_annotations(*args): - merged_df = pd.concat(args) - merged_df.drop_duplicates(ignore_index=True, inplace=True) - - return merged_df - - -def save_to_csv(output_dir, merged_df): - if not os.path.exists(f'{output_dir}'): - os.makedirs(f'{output_dir}') - - merged_df.to_csv(f'{output_dir}/go-annotations.tsv', - sep='\t', index=False, header=False) - print(f'Generated {output_dir}/go-annotations.tsv') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'agrigo_file', help='text file containing the GO annotations from agriGO v2.0') - parser.add_argument( - 'oryzabase_file', help='text file containing the GO annotations from Oryzabase') - parser.add_argument( - 'rap_db_file', help='text file containing the GO annotations from RAP-DB') - parser.add_argument( - 'all_genes_file', help='text file containing the RAP-DB accessions of all the genes of interest') - parser.add_argument( - 'msu_to_transcript_file', help='pickled dictionary mapping MSU accessions to KEGG transcript IDs') - parser.add_argument( - 'output_dir', help='output directory for the TSV file mapping MSU accessions to GO term IDs') - - args = parser.parse_args() - - agrigo_df = from_agrigo(args.agrigo_file) - oryzabase_df = from_oryzabase(args.oryzabase_file) - rap_db_df = from_rap_db( - args.rap_db_file, args.all_genes_file, args.msu_to_transcript_file) - - merged_df = merge_annotations(agrigo_df, oryzabase_df, rap_db_df) - save_to_csv(args.output_dir, merged_df) +import csv +import os +import pickle +import re +from collections import defaultdict + +import pandas as pd + + +def from_agrigo(agrigo_file): + annotations = [] + with open(agrigo_file) as agrigo: + csv_reader = csv.reader(agrigo, delimiter='\t') + for line in csv_reader: + id = line[-2] + go = line[-1] + + # Handle dirty data, like "LOC_ Os07g22494" + id = id.replace(" ", "") + annotations.append([go, id]) + + return pd.DataFrame(annotations) + + +def from_oryzabase(oryzabase_file): + annotations = [] + with open(oryzabase_file, encoding='utf-8') as oryzabase: + csv_reader = csv.reader(oryzabase, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + id = line[13] + id_components = id.split('.') + id = id_components[0].strip() + + go_terms = line[-3] + go_components = go_terms.split(', ') + for go in go_components: + go = go.split(' - ') + go = go[0].strip() + + # Handle dirty data, like "LOC_ Os07g22494" + id = id.replace(" ", "") + + if id and go and re.compile('GO:\d+').match(go): + annotations.append([go, id]) + + return pd.DataFrame(annotations) + + +def construct_rap_db_mapping(rap_db_file): + rap_db_mapping = defaultdict(list) + + with open(rap_db_file) as rap_db: + csv_reader = csv.reader(rap_db, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + go_ids = 
re.findall(r'GO:\d+', line[9]) + rap_db_mapping[line[0].rstrip().replace(" ", "")] = go_ids + + return rap_db_mapping + + +def convert_transcript_to_msu(msu_to_transcript_mapping_file): + with open(msu_to_transcript_mapping_file, 'rb') as mapping: + mapping_dict = pickle.load(mapping) + transcript_to_msu_mapping = defaultdict(set) + + for msu_id, transcript_ids in mapping_dict.items(): + for transcript_id in transcript_ids: + transcript_to_msu_mapping[transcript_id].add(msu_id) + + return transcript_to_msu_mapping + + +def from_rap_db(rap_db_file, all_genes_file, msu_to_transcript_file): + annotations = [] + rap_db_mapping = construct_rap_db_mapping(rap_db_file) + transcript_to_msu_mapping = convert_transcript_to_msu( + msu_to_transcript_file) + + with open(all_genes_file) as all_genes: + csv_reader = csv.reader(all_genes, delimiter='\t') + all_genes_list = [] + for line in csv_reader: + all_genes_list += line + + for gene in all_genes_list: + for go_id in rap_db_mapping[gene]: + for msu_id in transcript_to_msu_mapping[gene]: + msu_id = msu_id.replace(" ", "") + annotations.append([go_id, msu_id]) + + return pd.DataFrame(annotations) + + +def merge_annotations(*args): + merged_df = pd.concat(args) + merged_df.drop_duplicates(ignore_index=True, inplace=True) + + return merged_df + + +def save_to_csv(output_dir, merged_df): + if not os.path.exists(f'{output_dir}'): + os.makedirs(f'{output_dir}') + + merged_df.to_csv(f'{output_dir}/go-annotations.tsv', + sep='\t', index=False, header=False) + print(f'Generated {output_dir}/go-annotations.tsv') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'agrigo_file', help='text file containing the GO annotations from agriGO v2.0') + parser.add_argument( + 'oryzabase_file', help='text file containing the GO annotations from Oryzabase') + parser.add_argument( + 'rap_db_file', help='text file containing the GO annotations from RAP-DB') + parser.add_argument( + 'all_genes_file', help='text file containing the RAP-DB accessions of all the genes of interest') + parser.add_argument( + 'msu_to_transcript_file', help='pickled dictionary mapping MSU accessions to KEGG transcript IDs') + parser.add_argument( + 'output_dir', help='output directory for the TSV file mapping MSU accessions to GO term IDs') + + args = parser.parse_args() + + agrigo_df = from_agrigo(args.agrigo_file) + oryzabase_df = from_oryzabase(args.oryzabase_file) + rap_db_df = from_rap_db( + args.rap_db_file, args.all_genes_file, args.msu_to_transcript_file) + + merged_df = merge_annotations(agrigo_df, oryzabase_df, rap_db_df) + save_to_csv(args.output_dir, merged_df) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-po-annotations.py b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-po-annotations.py index 916dc9ed..a6679a57 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-po-annotations.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-po-annotations.py @@ -1,91 +1,91 @@ -import csv -import os -import re - -import pandas as pd - - -def get_annotations(oryzabase_file): - annotations = [] - with open(oryzabase_file, encoding='utf-8') as oryzabase: - csv_reader = csv.reader(oryzabase, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - id = line[13] - id_components = id.split('.') - id = id_components[0].strip() - - po_terms = line[-1] - po_components = po_terms.split(', ') - for po in po_components: - po = po.split(' - ') - 
po = po[0].strip() - - # Handle dirty data, like "LOC_ Os07g22494" - id = id.replace(" ", "") - - if id and po and re.compile('PO:\d+').match(po): - annotations.append([po, id]) - - return pd.DataFrame(annotations) - - -def map_to_id_to_names(oryzabase_file): - annotations = list() - with open(oryzabase_file, encoding='utf-8') as oryzabase: - csv_reader = csv.reader(oryzabase, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - po_terms = line[-1] - po_components = po_terms.split(', ') - for po in po_components: - po = po.split(' - ') - - try: - po_id = po[0].strip() - # Remove translation - po_name = po[1] - po_name = po_name[:po_name.index('_')].strip() - - except IndexError: - continue - - if po_id and re.compile('PO:\d+').match(po_id) and po_name: - annotations.append([po_id, po_name]) - - return pd.DataFrame(annotations).drop_duplicates(ignore_index=True) - - -def save_annotations_to_csv(annotations_df, output_dir): - if not os.path.exists(f'{output_dir}'): - os.makedirs(f'{output_dir}') - - annotations_df.to_csv(f'{output_dir}/po-annotations.tsv', - sep='\t', index=False, header=False) - print(f'Generated {output_dir}/po-annotations.tsv') - - -def save_id_po_names_to_csv(id_to_names_df, output_dir): - id_to_names_df.to_csv(f'{output_dir}/po-id-to-name.tsv', - sep='\t', index=False, header=False) - print(f'Generated {output_dir}/po-id-to-name.tsv') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'oryzabase_file', help='text file containing the PO annotations from Oryzabase') - parser.add_argument( - 'output_dir', help='output directory for the TSV file mapping MSU accessions to TO term IDs') - - args = parser.parse_args() - - annotations_df = get_annotations(args.oryzabase_file) - id_to_names_df = map_to_id_to_names(args.oryzabase_file) - - save_annotations_to_csv(annotations_df, args.output_dir) - save_id_po_names_to_csv(id_to_names_df, args.output_dir) +import csv +import os +import re + +import pandas as pd + + +def get_annotations(oryzabase_file): + annotations = [] + with open(oryzabase_file, encoding='utf-8') as oryzabase: + csv_reader = csv.reader(oryzabase, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + id = line[13] + id_components = id.split('.') + id = id_components[0].strip() + + po_terms = line[-1] + po_components = po_terms.split(', ') + for po in po_components: + po = po.split(' - ') + po = po[0].strip() + + # Handle dirty data, like "LOC_ Os07g22494" + id = id.replace(" ", "") + + if id and po and re.compile('PO:\d+').match(po): + annotations.append([po, id]) + + return pd.DataFrame(annotations) + + +def map_to_id_to_names(oryzabase_file): + annotations = list() + with open(oryzabase_file, encoding='utf-8') as oryzabase: + csv_reader = csv.reader(oryzabase, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + po_terms = line[-1] + po_components = po_terms.split(', ') + for po in po_components: + po = po.split(' - ') + + try: + po_id = po[0].strip() + # Remove translation + po_name = po[1] + po_name = po_name[:po_name.index('_')].strip() + + except IndexError: + continue + + if po_id and re.compile('PO:\d+').match(po_id) and po_name: + annotations.append([po_id, po_name]) + + return pd.DataFrame(annotations).drop_duplicates(ignore_index=True) + + +def save_annotations_to_csv(annotations_df, output_dir): + if not os.path.exists(f'{output_dir}'): + os.makedirs(f'{output_dir}') + + annotations_df.to_csv(f'{output_dir}/po-annotations.tsv', + sep='\t', index=False, 
header=False) + print(f'Generated {output_dir}/po-annotations.tsv') + + +def save_id_po_names_to_csv(id_to_names_df, output_dir): + id_to_names_df.to_csv(f'{output_dir}/po-id-to-name.tsv', + sep='\t', index=False, header=False) + print(f'Generated {output_dir}/po-id-to-name.tsv') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'oryzabase_file', help='text file containing the PO annotations from Oryzabase') + parser.add_argument( + 'output_dir', help='output directory for the TSV file mapping MSU accessions to TO term IDs') + + args = parser.parse_args() + + annotations_df = get_annotations(args.oryzabase_file) + id_to_names_df = map_to_id_to_names(args.oryzabase_file) + + save_annotations_to_csv(annotations_df, args.output_dir) + save_id_po_names_to_csv(id_to_names_df, args.output_dir) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-to-annotations.py b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-to-annotations.py index e20b7616..4183ef7e 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-to-annotations.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/aggregate-to-annotations.py @@ -1,88 +1,88 @@ -import csv -import os -import re - -import pandas as pd - - -def get_annotations(oryzabase_file): - annotations = [] - with open(oryzabase_file, encoding='utf-8') as oryzabase: - csv_reader = csv.reader(oryzabase, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - id = line[13] - id_components = id.split('.') - id = id_components[0].strip() - - to_terms = line[-2] - to_components = to_terms.split(', ') - for to in to_components: - to = to.split(' - ') - to = to[0].strip() - - # Handle dirty data, like "LOC_ Os07g22494" - id = id.replace(" ", "") - - if id and to and re.compile('TO:\d+').match(to): - annotations.append([to, id]) - - return pd.DataFrame(annotations) - - -def map_to_id_to_names(oryzabase_file): - annotations = list() - with open(oryzabase_file, encoding='utf-8') as oryzabase: - csv_reader = csv.reader(oryzabase, delimiter='\t') - next(csv_reader) - - for line in csv_reader: - to_terms = line[-2] - to_components = to_terms.split(', ') - for to in to_components: - to = to.split(' - ') - - try: - to_id = to[0].strip() - to_name = to[1].strip() - except IndexError: - continue - - if to_id and re.compile('TO:\d+').match(to_id) and to_name: - annotations.append([to_id, to_name]) - - return pd.DataFrame(annotations).drop_duplicates(ignore_index=True) - - -def save_annotations_to_csv(annotations_df, output_dir): - if not os.path.exists(f'{output_dir}'): - os.makedirs(f'{output_dir}') - - annotations_df.to_csv(f'{output_dir}/to-annotations.tsv', - sep='\t', index=False, header=False) - print(f'Generated {output_dir}/to-annotations.tsv') - - -def save_id_to_names_to_csv(id_to_names_df, output_dir): - id_to_names_df.to_csv(f'{output_dir}/to-id-to-name.tsv', - sep='\t', index=False, header=False) - print(f'Generated {output_dir}/to-id-to-name.tsv') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'oryzabase_file', help='text file containing the TO annotations from Oryzabase') - parser.add_argument( - 'output_dir', help='output directory for the TSV file mapping MSU accessions to TO term IDs') - - args = parser.parse_args() - - annotations_df = get_annotations(args.oryzabase_file) - id_to_names_df = map_to_id_to_names(args.oryzabase_file) - - 
save_annotations_to_csv(annotations_df, args.output_dir) - save_id_to_names_to_csv(id_to_names_df, args.output_dir) +import csv +import os +import re + +import pandas as pd + + +def get_annotations(oryzabase_file): + annotations = [] + with open(oryzabase_file, encoding='utf-8') as oryzabase: + csv_reader = csv.reader(oryzabase, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + id = line[13] + id_components = id.split('.') + id = id_components[0].strip() + + to_terms = line[-2] + to_components = to_terms.split(', ') + for to in to_components: + to = to.split(' - ') + to = to[0].strip() + + # Handle dirty data, like "LOC_ Os07g22494" + id = id.replace(" ", "") + + if id and to and re.compile('TO:\d+').match(to): + annotations.append([to, id]) + + return pd.DataFrame(annotations) + + +def map_to_id_to_names(oryzabase_file): + annotations = list() + with open(oryzabase_file, encoding='utf-8') as oryzabase: + csv_reader = csv.reader(oryzabase, delimiter='\t') + next(csv_reader) + + for line in csv_reader: + to_terms = line[-2] + to_components = to_terms.split(', ') + for to in to_components: + to = to.split(' - ') + + try: + to_id = to[0].strip() + to_name = to[1].strip() + except IndexError: + continue + + if to_id and re.compile('TO:\d+').match(to_id) and to_name: + annotations.append([to_id, to_name]) + + return pd.DataFrame(annotations).drop_duplicates(ignore_index=True) + + +def save_annotations_to_csv(annotations_df, output_dir): + if not os.path.exists(f'{output_dir}'): + os.makedirs(f'{output_dir}') + + annotations_df.to_csv(f'{output_dir}/to-annotations.tsv', + sep='\t', index=False, header=False) + print(f'Generated {output_dir}/to-annotations.tsv') + + +def save_id_to_names_to_csv(id_to_names_df, output_dir): + id_to_names_df.to_csv(f'{output_dir}/to-id-to-name.tsv', + sep='\t', index=False, header=False) + print(f'Generated {output_dir}/to-id-to-name.tsv') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'oryzabase_file', help='text file containing the TO annotations from Oryzabase') + parser.add_argument( + 'output_dir', help='output directory for the TSV file mapping MSU accessions to TO term IDs') + + args = parser.parse_args() + + annotations_df = get_annotations(args.oryzabase_file) + id_to_names_df = map_to_id_to_names(args.oryzabase_file) + + save_annotations_to_csv(annotations_df, args.output_dir) + save_id_to_names_to_csv(id_to_names_df, args.output_dir) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/file-convert-msu.py b/prepare_data/workflow/scripts/enrichment_analysis/util/file-convert-msu.py index 4eaa9139..4589179f 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/file-convert-msu.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/file-convert-msu.py @@ -1,53 +1,53 @@ -import csv -import os -import pickle - - -def convert_msu(msu_id_file, mapping_file, output_dir, target_id, skip_no_matches): - if not os.path.exists(f'{output_dir}/{target_id}'): - os.makedirs(f'{output_dir}/{target_id}') - - output_file_name = msu_id_file.split('/')[-1] - # Change file extension to tsv - output_file_name = output_file_name[:-len('.txt')] + '.tsv' - - with open(msu_id_file) as msu_file, open(mapping_file, 'rb') as mapping, open(f'{output_dir}/{target_id}/{output_file_name}', 'w') as output: - mapping_dict = pickle.load(mapping) - - csv_reader = csv.reader(msu_file, delimiter='\t') - for line in csv_reader: - output_set = set() - for msu_id in line: - if 
len(mapping_dict[msu_id]) != 0: - output_set = output_set.union(mapping_dict[msu_id]) - - output.write('\t'.join(list(output_set))) - - if skip_no_matches and len(output_set) > 0: - output.write('\n') - elif not skip_no_matches: - output.write('\n') - - print(f'Generated {output_dir}/{target_id}/{output_file_name}') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'msu_id_file', help='text file containing the list of MSU accessions to be converted') - parser.add_argument( - 'mapping_file', help="pickled dictionary mapping the MSU accessions to the target IDs") - parser.add_argument( - 'output_dir', help='output directory for the file containing the equivalent IDs after conversion') - parser.add_argument( - 'target_id', help='either "entrez" or "transcript"') - parser.add_argument( - '--skip_no_matches', action='store_true', help = 'accessions that cannot be converted will be skipped' - ) - - args = parser.parse_args() - - convert_msu(args.msu_id_file, args.mapping_file, - args.output_dir, args.target_id, args.skip_no_matches) +import csv +import os +import pickle + + +def convert_msu(msu_id_file, mapping_file, output_dir, target_id, skip_no_matches): + if not os.path.exists(f'{output_dir}/{target_id}'): + os.makedirs(f'{output_dir}/{target_id}') + + output_file_name = msu_id_file.split('/')[-1] + # Change file extension to tsv + output_file_name = output_file_name[:-len('.txt')] + '.tsv' + + with open(msu_id_file) as msu_file, open(mapping_file, 'rb') as mapping, open(f'{output_dir}/{target_id}/{output_file_name}', 'w') as output: + mapping_dict = pickle.load(mapping) + + csv_reader = csv.reader(msu_file, delimiter='\t') + for line in csv_reader: + output_set = set() + for msu_id in line: + if len(mapping_dict[msu_id]) != 0: + output_set = output_set.union(mapping_dict[msu_id]) + + output.write('\t'.join(list(output_set))) + + if skip_no_matches and len(output_set) > 0: + output.write('\n') + elif not skip_no_matches: + output.write('\n') + + print(f'Generated {output_dir}/{target_id}/{output_file_name}') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'msu_id_file', help='text file containing the list of MSU accessions to be converted') + parser.add_argument( + 'mapping_file', help="pickled dictionary mapping the MSU accessions to the target IDs") + parser.add_argument( + 'output_dir', help='output directory for the file containing the equivalent IDs after conversion') + parser.add_argument( + 'target_id', help='either "entrez" or "transcript"') + parser.add_argument( + '--skip_no_matches', action='store_true', help = 'accessions that cannot be converted will be skipped' + ) + + args = parser.parse_args() + + convert_msu(args.msu_id_file, args.mapping_file, + args.output_dir, args.target_id, args.skip_no_matches) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway-dict.py b/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway-dict.py index bf57dc8c..ccc2a572 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway-dict.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway-dict.py @@ -1,46 +1,46 @@ -import csv -import os -import pickle - -from collections import defaultdict - - -def convert_geneset_to_dict(geneset_file): - pathway_dict = defaultdict(set) - - with open(geneset_file) as f: - csv_reader = csv.reader(f, delimiter='\t') - for 
line in csv_reader: - gene_components = line[0].split('.') - gene = f'{gene_components[1]}-{gene_components[2]}' - pathway = line[1][len('path:'):] - - pathway_dict[pathway].add(gene) - - return pathway_dict - - -def save_pathway_dict(pathway_dict, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(f'{output_dir}/kegg-dosa-geneset.pickle', 'wb') as handle: - pickle.dump(pathway_dict, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {output_dir}/kegg-dosa-geneset.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'geneset_file', help='text file mapping the genes to their respective KEGG pathways') - parser.add_argument( - 'output_dir', help='output directory for the pickled dictionary mapping KEGG pathways to their genes') - - args = parser.parse_args() - - pathway_dict = convert_geneset_to_dict(args.geneset_file) - save_pathway_dict(pathway_dict, args.output_dir) +import csv +import os +import pickle + +from collections import defaultdict + + +def convert_geneset_to_dict(geneset_file): + pathway_dict = defaultdict(set) + + with open(geneset_file) as f: + csv_reader = csv.reader(f, delimiter='\t') + for line in csv_reader: + gene_components = line[0].split('.') + gene = f'{gene_components[1]}-{gene_components[2]}' + pathway = line[1][len('path:'):] + + pathway_dict[pathway].add(gene) + + return pathway_dict + + +def save_pathway_dict(pathway_dict, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(f'{output_dir}/kegg-dosa-geneset.pickle', 'wb') as handle: + pickle.dump(pathway_dict, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {output_dir}/kegg-dosa-geneset.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'geneset_file', help='text file mapping the genes to their respective KEGG pathways') + parser.add_argument( + 'output_dir', help='output directory for the pickled dictionary mapping KEGG pathways to their genes') + + args = parser.parse_args() + + pathway_dict = convert_geneset_to_dict(args.geneset_file) + save_pathway_dict(pathway_dict, args.output_dir) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway.r b/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway.r index 64baff90..2c08086f 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/get-genes-in-pathway.r @@ -1,25 +1,25 @@ -library(KEGGREST) -library(optparse) - -option_list <- list( - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the text file containing the genes in each KEGG pathway for the organism 'dosa'" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -genes <- keggLink("pathway", "dosa") -genes_df <- t(data.frame(as.list(genes))) - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -write.table(genes_df, - paste0(opt$output_dir, "/kegg-dosa-geneset.tsv"), - sep = "\t", row.names = TRUE, col.names = FALSE, quote = FALSE) - +library(KEGGREST) +library(optparse) + +option_list <- list( + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the text file containing the genes in each KEGG pathway for the organism 'dosa'" 
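
A note on convert_geneset_to_dict above: it splits the first column on '.' rather than ':' because the KEGGREST step in this R script stores the KEGG gene names as R data-frame row names, and R's make.names() appears to rewrite an ID like "dosa:Os01t0100100-01" into "dosa.Os01t0100100.01" before write.table() emits it. The following minimal sketch traces that parsing on one hypothetical row (the gene and pathway IDs are invented for illustration):

from collections import defaultdict

pathway_dict = defaultdict(set)
row = ['dosa.Os01t0100100.01', 'path:dosa00010']     # hypothetical TSV row

gene_components = row[0].split('.')                  # ['dosa', 'Os01t0100100', '01']
gene = f'{gene_components[1]}-{gene_components[2]}'  # restore 'Os01t0100100-01'
pathway = row[1][len('path:'):]                      # strip the 'path:' prefix
pathway_dict[pathway].add(gene)

assert pathway_dict == {'dosa00010': {'Os01t0100100-01'}}
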
+ ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +genes <- keggLink("pathway", "dosa") +genes_df <- t(data.frame(as.list(genes))) + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} + +write.table(genes_df, + paste0(opt$output_dir, "/kegg-dosa-geneset.tsv"), + sep = "\t", row.names = TRUE, col.names = FALSE, quote = FALSE) + print(paste0("Generated ", opt$output_dir, "/kegg-dosa-geneset.tsv")) \ No newline at end of file diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-entrez-id.py b/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-entrez-id.py index f8b7e409..a4d8f5ae 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-entrez-id.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-entrez-id.py @@ -1,45 +1,45 @@ -import csv -import os -import pickle -from collections import defaultdict - - -def load_msu_to_entrez(msu_to_entrez_dict, id_file): - with open(id_file) as f: - csv_reader = csv.reader(f, delimiter=',') - next(csv_reader) # Skip header - for line in csv_reader: - msu = line[-1] - entrez = line[1] - - if msu != '-': - msu_to_entrez_dict[msu].add(entrez) - - print("Finished mapping MSU accessions to Entrez IDs") - - -def save_msu_entrez_mapping(msu_to_entrez_dict, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(f'{output_dir}/msu-to-entrez-id.pickle', 'wb') as handle: - pickle.dump(msu_to_entrez_dict, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {output_dir}/msu-to-entrez-id.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument( - 'msu_to_entrez_file', help='text file mapping MSU accessions to Entrez IDs') - parser.add_argument( - 'output_dir', help='output directory for the pickled dictionary mapping MSU accessions to their respective Entrez IDs') - - args = parser.parse_args() - - msu_to_entrez_dict = defaultdict(set) - load_msu_to_entrez(msu_to_entrez_dict, args.msu_to_entrez_file) - save_msu_entrez_mapping(msu_to_entrez_dict, args.output_dir) +import csv +import os +import pickle +from collections import defaultdict + + +def load_msu_to_entrez(msu_to_entrez_dict, id_file): + with open(id_file) as f: + csv_reader = csv.reader(f, delimiter=',') + next(csv_reader) # Skip header + for line in csv_reader: + msu = line[-1] + entrez = line[1] + + if msu != '-': + msu_to_entrez_dict[msu].add(entrez) + + print("Finished mapping MSU accessions to Entrez IDs") + + +def save_msu_entrez_mapping(msu_to_entrez_dict, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(f'{output_dir}/msu-to-entrez-id.pickle', 'wb') as handle: + pickle.dump(msu_to_entrez_dict, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {output_dir}/msu-to-entrez-id.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument( + 'msu_to_entrez_file', help='text file mapping MSU accessions to Entrez IDs') + parser.add_argument( + 'output_dir', help='output directory for the pickled dictionary mapping MSU accessions to their respective Entrez IDs') + + args = parser.parse_args() + + msu_to_entrez_dict = defaultdict(set) + load_msu_to_entrez(msu_to_entrez_dict, args.msu_to_entrez_file) + save_msu_entrez_mapping(msu_to_entrez_dict, args.output_dir) diff --git 
a/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-transcript-id.py b/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-transcript-id.py index 51e5ea08..6f5e8722 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-transcript-id.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/msu-to-transcript-id.py @@ -1,102 +1,102 @@ -import os -import pickle -from collections import defaultdict - - -def load_msu_to_transcript(msu_to_transcript_dict, id_file): - with open(id_file) as f: - for line in f: - line = line.rstrip() - - # MSU ID - if line[0] == 'L': - msu_id = line - else: - msu_to_transcript_dict[msu_id].add(line) - - -def load_msu_to_rap(msu_to_rap_dict, rap_to_msu_file): - with open(rap_to_msu_file) as rap_to_msu: - for line in rap_to_msu: - line = line.rstrip() - - rap, msu = line.split('\t') - msu = msu.split(',') - - for id in msu: - # Remove ".1" in "LOC_Os01g01019.1" - id_components = id.split('.') - id = id_components[0] - - if id != "None" and rap != "None": - msu_to_rap_dict[id].add(rap) - - -def load_rap_to_transcript(rap_to_transcript_dict, rap_to_transcipt_file): - with open(rap_to_transcipt_file) as rap_to_transcript: - for line in rap_to_transcript: - line = line.split('\t') - if line[0] != 'Transcript_ID': # Ignore header row - rap_to_transcript_dict[line[1]].add(line[0]) - - -def map_using_rb_dp(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, gene): - transcript_ids = set() - - for rap_id in msu_to_rap_dict[gene]: - transcript_ids = transcript_ids.union(rap_to_transcript_dict[rap_id]) - - msu_to_transcript_dict[gene] = msu_to_transcript_dict[gene].union( - transcript_ids) - - -def map_no_transcript_id(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, no_transcript_id_file): - with open(no_transcript_id_file) as no_transcript_id: - for gene in no_transcript_id: - gene = gene.rstrip() - map_using_rb_dp(msu_to_transcript_dict, - msu_to_rap_dict, rap_to_transcript_dict, gene) - - print("Finished mapping MSU IDs to KEGG transcript IDs") - - -def save_msu_transcript_mapping(msu_to_transcript_dict, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(f'{output_dir}/msu-to-transcript-id.pickle', 'wb') as handle: - pickle.dump(msu_to_transcript_dict, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {output_dir}/msu-to-transcript-id.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'riceidconverter_msu_to_transcript_id_file', help='text file containing MSU accessions and their respective KEGG transcript IDs as mapped via riceidcoverter') - parser.add_argument( - 'riceidconverter_no_transcript_id_file', help='text file containing the list of MSU accessions that cannot be mapped by riceidcoverter to KEGG transcript IDs') - parser.add_argument( - 'rap_to_msu_file', help='text file mapping RAP to MSU accessions') - parser.add_argument( - 'rap_to_transcript_file', help='text file mapping RAP accessions to KEGG transcript IDs') - parser.add_argument( - 'output_dir', help='output directory for the pickled dictionary mapping MSU accessions to their respective KEGG transcript IDs') - - args = parser.parse_args() - - msu_to_transcript_dict = defaultdict(set) - msu_to_rap_dict = defaultdict(set) - rap_to_transcript_dict = defaultdict(set) - - load_msu_to_transcript(msu_to_transcript_dict, - args.riceidconverter_msu_to_transcript_id_file) - load_msu_to_rap(msu_to_rap_dict, 
args.rap_to_msu_file) - load_rap_to_transcript( - rap_to_transcript_dict, args.rap_to_transcript_file) - - map_no_transcript_id(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, - args.riceidconverter_no_transcript_id_file) - save_msu_transcript_mapping(msu_to_transcript_dict, args.output_dir) +import os +import pickle +from collections import defaultdict + + +def load_msu_to_transcript(msu_to_transcript_dict, id_file): + with open(id_file) as f: + for line in f: + line = line.rstrip() + + # MSU ID + if line[0] == 'L': + msu_id = line + else: + msu_to_transcript_dict[msu_id].add(line) + + +def load_msu_to_rap(msu_to_rap_dict, rap_to_msu_file): + with open(rap_to_msu_file) as rap_to_msu: + for line in rap_to_msu: + line = line.rstrip() + + rap, msu = line.split('\t') + msu = msu.split(',') + + for id in msu: + # Remove ".1" in "LOC_Os01g01019.1" + id_components = id.split('.') + id = id_components[0] + + if id != "None" and rap != "None": + msu_to_rap_dict[id].add(rap) + + +def load_rap_to_transcript(rap_to_transcript_dict, rap_to_transcipt_file): + with open(rap_to_transcipt_file) as rap_to_transcript: + for line in rap_to_transcript: + line = line.split('\t') + if line[0] != 'Transcript_ID': # Ignore header row + rap_to_transcript_dict[line[1]].add(line[0]) + + +def map_using_rb_dp(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, gene): + transcript_ids = set() + + for rap_id in msu_to_rap_dict[gene]: + transcript_ids = transcript_ids.union(rap_to_transcript_dict[rap_id]) + + msu_to_transcript_dict[gene] = msu_to_transcript_dict[gene].union( + transcript_ids) + + +def map_no_transcript_id(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, no_transcript_id_file): + with open(no_transcript_id_file) as no_transcript_id: + for gene in no_transcript_id: + gene = gene.rstrip() + map_using_rb_dp(msu_to_transcript_dict, + msu_to_rap_dict, rap_to_transcript_dict, gene) + + print("Finished mapping MSU IDs to KEGG transcript IDs") + + +def save_msu_transcript_mapping(msu_to_transcript_dict, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(f'{output_dir}/msu-to-transcript-id.pickle', 'wb') as handle: + pickle.dump(msu_to_transcript_dict, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {output_dir}/msu-to-transcript-id.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'riceidconverter_msu_to_transcript_id_file', help='text file containing MSU accessions and their respective KEGG transcript IDs as mapped via riceidcoverter') + parser.add_argument( + 'riceidconverter_no_transcript_id_file', help='text file containing the list of MSU accessions that cannot be mapped by riceidcoverter to KEGG transcript IDs') + parser.add_argument( + 'rap_to_msu_file', help='text file mapping RAP to MSU accessions') + parser.add_argument( + 'rap_to_transcript_file', help='text file mapping RAP accessions to KEGG transcript IDs') + parser.add_argument( + 'output_dir', help='output directory for the pickled dictionary mapping MSU accessions to their respective KEGG transcript IDs') + + args = parser.parse_args() + + msu_to_transcript_dict = defaultdict(set) + msu_to_rap_dict = defaultdict(set) + rap_to_transcript_dict = defaultdict(set) + + load_msu_to_transcript(msu_to_transcript_dict, + args.riceidconverter_msu_to_transcript_id_file) + load_msu_to_rap(msu_to_rap_dict, args.rap_to_msu_file) + load_rap_to_transcript( + rap_to_transcript_dict, 
args.rap_to_transcript_file) + + map_no_transcript_id(msu_to_transcript_dict, msu_to_rap_dict, rap_to_transcript_dict, + args.riceidconverter_no_transcript_id_file) + save_msu_transcript_mapping(msu_to_transcript_dict, args.output_dir) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/ricegeneid-msu-to-transcript-id.r b/prepare_data/workflow/scripts/enrichment_analysis/util/ricegeneid-msu-to-transcript-id.r index c0a7e2d8..47ea90e0 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/ricegeneid-msu-to-transcript-id.r +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/ricegeneid-msu-to-transcript-id.r @@ -1,50 +1,50 @@ -library(data.table) -library(riceidconverter) -library(optparse) - -option_list <- list( - make_option(c("-g", "--msu_genes"), - type = "character", default = NULL, - help = "text file containing the input genes (MSU ID)" - ), - make_option(c("-o", "--output_dir"), - type = "character", default = NULL, - help = "output directory for the text file containing the equivalent KEGG transcript IDs and the text file containing the genes without equivalent KEGG transcript IDs" - ) -) - -opt_parser <- OptionParser(option_list = option_list) -opt <- parse_args(opt_parser) - -all_genes <- read.table(opt$msu_genes) -all_genes <- all_genes[[1]] - -all_transcript <- c() -na_all_transcript <- c() - -for (gene in all_genes) { - transcript_id <- RiceIDConvert(gene, fromType = "MSU", toType = "TRANSCRIPTID") - transcript_id_list <- c() - transcript_id_list <- append(transcript_id_list, gene) - - for (id in transcript_id$TRANSCRIPTID) { - if (is.na(id)) { - print(gene) - na_all_transcript <- append(na_all_transcript, gene) - } else { - transcript_id_list <- append(transcript_id_list, id) - } - } - - all_transcript <- append(all_transcript, list(transcript_id_list)) -} - -if (!dir.exists(opt$output_dir)) { - dir.create(opt$output_dir, recursive = TRUE) -} - -lapply(na_all_transcript, write, paste0(opt$output_dir, "/all-na-transcript-id.txt"), append = TRUE, sep = "\n") -print(paste0("Generated ", opt$output_dir, "/all-na-transcript-id.txt")) - -lapply(all_transcript, write, paste0(opt$output_dir, "/all-transcript-id.txt"), append = TRUE, sep = "\n") -print(paste0("Generated ", opt$output_dir, "/all-transcript-id.txt")) +library(data.table) +library(riceidconverter) +library(optparse) + +option_list <- list( + make_option(c("-g", "--msu_genes"), + type = "character", default = NULL, + help = "text file containing the input genes (MSU ID)" + ), + make_option(c("-o", "--output_dir"), + type = "character", default = NULL, + help = "output directory for the text file containing the equivalent KEGG transcript IDs and the text file containing the genes without equivalent KEGG transcript IDs" + ) +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +all_genes <- read.table(opt$msu_genes) +all_genes <- all_genes[[1]] + +all_transcript <- c() +na_all_transcript <- c() + +for (gene in all_genes) { + transcript_id <- RiceIDConvert(gene, fromType = "MSU", toType = "TRANSCRIPTID") + transcript_id_list <- c() + transcript_id_list <- append(transcript_id_list, gene) + + for (id in transcript_id$TRANSCRIPTID) { + if (is.na(id)) { + print(gene) + na_all_transcript <- append(na_all_transcript, gene) + } else { + transcript_id_list <- append(transcript_id_list, id) + } + } + + all_transcript <- append(all_transcript, list(transcript_id_list)) +} + +if (!dir.exists(opt$output_dir)) { + dir.create(opt$output_dir, recursive = TRUE) +} 
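
The two lapply(..., write, ..., sep = "\n") calls below emit each gene's record as a block of lines: the MSU accession first, then one KEGG transcript ID per line. That interleaved layout is exactly what load_msu_to_transcript() in msu-to-transcript-id.py above relies on when it treats any line starting with 'L' (i.e. 'LOC_...') as the start of a new MSU accession. A small sketch of that consumer, with hypothetical IDs standing in for real accessions:

from collections import defaultdict

# Hypothetical excerpt of the all-transcript-id.txt file written by the R script
sample = ['LOC_Os01g01010', 'Os01t0100100-01', 'Os01t0100100-02']

msu_to_transcript = defaultdict(set)
for line in sample:
    if line[0] == 'L':                       # MSU accessions start with 'LOC_'
        msu_id = line                        # start a new block
    else:
        msu_to_transcript[msu_id].add(line)  # transcript IDs of that accession

assert msu_to_transcript == {'LOC_Os01g01010': {'Os01t0100100-01', 'Os01t0100100-02'}}
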
+ +lapply(na_all_transcript, write, paste0(opt$output_dir, "/all-na-transcript-id.txt"), append = TRUE, sep = "\n") +print(paste0("Generated ", opt$output_dir, "/all-na-transcript-id.txt")) + +lapply(all_transcript, write, paste0(opt$output_dir, "/all-transcript-id.txt"), append = TRUE, sep = "\n") +print(paste0("Generated ", opt$output_dir, "/all-transcript-id.txt")) diff --git a/prepare_data/workflow/scripts/enrichment_analysis/util/transcript-to-msu-id.py b/prepare_data/workflow/scripts/enrichment_analysis/util/transcript-to-msu-id.py index 5ece34b2..97543be9 100644 --- a/prepare_data/workflow/scripts/enrichment_analysis/util/transcript-to-msu-id.py +++ b/prepare_data/workflow/scripts/enrichment_analysis/util/transcript-to-msu-id.py @@ -1,41 +1,41 @@ -import pickle -import os -from collections import defaultdict - - -def convert_transcript_to_msu(msu_to_transcript_dict): - transcript_to_msu = defaultdict(set) - with open(msu_to_transcript_dict, 'rb') as f: - msu_to_transcript = pickle.load(f) - - for msu, transcript_ids in msu_to_transcript.items(): - for transcript in transcript_ids: - transcript_to_msu[transcript].add(msu) - - return transcript_to_msu - - -def save_transcript_msu_mapping(transcript_to_msu, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(f'{output_dir}/transcript-to-msu-id.pickle', 'wb') as handle: - pickle.dump(transcript_to_msu, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {output_dir}/transcript-to-msu-id.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'msu_to_transcript_dict', help='pickled dictionary mapping MSU accessions to their respective KEGG transcript IDs') - parser.add_argument( - 'output_dir', help='output directory for the pickled dictionary mapping KEGG transcript IDs to their respective MSU accessions') - - args = parser.parse_args() - - transcript_to_msu = convert_transcript_to_msu(args.msu_to_transcript_dict) - - save_transcript_msu_mapping(transcript_to_msu, args.output_dir) +import pickle +import os +from collections import defaultdict + + +def convert_transcript_to_msu(msu_to_transcript_dict): + transcript_to_msu = defaultdict(set) + with open(msu_to_transcript_dict, 'rb') as f: + msu_to_transcript = pickle.load(f) + + for msu, transcript_ids in msu_to_transcript.items(): + for transcript in transcript_ids: + transcript_to_msu[transcript].add(msu) + + return transcript_to_msu + + +def save_transcript_msu_mapping(transcript_to_msu, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(f'{output_dir}/transcript-to-msu-id.pickle', 'wb') as handle: + pickle.dump(transcript_to_msu, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {output_dir}/transcript-to-msu-id.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'msu_to_transcript_dict', help='pickled dictionary mapping MSU accessions to their respective KEGG transcript IDs') + parser.add_argument( + 'output_dir', help='output directory for the pickled dictionary mapping KEGG transcript IDs to their respective MSU accessions') + + args = parser.parse_args() + + transcript_to_msu = convert_transcript_to_msu(args.msu_to_transcript_dict) + + save_transcript_msu_mapping(transcript_to_msu, args.output_dir) diff --git a/prepare_data/workflow/scripts/gene_description/prepare_df_rgi_gene_description.py 
b/prepare_data/workflow/scripts/gene_description/prepare_df_rgi_gene_description.py index e0445aaa..2f3cebd5 100644 --- a/prepare_data/workflow/scripts/gene_description/prepare_df_rgi_gene_description.py +++ b/prepare_data/workflow/scripts/gene_description/prepare_df_rgi_gene_description.py @@ -1,28 +1,28 @@ -import pandas as pd - - -def main(annotation_file, out_csv): - - gene_description_df = pd.DataFrame( - columns=('Gene_ID', 'Description', 'UniProtKB/Swiss-Prot')) - - with open(annotation_file, 'r') as f: - for i, line in enumerate(f): - gene_ID, remaining = line.rstrip().split("\t") - description, UniProt_ID = remaining.split( - "[UniProtKB/Swiss-Prot:") - description = description.rstrip() - UniProt_ID = UniProt_ID.rstrip("]") - gene_description_df.loc[i] = [gene_ID, description, UniProt_ID] - - gene_description_df.to_csv(out_csv, index=False) - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('annotation_file', help='ann file from RGI') - parser.add_argument('out_csv', help='path to output csv file') - - args = parser.parse_args() - main(args.annotation_file, args.out_csv) +import pandas as pd + + +def main(annotation_file, out_csv): + + gene_description_df = pd.DataFrame( + columns=('Gene_ID', 'Description', 'UniProtKB/Swiss-Prot')) + + with open(annotation_file, 'r') as f: + for i, line in enumerate(f): + gene_ID, remaining = line.rstrip().split("\t") + description, UniProt_ID = remaining.split( + "[UniProtKB/Swiss-Prot:") + description = description.rstrip() + UniProt_ID = UniProt_ID.rstrip("]") + gene_description_df.loc[i] = [gene_ID, description, UniProt_ID] + + gene_description_df.to_csv(out_csv, index=False) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('annotation_file', help='ann file from RGI') + parser.add_argument('out_csv', help='path to output csv file') + + args = parser.parse_args() + main(args.annotation_file, args.out_csv) diff --git a/prepare_data/workflow/scripts/get_promoter_sequences.py b/prepare_data/workflow/scripts/get_promoter_sequences.py index 6ae81506..b877d123 100644 --- a/prepare_data/workflow/scripts/get_promoter_sequences.py +++ b/prepare_data/workflow/scripts/get_promoter_sequences.py @@ -1,48 +1,48 @@ -import gffutils -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord -from Bio.Seq import Seq - -def main(genome,gff_db,upstream_win_len,downstream_win_len,out_fasta,promoter_gene_map_f): - records = SeqIO.to_dict(SeqIO.parse(genome,"fasta")) - db = gffutils.FeatureDB(gff_db,keep_order=True) - genes = list(db.features_of_type('gene')) - promoters = [] - promoter_gene_map = open(args.promoter_gene_map,"a") - for gene in genes: - if gene.strand == '+': - promoter_seq_up = str(records[gene.seqid].seq[gene.start-1-upstream_win_len:gene.start-1]) - promoter_seq_dn = str(records[gene.seqid].seq[gene.start-1:gene.start-1+downstream_win_len]) - promoter_seq = promoter_seq_up+promoter_seq_dn - promoter_seq_id = gene.seqid+":"+str(gene.start-1-upstream_win_len)+"-"+str(gene.start-1+downstream_win_len) - promoter_seq_rec = SeqRecord(Seq(promoter_seq),id=promoter_seq_id,description='') - - elif gene.strand == "-": - promoter_seq_up = str(records[gene.seqid].seq[gene.end:gene.end+upstream_win_len]) - promoter_seq_dn = str(records[gene.seqid].seq[gene.end-1-downstream_win_len:gene.end]) - promoter_seq = promoter_seq_dn + promoter_seq_up - promoter_seq_id = 
gene.seqid+":"+str(gene.start-1-upstream_win_len)+"-"+str(gene.start-1+downstream_win_len) - promoter_seq_rec = SeqRecord(Seq(promoter_seq),id=promoter_seq_id,description='') - promoters.append(promoter_seq_rec) - - promoters.append(promoter_seq_rec) - promoter_gene_map.write(promoter_seq_id+"\t"+gene.attributes["ID"][0]+"\n") - - # promoter_seq = str(records[gene.seqid].seq[gene.start - upstream_win_len:gene.start - 1]) - - SeqIO.write(promoters,out_fasta,"fasta") - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('genome',help='genome sequence') - parser.add_argument('gff_db',help='path to gff db') - parser.add_argument('upstream_win_len',type=int,help='how many bps upstream of TSS do we search for motifs') - parser.add_argument('downstream_win_len', type=int,help='how many bps downstream of TSS do we search for motifs') - parser.add_argument('out_fasta',help='path to output fasta file') - parser.add_argument('promoter_gene_map',help='path to file containing promoter-to-gene map') - - args= parser.parse_args() - main(args.genome, args.gff_db,args.upstream_win_len, args.downstream_win_len, +import gffutils +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq + +def main(genome,gff_db,upstream_win_len,downstream_win_len,out_fasta,promoter_gene_map_f): + records = SeqIO.to_dict(SeqIO.parse(genome,"fasta")) + db = gffutils.FeatureDB(gff_db,keep_order=True) + genes = list(db.features_of_type('gene')) + promoters = [] + promoter_gene_map = open(args.promoter_gene_map,"a") + for gene in genes: + if gene.strand == '+': + promoter_seq_up = str(records[gene.seqid].seq[gene.start-1-upstream_win_len:gene.start-1]) + promoter_seq_dn = str(records[gene.seqid].seq[gene.start-1:gene.start-1+downstream_win_len]) + promoter_seq = promoter_seq_up+promoter_seq_dn + promoter_seq_id = gene.seqid+":"+str(gene.start-1-upstream_win_len)+"-"+str(gene.start-1+downstream_win_len) + promoter_seq_rec = SeqRecord(Seq(promoter_seq),id=promoter_seq_id,description='') + + elif gene.strand == "-": + promoter_seq_up = str(records[gene.seqid].seq[gene.end:gene.end+upstream_win_len]) + promoter_seq_dn = str(records[gene.seqid].seq[gene.end-1-downstream_win_len:gene.end]) + promoter_seq = promoter_seq_dn + promoter_seq_up + promoter_seq_id = gene.seqid+":"+str(gene.start-1-upstream_win_len)+"-"+str(gene.start-1+downstream_win_len) + promoter_seq_rec = SeqRecord(Seq(promoter_seq),id=promoter_seq_id,description='') + promoters.append(promoter_seq_rec) + + promoters.append(promoter_seq_rec) + promoter_gene_map.write(promoter_seq_id+"\t"+gene.attributes["ID"][0]+"\n") + + # promoter_seq = str(records[gene.seqid].seq[gene.start - upstream_win_len:gene.start - 1]) + + SeqIO.write(promoters,out_fasta,"fasta") + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('genome',help='genome sequence') + parser.add_argument('gff_db',help='path to gff db') + parser.add_argument('upstream_win_len',type=int,help='how many bps upstream of TSS do we search for motifs') + parser.add_argument('downstream_win_len', type=int,help='how many bps downstream of TSS do we search for motifs') + parser.add_argument('out_fasta',help='path to output fasta file') + parser.add_argument('promoter_gene_map',help='path to file containing promoter-to-gene map') + + args= parser.parse_args() + main(args.genome, args.gff_db,args.upstream_win_len, args.downstream_win_len, args.out_fasta, args.promoter_gene_map) \ No newline 
at end of file diff --git a/prepare_data/workflow/scripts/gff_db.py b/prepare_data/workflow/scripts/gff_db.py index cc09fda1..5cf51a5e 100644 --- a/prepare_data/workflow/scripts/gff_db.py +++ b/prepare_data/workflow/scripts/gff_db.py @@ -1,13 +1,13 @@ -import gffutils - -def main(input_gff,output_gff_db): - db = gffutils.create_db(input_gff, output_gff_db,merge_strategy="warning") - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('input_gff',help='input GFF file') - parser.add_argument('output_gff_db',help='output GFF index file') - - args= parser.parse_args() +import gffutils + +def main(input_gff,output_gff_db): + db = gffutils.create_db(input_gff, output_gff_db,merge_strategy="warning") + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('input_gff',help='input GFF file') + parser.add_argument('output_gff_db',help='output GFF index file') + + args= parser.parse_args() main(args.input_gff, args.output_gff_db) \ No newline at end of file diff --git a/prepare_data/workflow/scripts/module_detection/detect-modules-via-coach.py b/prepare_data/workflow/scripts/module_detection/detect-modules-via-coach.py index 5967a060..4b0ce983 100644 --- a/prepare_data/workflow/scripts/module_detection/detect-modules-via-coach.py +++ b/prepare_data/workflow/scripts/module_detection/detect-modules-via-coach.py @@ -1,43 +1,43 @@ -import os - -import networkx as nx -from cdlib import algorithms, readwrite - - -def detect_modules(edge_list_file, module_list_dir, density_threshold, affinity_threshold, closeness_threshold): - G = None - with open(edge_list_file, 'r') as f: - G = nx.read_edgelist(f) - G = nx.convert_node_labels_to_integers(G, ordering="sorted") - - coms = algorithms.coach( - G, density_threshold=density_threshold, affinity_threshold=affinity_threshold, closeness_threshold=closeness_threshold) - - if not os.path.exists(module_list_dir): - os.makedirs(module_list_dir) - - readwrite.write_community_csv( - coms, f'{module_list_dir}/coach-int-module-list-{int(affinity_threshold * 1000)}.csv', '\t') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'edge_list_file', help='text file corresponding to the edge list') - parser.add_argument( - 'module_list_dir', help='output directory for the module list') - parser.add_argument( - '-density_threshold', type=float, required=False, default=0.7, help='minimum core density (default = 0.7)' - ) - parser.add_argument( - '-affinity_threshold', type=float, required=False, default=0.225, help='maximum core affinity (default = 0.225)' - ) - parser.add_argument( - '-closeness_threshold', type=float, required=False, default=0.5, help='minimum neighbor closeness (default = 0.5)' - ) - - args = parser.parse_args() - - detect_modules(args.edge_list_file, args.module_list_dir, - args.density_threshold, args.affinity_threshold, args.closeness_threshold) +import os + +import networkx as nx +from cdlib import algorithms, readwrite + + +def detect_modules(edge_list_file, module_list_dir, density_threshold, affinity_threshold, closeness_threshold): + G = None + with open(edge_list_file, 'r') as f: + G = nx.read_edgelist(f) + G = nx.convert_node_labels_to_integers(G, ordering="sorted") + + coms = algorithms.coach( + G, density_threshold=density_threshold, affinity_threshold=affinity_threshold, closeness_threshold=closeness_threshold) + + if not os.path.exists(module_list_dir): + os.makedirs(module_list_dir) + 
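
Both module detectors (this COACH script and the DEMON one that follows) first convert the edge list, whose node labels networkx reads as strings, into an integer-labeled graph via ordering="sorted". The companion script generate-mapping-from-networkx-int-edge-graph.py further below recovers the original labels by enumerating sorted(G.nodes()), which stays consistent only because both sides sort the same lexicographic way. A toy sketch of that contract (the three-edge graph is invented for illustration):

import io

import networkx as nx

edge_list = io.StringIO('0 1\n1 10\n10 2\n')  # toy integer edge list, read as strings
G = nx.read_edgelist(edge_list)               # nodes: '0', '1', '10', '2'
H = nx.convert_node_labels_to_integers(G, ordering='sorted')
assert sorted(H.nodes()) == [0, 1, 2, 3]

# Lexicographic order: '0' < '1' < '10' < '2'; enumerate(sorted(...)) therefore
# reproduces exactly the labels that convert_node_labels_to_integers assigned.
mapping = {i: node for i, node in enumerate(sorted(G.nodes()))}
assert mapping == {0: '0', 1: '1', 2: '10', 3: '2'}
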
+ readwrite.write_community_csv( + coms, f'{module_list_dir}/coach-int-module-list-{int(affinity_threshold * 1000)}.csv', '\t') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'edge_list_file', help='text file corresponding to the edge list') + parser.add_argument( + 'module_list_dir', help='output directory for the module list') + parser.add_argument( + '-density_threshold', type=float, required=False, default=0.7, help='minimum core density (default = 0.7)' + ) + parser.add_argument( + '-affinity_threshold', type=float, required=False, default=0.225, help='maximum core affinity (default = 0.225)' + ) + parser.add_argument( + '-closeness_threshold', type=float, required=False, default=0.5, help='minimum neighbor closeness (default = 0.5)' + ) + + args = parser.parse_args() + + detect_modules(args.edge_list_file, args.module_list_dir, + args.density_threshold, args.affinity_threshold, args.closeness_threshold) diff --git a/prepare_data/workflow/scripts/module_detection/detect-modules-via-demon.py b/prepare_data/workflow/scripts/module_detection/detect-modules-via-demon.py index b58e0e86..da897a9a 100644 --- a/prepare_data/workflow/scripts/module_detection/detect-modules-via-demon.py +++ b/prepare_data/workflow/scripts/module_detection/detect-modules-via-demon.py @@ -1,39 +1,39 @@ -import os - -import networkx as nx -from cdlib import algorithms, readwrite - - -def detect_modules(edge_list_file, module_list_dir, epsilon, min_com_size): - G = None - with open(edge_list_file, 'r') as f: - G = nx.read_edgelist(f) - G = nx.convert_node_labels_to_integers(G, ordering="sorted") - - coms = algorithms.demon(G, epsilon=epsilon, min_com_size=min_com_size) - - if not os.path.exists(module_list_dir): - os.makedirs(module_list_dir) - - readwrite.write_community_csv( - coms, f'{module_list_dir}/demon-int-module-list-{int(epsilon * 100)}.csv', '\t') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'edge_list_file', help='text file corresponding to the edge list') - parser.add_argument( - 'module_list_dir', help='output directory for the module list') - parser.add_argument( - '-epsilon', type=float, required=False, default=0.25, help='merging threshold (default = 0.25)' - ) - parser.add_argument( - '-min_com_size', type=int, required=False, default=3, help='minimum size of a module (default = 3)' - ) - - args = parser.parse_args() - - detect_modules(args.edge_list_file, - args.module_list_dir, args.epsilon, args.min_com_size) +import os + +import networkx as nx +from cdlib import algorithms, readwrite + + +def detect_modules(edge_list_file, module_list_dir, epsilon, min_com_size): + G = None + with open(edge_list_file, 'r') as f: + G = nx.read_edgelist(f) + G = nx.convert_node_labels_to_integers(G, ordering="sorted") + + coms = algorithms.demon(G, epsilon=epsilon, min_com_size=min_com_size) + + if not os.path.exists(module_list_dir): + os.makedirs(module_list_dir) + + readwrite.write_community_csv( + coms, f'{module_list_dir}/demon-int-module-list-{int(epsilon * 100)}.csv', '\t') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'edge_list_file', help='text file corresponding to the edge list') + parser.add_argument( + 'module_list_dir', help='output directory for the module list') + parser.add_argument( + '-epsilon', type=float, required=False, default=0.25, help='merging threshold (default = 0.25)' + ) + 
parser.add_argument( + '-min_com_size', type=int, required=False, default=3, help='minimum size of a module (default = 3)' + ) + + args = parser.parse_args() + + detect_modules(args.edge_list_file, + args.module_list_dir, args.epsilon, args.min_com_size) diff --git a/prepare_data/workflow/scripts/module_util/generate-mapping-from-networkx-int-edge-graph.py b/prepare_data/workflow/scripts/module_util/generate-mapping-from-networkx-int-edge-graph.py index fe73fb5a..26637d89 100644 --- a/prepare_data/workflow/scripts/module_util/generate-mapping-from-networkx-int-edge-graph.py +++ b/prepare_data/workflow/scripts/module_util/generate-mapping-from-networkx-int-edge-graph.py @@ -1,50 +1,50 @@ - -import os -import pickle - -import networkx as nx - - -def generate_mapping(mapping, edge_list_file, int_edge_list_node_mapping_file): - G = None - with open(edge_list_file, 'r') as edge_list, open(int_edge_list_node_mapping_file, 'rb') as int_edge_list_node_map: - G = nx.read_edgelist(edge_list) - int_edge_list_node_mapping = pickle.load(int_edge_list_node_map) - - sorted_nodes = sorted(G.nodes()) - - for i, node in enumerate(sorted_nodes): - mapping[i] = int_edge_list_node_mapping[int(node)] - - print("Finished mapping integer node labels to their (original) string node labels") - - -def save_node_mapping(mapping, node_mapping_dir): - if not os.path.exists(node_mapping_dir): - os.makedirs(node_mapping_dir) - - with open(f'{node_mapping_dir}/networkx-node-mapping.pickle', 'wb') as handle: - pickle.dump(mapping, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {node_mapping_dir}/networkx-node-mapping.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'edge_list_file', help='text file corresponding to the edge list with the node labels converted to integers') - parser.add_argument( - 'int_edge_list_node_mapping_file', help='pickled dictionary mapping the integer node labels to the (original) string labels' - ) - parser.add_argument( - 'output_dir', help='output directory for the pickled dictionary mapping the node labels in the networkx integer-indexed graph to their (original) string labels' - ) - - args = parser.parse_args() - - mapping = {} - generate_mapping(mapping, args.edge_list_file, - args.int_edge_list_node_mapping_file) - save_node_mapping(mapping, args.output_dir) + +import os +import pickle + +import networkx as nx + + +def generate_mapping(mapping, edge_list_file, int_edge_list_node_mapping_file): + G = None + with open(edge_list_file, 'r') as edge_list, open(int_edge_list_node_mapping_file, 'rb') as int_edge_list_node_map: + G = nx.read_edgelist(edge_list) + int_edge_list_node_mapping = pickle.load(int_edge_list_node_map) + + sorted_nodes = sorted(G.nodes()) + + for i, node in enumerate(sorted_nodes): + mapping[i] = int_edge_list_node_mapping[int(node)] + + print("Finished mapping integer node labels to their (original) string node labels") + + +def save_node_mapping(mapping, node_mapping_dir): + if not os.path.exists(node_mapping_dir): + os.makedirs(node_mapping_dir) + + with open(f'{node_mapping_dir}/networkx-node-mapping.pickle', 'wb') as handle: + pickle.dump(mapping, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {node_mapping_dir}/networkx-node-mapping.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'edge_list_file', help='text file corresponding to the edge list with the node labels converted to integers') + 
parser.add_argument( + 'int_edge_list_node_mapping_file', help='pickled dictionary mapping the integer node labels to the (original) string labels' + ) + parser.add_argument( + 'output_dir', help='output directory for the pickled dictionary mapping the node labels in the networkx integer-indexed graph to their (original) string labels' + ) + + args = parser.parse_args() + + mapping = {} + generate_mapping(mapping, args.edge_list_file, + args.int_edge_list_node_mapping_file) + save_node_mapping(mapping, args.output_dir) diff --git a/prepare_data/workflow/scripts/module_util/get-modules-from-clusterone-results.py b/prepare_data/workflow/scripts/module_util/get-modules-from-clusterone-results.py index 2bdbe4e4..b47cd20e 100644 --- a/prepare_data/workflow/scripts/module_util/get-modules-from-clusterone-results.py +++ b/prepare_data/workflow/scripts/module_util/get-modules-from-clusterone-results.py @@ -1,50 +1,50 @@ -import csv -import os -import sys -import codecs - -maxInt = sys.maxsize - -while True: - try: - csv.field_size_limit(maxInt) - break - except OverflowError: - maxInt = int(maxInt/10) - - -def get_modules(clusterone_results, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(clusterone_results) as results, open(f'{output_dir}/clusterone-module-list.tsv', 'w') as output: - csv_reader = csv.reader(results, delimiter=',') - - try: - next(csv_reader) # Skip header - except Exception as e: - # Sometimes the generated CSV has null bytes - csv_reader = csv.reader(codecs.open( - clusterone_results, 'rU', 'utf-16')) - next(csv_reader) - - for line in csv_reader: - modules = line[-1] - modules = modules.replace(" ", "\t") - output.write(modules + "\n") - - print(f'Generated {output_dir}/clusterone-module-list.tsv') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'clusterone_results', help='CSV file corresponding to the results of running ClusterONE') - parser.add_argument( - 'output_dir', help='output directory for the text file containing only the modules found via ClusterONE' - ) - - args = parser.parse_args() - - get_modules(args.clusterone_results, args.output_dir) +import csv +import os +import sys +import codecs + +maxInt = sys.maxsize + +while True: + try: + csv.field_size_limit(maxInt) + break + except OverflowError: + maxInt = int(maxInt/10) + + +def get_modules(clusterone_results, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(clusterone_results) as results, open(f'{output_dir}/clusterone-module-list.tsv', 'w') as output: + csv_reader = csv.reader(results, delimiter=',') + + try: + next(csv_reader) # Skip header + except Exception as e: + # Sometimes the generated CSV has null bytes + csv_reader = csv.reader(codecs.open( + clusterone_results, 'rU', 'utf-16')) + next(csv_reader) + + for line in csv_reader: + modules = line[-1] + modules = modules.replace(" ", "\t") + output.write(modules + "\n") + + print(f'Generated {output_dir}/clusterone-module-list.tsv') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'clusterone_results', help='CSV file corresponding to the results of running ClusterONE') + parser.add_argument( + 'output_dir', help='output directory for the text file containing only the modules found via ClusterONE' + ) + + args = parser.parse_args() + + get_modules(args.clusterone_results, args.output_dir) diff --git 
a/prepare_data/workflow/scripts/module_util/restore-node-labels-in-modules.py b/prepare_data/workflow/scripts/module_util/restore-node-labels-in-modules.py index fcb2ce27..12fbce10 100644 --- a/prepare_data/workflow/scripts/module_util/restore-node-labels-in-modules.py +++ b/prepare_data/workflow/scripts/module_util/restore-node-labels-in-modules.py @@ -1,41 +1,41 @@ -import os -import pickle - - -def restore_node_labels(module_list_file, mapping_file, module_list_dir, algo): - if not os.path.exists(module_list_dir): - os.makedirs(module_list_dir) - - with open(module_list_file, 'r') as modules, open(mapping_file, 'rb') as mapping, open(f'{module_list_dir}/{algo}-module-list.tsv', 'w') as output: - mapping_dict = pickle.load(mapping) - - for module in modules: - module = module.rstrip() - nodes = module.split('\t') - - mapped_nodes = [] - for node in nodes: - mapped_nodes.append(mapping_dict[int(node)]) - - output.write('\t'.join(mapped_nodes) + '\n') - - print(f'Generated {module_list_dir}/{algo}-module-list.tsv') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'module_list_file', help='text file corresponding to the module list where the node labels are integers') - parser.add_argument( - 'mapping_file', help='pickled integer-to-string node label mapping dictionary') - parser.add_argument( - 'module_list_dir', help='output directory for the module list where the nodes have been relabeled to their (original) string labels') - parser.add_argument( - 'algo', help='name of community detection algorithm' - ) - - args = parser.parse_args() - - restore_node_labels(args.module_list_file, - args.mapping_file, args.module_list_dir, args.algo) +import os +import pickle + + +def restore_node_labels(module_list_file, mapping_file, module_list_dir, algo): + if not os.path.exists(module_list_dir): + os.makedirs(module_list_dir) + + with open(module_list_file, 'r') as modules, open(mapping_file, 'rb') as mapping, open(f'{module_list_dir}/{algo}-module-list.tsv', 'w') as output: + mapping_dict = pickle.load(mapping) + + for module in modules: + module = module.rstrip() + nodes = module.split('\t') + + mapped_nodes = [] + for node in nodes: + mapped_nodes.append(mapping_dict[int(node)]) + + output.write('\t'.join(mapped_nodes) + '\n') + + print(f'Generated {module_list_dir}/{algo}-module-list.tsv') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'module_list_file', help='text file corresponding to the module list where the node labels are integers') + parser.add_argument( + 'mapping_file', help='pickled integer-to-string node label mapping dictionary') + parser.add_argument( + 'module_list_dir', help='output directory for the module list where the nodes have been relabeled to their (original) string labels') + parser.add_argument( + 'algo', help='name of community detection algorithm' + ) + + args = parser.parse_args() + + restore_node_labels(args.module_list_file, + args.mapping_file, args.module_list_dir, args.algo) diff --git a/prepare_data/workflow/scripts/network_util/convert-to-int-edge-list.py b/prepare_data/workflow/scripts/network_util/convert-to-int-edge-list.py index 7ffb3f00..fcdc86e7 100644 --- a/prepare_data/workflow/scripts/network_util/convert-to-int-edge-list.py +++ b/prepare_data/workflow/scripts/network_util/convert-to-int-edge-list.py @@ -1,55 +1,55 @@ -import os -import pickle - - -def convert_to_int_edge_list(string_to_int_mapping, int_to_string_mapping, 
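# A minimal illustrative sketch (made-up modules and mapping) of
# restore_node_labels() above: each tab-separated row of integer node labels
# is translated back to the original string gene labels via the pickled
# integer-to-string dictionary.
mapping_dict = {0: 'geneA', 1: 'geneB', 2: 'geneC'}
int_modules = ['0\t1', '1\t2']                      # one module per row

relabeled = ['\t'.join(mapping_dict[int(node)] for node in row.split('\t'))
             for row in int_modules]
assert relabeled == ['geneA\tgeneB', 'geneB\tgeneC']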
input_file, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(input_file, 'r') as orig_graph, open(f'{output_dir}/int-edge-list.txt', 'w') as int_graph: - for line in orig_graph: - edges = line.split('\t') - - if edges[0] not in string_to_int_mapping: - string_to_int_mapping[edges[0]] = len(string_to_int_mapping) - int_to_string_mapping[string_to_int_mapping[edges[0]]] = edges[0] - - if edges[1] not in string_to_int_mapping: - string_to_int_mapping[edges[1]] = len(string_to_int_mapping) - int_to_string_mapping[string_to_int_mapping[edges[1]]] = edges[1] - - node1 = str(string_to_int_mapping[edges[0]]) - node2 = str(string_to_int_mapping[edges[1]]) - - int_graph.write(node1 + " " + node2 + "\n") - - print("Finished mapping string node labels to integer node labels") - - -def save_node_mapping(int_to_string_mapping, node_mapping_dir): - if not os.path.exists(node_mapping_dir): - os.makedirs(node_mapping_dir) - - with open(f'{node_mapping_dir}/int-edge-list-node-mapping.pickle', 'wb') as handle: - pickle.dump(int_to_string_mapping, handle, - protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {node_mapping_dir}/int-edge-list-node-mapping.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'input_edge_list_file', help='text file corresponding to the edge list where the node labels are strings') - parser.add_argument( - 'output_dir', help='output directory for the edge list with the node labels converted to integers and for the pickled integer-to-string node label mapping dictionary' - ) - - args = parser.parse_args() - - string_to_int_mapping = {} - int_to_string_mapping = {} - convert_to_int_edge_list( - string_to_int_mapping, int_to_string_mapping, args.input_edge_list_file, args.output_dir) - save_node_mapping(int_to_string_mapping, args.output_dir) +import os +import pickle + + +def convert_to_int_edge_list(string_to_int_mapping, int_to_string_mapping, input_file, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(input_file, 'r') as orig_graph, open(f'{output_dir}/int-edge-list.txt', 'w') as int_graph: + for line in orig_graph: + edges = line.split('\t') + + if edges[0] not in string_to_int_mapping: + string_to_int_mapping[edges[0]] = len(string_to_int_mapping) + int_to_string_mapping[string_to_int_mapping[edges[0]]] = edges[0] + + if edges[1] not in string_to_int_mapping: + string_to_int_mapping[edges[1]] = len(string_to_int_mapping) + int_to_string_mapping[string_to_int_mapping[edges[1]]] = edges[1] + + node1 = str(string_to_int_mapping[edges[0]]) + node2 = str(string_to_int_mapping[edges[1]]) + + int_graph.write(node1 + " " + node2 + "\n") + + print("Finished mapping string node labels to integer node labels") + + +def save_node_mapping(int_to_string_mapping, node_mapping_dir): + if not os.path.exists(node_mapping_dir): + os.makedirs(node_mapping_dir) + + with open(f'{node_mapping_dir}/int-edge-list-node-mapping.pickle', 'wb') as handle: + pickle.dump(int_to_string_mapping, handle, + protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {node_mapping_dir}/int-edge-list-node-mapping.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_edge_list_file', help='text file corresponding to the edge list where the node labels are strings') + parser.add_argument( + 'output_dir', help='output directory for the edge list with the node labels converted to integers and for 
the pickled integer-to-string node label mapping dictionary' + ) + + args = parser.parse_args() + + string_to_int_mapping = {} + int_to_string_mapping = {} + convert_to_int_edge_list( + string_to_int_mapping, int_to_string_mapping, args.input_edge_list_file, args.output_dir) + save_node_mapping(int_to_string_mapping, args.output_dir) diff --git a/prepare_data/workflow/scripts/ogi_mapping/generate-ogi-dicts.py b/prepare_data/workflow/scripts/ogi_mapping/generate-ogi-dicts.py index 676a8ae0..87cc972c 100644 --- a/prepare_data/workflow/scripts/ogi_mapping/generate-ogi-dicts.py +++ b/prepare_data/workflow/scripts/ogi_mapping/generate-ogi-dicts.py @@ -1,110 +1,110 @@ -import csv -import os -import pickle - - -def get_rice_variants(path): - rice_variants = [] - - with open(f'{path}/core.ogi') as f: - csv_reader = csv.reader(f, delimiter='\t') - for row in csv_reader: - rice_variants = row - break - - for i in range(len(rice_variants)): - # Remove OS prefix - if rice_variants[i][:len('Os')] == 'Os': - rice_variants[i] = rice_variants[i][len('OS'):] - - # Nipponbare is abbreviated as 'Nb' in the app but 'Nip' in RGI - if rice_variants[i] == 'Nip': - rice_variants[i] = 'Nb' - - # Remove LOC - if rice_variants[i] == 'LOC': - rice_variants[i] = '' - - rice_variants = [ - rice_variant for rice_variant in rice_variants if rice_variant != ''] - - return rice_variants - - -def make_mapping_dicts(rice_variants): - mapping_dicts = [] - for rice_variant in rice_variants: - mapping_dicts.append({}) - - return mapping_dicts - - -def separate_paralogs(genes): - if ',' in genes: - paralogs = genes.split(',') - return paralogs - - return [genes] - - -def generate_dict(ogi_file, mapping_dicts): - with open(ogi_file) as f: - csv_reader = csv.reader(f, delimiter='\t') - - # Skip header row - next(csv_reader, None) - - for row in csv_reader: - NB_ACCESSION = 1 - mapping_dict_idx = 0 - for idx in range(NB_ACCESSION, len(row)): - - # Skip indices 2 and 3. They are also Nipponbare accession numbers. 
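# A minimal illustrative sketch (made-up edges) of the ID-assignment trick in
# convert_to_int_edge_list() above: len(string_to_int) is always the next
# unused integer, so labels get IDs in first-seen order while the inverse
# dictionary is built in the same pass.
string_to_int, int_to_string = {}, {}

for node1, node2 in [('geneA', 'geneB'), ('geneB', 'geneC')]:
    for label in (node1, node2):
        if label not in string_to_int:
            string_to_int[label] = len(string_to_int)
            int_to_string[string_to_int[label]] = label

assert string_to_int == {'geneA': 0, 'geneB': 1, 'geneC': 2}
assert int_to_string[2] == 'geneC'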
- # But the app uses the Nipponbare accession at index 1 - if not (2 <= idx and idx <= 3): - try: - gene_str = row[idx].strip() - - if gene_str != '.': - genes = separate_paralogs(row[idx].strip()) - for gene in genes: - if gene != '': - mapping_dicts[mapping_dict_idx][gene] = row[0].strip( - ) - - except IndexError: - break - - mapping_dict_idx += 1 - - -def pickle_mapping_dicts(path_mapping_dicts, mapping_dicts): - if not os.path.exists(path_mapping_dicts): - os.makedirs(path_mapping_dicts) - - for rice_variant, mapping_dict in zip(rice_variants, mapping_dicts): - pickle_path = f'{path_mapping_dicts}/{rice_variant}_to_ogi.pickle' - with open(pickle_path, 'wb') as f: - pickle.dump(mapping_dict, f, protocol=pickle.HIGHEST_PROTOCOL) - print( - f'Generated {path_mapping_dicts}/{rice_variant}_to_ogi.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - 'input_dir', help='directory containing the gene ID mapping from RGI') - parser.add_argument( - 'output_dir', help='output directory for the pickled accession-to-OGI mapping dictionaries') - - args = parser.parse_args() - - rice_variants = get_rice_variants(args.input_dir) - mapping_dicts = make_mapping_dicts(rice_variants) - - for file in os.listdir(args.input_dir): - generate_dict(f'{args.input_dir}/{file}', mapping_dicts) - print(f'Generated dictionary for {args.input_dir}/{file}') - - pickle_mapping_dicts(args.output_dir, mapping_dicts) +import csv +import os +import pickle + + +def get_rice_variants(path): + rice_variants = [] + + with open(f'{path}/core.ogi') as f: + csv_reader = csv.reader(f, delimiter='\t') + for row in csv_reader: + rice_variants = row + break + + for i in range(len(rice_variants)): + # Remove OS prefix + if rice_variants[i][:len('Os')] == 'Os': + rice_variants[i] = rice_variants[i][len('OS'):] + + # Nipponbare is abbreviated as 'Nb' in the app but 'Nip' in RGI + if rice_variants[i] == 'Nip': + rice_variants[i] = 'Nb' + + # Remove LOC + if rice_variants[i] == 'LOC': + rice_variants[i] = '' + + rice_variants = [ + rice_variant for rice_variant in rice_variants if rice_variant != ''] + + return rice_variants + + +def make_mapping_dicts(rice_variants): + mapping_dicts = [] + for rice_variant in rice_variants: + mapping_dicts.append({}) + + return mapping_dicts + + +def separate_paralogs(genes): + if ',' in genes: + paralogs = genes.split(',') + return paralogs + + return [genes] + + +def generate_dict(ogi_file, mapping_dicts): + with open(ogi_file) as f: + csv_reader = csv.reader(f, delimiter='\t') + + # Skip header row + next(csv_reader, None) + + for row in csv_reader: + NB_ACCESSION = 1 + mapping_dict_idx = 0 + for idx in range(NB_ACCESSION, len(row)): + + # Skip indices 2 and 3. They are also Nipponbare accession numbers. 
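# A minimal illustrative sketch (the OGI ID and accessions are made up) of how
# generate_dict() treats one RGI row: columns 2 and 3 (duplicate Nipponbare
# accessions) are skipped, '.' means no ortholog, and comma-separated paralogs
# in a cell all map back to the OGI ID in column 0.
row = ['OGI0001', 'LOC_Os01g01010', 'dup1', 'dup2', 'geneX,geneY', '.']
mapping_dicts = [{}, {}, {}]                 # one dict per rice variant
ogi = row[0].strip()

dict_idx = 0
for idx in range(1, len(row)):
    if not (2 <= idx <= 3):                  # skip the duplicate columns
        cell = row[idx].strip()
        if cell != '.':
            for gene in cell.split(','):     # paralogs share one cell
                if gene != '':
                    mapping_dicts[dict_idx][gene] = ogi
        dict_idx += 1

assert mapping_dicts == [{'LOC_Os01g01010': 'OGI0001'},
                         {'geneX': 'OGI0001', 'geneY': 'OGI0001'},
                         {}]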
+ # But the app uses the Nipponbare accession at index 1 + if not (2 <= idx and idx <= 3): + try: + gene_str = row[idx].strip() + + if gene_str != '.': + genes = separate_paralogs(row[idx].strip()) + for gene in genes: + if gene != '': + mapping_dicts[mapping_dict_idx][gene] = row[0].strip( + ) + + except IndexError: + break + + mapping_dict_idx += 1 + + +def pickle_mapping_dicts(path_mapping_dicts, mapping_dicts): + if not os.path.exists(path_mapping_dicts): + os.makedirs(path_mapping_dicts) + + for rice_variant, mapping_dict in zip(rice_variants, mapping_dicts): + pickle_path = f'{path_mapping_dicts}/{rice_variant}_to_ogi.pickle' + with open(pickle_path, 'wb') as f: + pickle.dump(mapping_dict, f, protocol=pickle.HIGHEST_PROTOCOL) + print( + f'Generated {path_mapping_dicts}/{rice_variant}_to_ogi.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_dir', help='directory containing the gene ID mapping from RGI') + parser.add_argument( + 'output_dir', help='output directory for the pickled accession-to-OGI mapping dictionaries') + + args = parser.parse_args() + + rice_variants = get_rice_variants(args.input_dir) + mapping_dicts = make_mapping_dicts(rice_variants) + + for file in os.listdir(args.input_dir): + generate_dict(f'{args.input_dir}/{file}', mapping_dicts) + print(f'Generated dictionary for {args.input_dir}/{file}') + + pickle_mapping_dicts(args.output_dir, mapping_dicts) diff --git a/prepare_data/workflow/scripts/qtaro/prepare_qtaro.py b/prepare_data/workflow/scripts/qtaro/prepare_qtaro.py index 23636fd7..581e17c8 100644 --- a/prepare_data/workflow/scripts/qtaro/prepare_qtaro.py +++ b/prepare_data/workflow/scripts/qtaro/prepare_qtaro.py @@ -1,53 +1,53 @@ -import csv -from collections import defaultdict -import os -import pickle - - -def convert_default_to_vanilla_dict(d): - """ - Lifted from https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o - """ - if isinstance(d, defaultdict): - d = {k: convert_default_to_vanilla_dict(v) for k, v in d.items()} - return d - - -def prepare_qtaro_mapping(annotation_file): - mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) - with open(annotation_file, encoding='utf8') as qtaro: - csv_reader = csv.reader(qtaro, delimiter=',') - for line in csv_reader: - gene = line[-1] - character_major = line[3] - character_minor = line[4] - pub = line[-2] - - mapping[gene][character_major][character_minor].add(pub) - - print("Generated dictionary from QTARO annotation file") - - return convert_default_to_vanilla_dict(mapping) - - -def export_mapping(mapping, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(f'{output_dir}/qtaro.pickle', 'wb') as handle: - pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL) - - print(f'Generated {output_dir}/qtaro.pickle') - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('annotation_file', help='annotation file from QTARO') - parser.add_argument( - 'output_dir', help='output directory for the dictionary resulting from preprocessing the QTARO annotation file') - - args = parser.parse_args() - - mapping = prepare_qtaro_mapping(args.annotation_file) - export_mapping(mapping, args.output_dir) +import csv +from collections import defaultdict +import os +import pickle + + +def convert_default_to_vanilla_dict(d): + """ + Lifted from 
https://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o + """ + if isinstance(d, defaultdict): + d = {k: convert_default_to_vanilla_dict(v) for k, v in d.items()} + return d + + +def prepare_qtaro_mapping(annotation_file): + mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(set))) + with open(annotation_file, encoding='utf8') as qtaro: + csv_reader = csv.reader(qtaro, delimiter=',') + for line in csv_reader: + gene = line[-1] + character_major = line[3] + character_minor = line[4] + pub = line[-2] + + mapping[gene][character_major][character_minor].add(pub) + + print("Generated dictionary from QTARO annotation file") + + return convert_default_to_vanilla_dict(mapping) + + +def export_mapping(mapping, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(f'{output_dir}/qtaro.pickle', 'wb') as handle: + pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL) + + print(f'Generated {output_dir}/qtaro.pickle') + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('annotation_file', help='annotation file from QTARO') + parser.add_argument( + 'output_dir', help='output directory for the dictionary resulting from preprocessing the QTARO annotation file') + + args = parser.parse_args() + + mapping = prepare_qtaro_mapping(args.annotation_file) + export_mapping(mapping, args.output_dir) diff --git a/prepare_data/workflow/scripts/text_mining/get_pubmed_per_gene.py b/prepare_data/workflow/scripts/text_mining/get_pubmed_per_gene.py index cc03dbc5..6d6ab790 100644 --- a/prepare_data/workflow/scripts/text_mining/get_pubmed_per_gene.py +++ b/prepare_data/workflow/scripts/text_mining/get_pubmed_per_gene.py @@ -1,494 +1,494 @@ -# =========== -# CORE LOGIC -# =========== - -# Go through each symbol associated with gene -# We use the word "symbol" to refer to the different accession IDs and gene symbols. - -# - The symbol should not be "sandwiched" between alphanumeric characters -# - This is to disambiguate PK1 from PK12 -# - This is also so that PK1 in (PK1) can still be matched - -# - The symbol should not be after sp. or spp. (or their variants w/o periods) -# - This is to disambiguate gene symbols from taxonomic nomenclature - -# - If the symbol has 2 letters only, make matching case-sensitive -# - This is to disambiguate go from GO -# - Otherwise, make matching case-insensitive - -# - If the symbol is an English word, make matching case-sensitive -# - This is to disambiguate coin from COIN (cold inducible zinc finger protein) -# - We are using the English word corpus from the NLTK - -# - We have to replace some symbols for better disambiguation (based on a manually compiled list) -# - For now, the only entry is tips. It yielded a lot of false matches with root tips etc. -# - So we replace tips with TIPS and TIPs - -# - We have to exclude some symbols under certain contexts (based on a manually compiled list) -# - For example, PS is a symbol for some pistilloid-stamen gene. -# However, PS is more commonly used in literature as an abbreviation for photosystem -# - Another case is LOG, which is a symbol for lonely guy gene, but can often appear in the context of log 2 -# - Our disambiguation strategy is as follows: -# - Let x be the symbol of interest. It should be excluded if it stands for or refers to y. -# - Let S be the set of symbols for that gene. Let S' = S \ {x} . 
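# A minimal illustrative sketch (the gene and PMID are made up) of
# convert_default_to_vanilla_dict() and the three-level QTARO mapping above
# (gene -> major character -> minor character -> set of publications):
from collections import defaultdict

mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
mapping['gene1']['morphological trait']['panicle'].add('PMID:12345')

def to_vanilla(d):
    # Recurse only into defaultdicts; the leaf sets are left untouched.
    if isinstance(d, defaultdict):
        d = {k: to_vanilla(v) for k, v in d.items()}
    return d

plain = to_vanilla(mapping)
assert type(plain) is dict
assert plain['gene1']['morphological trait']['panicle'] == {'PMID:12345'}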
-# - If a PubMed article has a match in S', then it is included
-# - If a PubMed article matches x but the article also contains y, then it is excluded
-# - If a PubMed article matches x and the article does not contain y, then it is included
-
-from nltk.corpus import words
-import pandas as pd
-import os
-import regex as re
-import csv
-import pickle
-
-from collections import defaultdict
-
-import nltk
-nltk.download('words')
-
-ENG_WORDS = set(words.words())
-
-COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
-
-SPECIES_LOOKBEHIND = '(?
+# ===========
+# CORE LOGIC
+# ===========
+
+# Go through each symbol associated with gene
+# We use the word "symbol" to refer to the different accession IDs and gene symbols.
+
+# - The symbol should not be "sandwiched" between alphanumeric characters
+# - This is to disambiguate PK1 from PK12
+# - This is also so that PK1 in (PK1) can still be matched
+
+# - The symbol should not be after sp. or spp. (or their variants w/o periods)
+# - This is to disambiguate gene symbols from taxonomic nomenclature
+
+# - If the symbol has 2 letters only, make matching case-sensitive
+# - This is to disambiguate go from GO
+# - Otherwise, make matching case-insensitive
+
+# - If the symbol is an English word, make matching case-sensitive
+# - This is to disambiguate coin from COIN (cold inducible zinc finger protein)
+# - We are using the English word corpus from the NLTK
+
+# - We have to replace some symbols for better disambiguation (based on a manually compiled list)
+# - For now, the only entry is tips. It yielded a lot of false matches with root tips etc.
+# - So we replace tips with TIPS and TIPs
+
+# - We have to exclude some symbols under certain contexts (based on a manually compiled list)
+# - For example, PS is a symbol for some pistilloid-stamen gene.
+# However, PS is more commonly used in literature as an abbreviation for photosystem
+# - Another case is LOG, which is a symbol for lonely guy gene, but can often appear in the context of log 2
+# - Our disambiguation strategy is as follows:
+# - Let x be the symbol of interest. It should be excluded if it stands for or refers to y.
+# - Let S be the set of symbols for that gene. Let S' = S \ {x} .
+# - If a PubMed article has a match in S', then it is included
+# - If a PubMed article matches x but the article also contains y, then it is excluded
+# - If a PubMed article matches x and the article does not contain y, then it is included
+
+from nltk.corpus import words
+import pandas as pd
+import os
+import regex as re
+import csv
+import pickle
+
+from collections import defaultdict
+
+import nltk
+nltk.download('words')
+
+ENG_WORDS = set(words.words())
+
+COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
+
+SPECIES_LOOKBEHIND = '(?
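# A minimal illustrative sketch of the matching rules spelled out in the CORE
# LOGIC comments above. The SPECIES_LOOKBEHIND definition is cut off in this
# patch, so the patterns below are assumptions, not the script's own regexes;
# they use the same third-party `regex` library the script imports, which
# allows the variable-length lookbehind.
import regex as re

def symbol_pattern(symbol, eng_words):
    # Case-sensitive for two-letter symbols (go vs. GO) and English words
    # (coin vs. COIN); case-insensitive otherwise.
    flags = 0 if len(symbol) == 2 or symbol.lower() in eng_words else re.IGNORECASE
    # No alphanumeric neighbors (PK1 vs. PK12, but "(PK1)" still matches),
    # and not right after "sp."/"spp.", so taxonomic names are not mistaken
    # for gene symbols.
    return re.compile(r'(?<!\bspp?\.?\s)(?<![0-9A-Za-z])'
                      + re.escape(symbol) + r'(?![0-9A-Za-z])', flags)

assert symbol_pattern('PK1', set()).search('mutant (PK1) line')
assert not symbol_pattern('PK1', set()).search('mutant PK12 line')
assert not symbol_pattern('go', {'go'}).search('GO term enrichment')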