From 5176ef1be52127a73afbc4de2888e22818c38014 Mon Sep 17 00:00:00 2001 From: murphycj Date: Sun, 29 Sep 2024 19:47:38 -0400 Subject: [PATCH] update code for ensembl 111 --- .gitignore | 2 ++ README.md | 13 +++++++++++++ agfusion/database.py | 25 ++++++++++++++----------- agfusion/utils.py | 2 +- requirements.txt | 2 +- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index ab17765..8335dff 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ dist/ *pyc build/ *.db +*.ipynb +*clans.tsv diff --git a/README.md b/README.md index 12a062d..f06e7a0 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,19 @@ agfusion annotate \ ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000064477-ENSMUST00000002487-rescale.png) ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000122054-ENSMUST00000070330-rescale.png) +# Building your own database +AGFusion uses a pre-built SQLite database to annotate gene fusions, in addition to data from pyensembl. The SQLite databases are stored on AWS S3: . + +Follow the steps below if you want to build your own SQLite database: + +(1) Install [mysqlclient](https://github.com/PyMySQL/mysqlclient). + +(2) Download and unzip the PFAM reference file: [https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz](https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz) + +(3) Install your desired pyensembl reference genome. For example: `pyensembl install --release 111`. + +(4) Build the AGFusion database: `agfusion build -d . 
-s homo_sapiens -r 111 --pfam Pfam-A.clans.tsv` + # Troubleshooting **(1) Problem:** I get a warning message like the following: diff --git a/agfusion/database.py b/agfusion/database.py index ba2debf..d219910 100644 --- a/agfusion/database.py +++ b/agfusion/database.py @@ -1,5 +1,6 @@ """Classes for AGFusion database objects """ + import logging import sqlite3 import sys @@ -115,9 +116,14 @@ def __init__(self, db_dir, species, release, pfam, server): for line in open(pfam, "r"): line = line.rstrip().split("\t") + # old format + # pfam_id = line[0] + # pfam_name = line[1] + # pfam_desc = line[3] + pfam_id = line[0] - pfam_name = line[1] - pfam_desc = line[3] + pfam_name = line[3] + pfam_desc = line[4] self.pfam_mapping[pfam_id] = {"name": pfam_name, "desc": pfam_desc} @@ -257,15 +263,12 @@ def fetch_gene_names(self): sys.exit(1) mysql_command = ( - """SELECT gene.gene_id, xref.display_label FROM " \ - "gene, object_xref, xref,external_db WHERE " \ - "gene.gene_id = object_xref.ensembl_id AND " \ - "object_xref.ensembl_object_type = 'Gene' AND " \ - "object_xref.xref_id = xref.xref_id AND " \ - "xref.external_db_id = external_db.external_db_id AND " \ - "external_db.db_name = '""" - + gene_name_db - + """';""" + "SELECT gene.gene_id, xref.display_label FROM gene, object_xref, xref, external_db " + "WHERE gene.gene_id = object_xref.ensembl_id " + "AND object_xref.ensembl_object_type = 'Gene' " + "AND object_xref.xref_id = xref.xref_id " + "AND xref.external_db_id = external_db.external_db_id " + f"AND external_db.db_name = '{gene_name_db}';" ) self.logger.info("MySQL - %s", mysql_command) diff --git a/agfusion/utils.py b/agfusion/utils.py index 08a7796..5bbecac 100644 --- a/agfusion/utils.py +++ b/agfusion/utils.py @@ -13,7 +13,7 @@ # this is mostly contigent on the maximum ensembl release supported # by pyensembl -MAX_ENSEMBL_RELEASE = 95 +MAX_ENSEMBL_RELEASE = 111 GENOME_SHORTCUTS = { "GRCm38": ["mus_musculus", MAX_ENSEMBL_RELEASE], diff --git a/requirements.txt 
b/requirements.txt index 5589114..c994487 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ matplotlib>=3.6.1 pandas>=1.5.1 biopython>=1.79 future>=0.16.0 -pyensembl>=1.1.0 +pyensembl>=2.3.13