From 5176ef1be52127a73afbc4de2888e22818c38014 Mon Sep 17 00:00:00 2001
From: murphycj <charliemurphyj@gmail.com>
Date: Sun, 29 Sep 2024 19:47:38 -0400
Subject: [PATCH] update code for ensembl 111

---
 .gitignore           |  2 ++
 README.md            | 13 +++++++++++++
 agfusion/database.py | 25 ++++++++++++++-----------
 agfusion/utils.py    |  2 +-
 requirements.txt     |  2 +-
 5 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index ab17765..8335dff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ dist/
 *pyc
 build/
 *.db
+*.ipynb
+*clans.tsv
diff --git a/README.md b/README.md
index 12a062d..f06e7a0 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,19 @@ agfusion annotate \
 ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000064477-ENSMUST00000002487-rescale.png)
 ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000122054-ENSMUST00000070330-rescale.png)
 
+# Building your own database
+AGFusion annotates gene fusions using a pre-built SQLite database in addition to data from pyensembl. The pre-built SQLite databases are hosted on AWS S3.
+
+Follow the steps below if you want to build your own SQLite database:
+
+(1) Install [mysqlclient](https://github.com/PyMySQL/mysqlclient).
+
+(2) Download and unzip the PFAM reference file: [https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz](https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz)
+
+(3) Install your desired pyensembl reference genome. For example: `pyensembl install --release 111`.
+
+(4) Build the AGFusion database: `agfusion build -d . -s homo_sapiens -r 111 --pfam Pfam-A.clans.tsv`
+
 # Troubleshooting
 
 **(1) Problem:** I get a warning message like the following:
diff --git a/agfusion/database.py b/agfusion/database.py
index ba2debf..d219910 100644
--- a/agfusion/database.py
+++ b/agfusion/database.py
@@ -1,5 +1,6 @@
 """Classes for AGFusion database objects
 """
+
 import logging
 import sqlite3
 import sys
@@ -115,9 +116,14 @@ def __init__(self, db_dir, species, release, pfam, server):
         for line in open(pfam, "r"):
             line = line.rstrip().split("\t")
 
+            # old format
+            # pfam_id = line[0]
+            # pfam_name = line[1]
+            # pfam_desc = line[3]
+
             pfam_id = line[0]
-            pfam_name = line[1]
-            pfam_desc = line[3]
+            pfam_name = line[3]
+            pfam_desc = line[4]
 
             self.pfam_mapping[pfam_id] = {"name": pfam_name, "desc": pfam_desc}
 
@@ -257,15 +263,12 @@ def fetch_gene_names(self):
             sys.exit(1)
 
         mysql_command = (
-            """SELECT gene.gene_id, xref.display_label FROM " \
-            "gene, object_xref, xref,external_db WHERE " \
-            "gene.gene_id = object_xref.ensembl_id AND " \
-            "object_xref.ensembl_object_type = 'Gene' AND " \
-            "object_xref.xref_id = xref.xref_id AND " \
-            "xref.external_db_id = external_db.external_db_id AND " \
-            "external_db.db_name = '"""
-            + gene_name_db
-            + """';"""
+            "SELECT gene.gene_id, xref.display_label FROM gene, object_xref, xref, external_db "
+            "WHERE gene.gene_id = object_xref.ensembl_id "
+            "AND object_xref.ensembl_object_type = 'Gene' "
+            "AND object_xref.xref_id = xref.xref_id "
+            "AND xref.external_db_id = external_db.external_db_id "
+            f"AND external_db.db_name = '{gene_name_db}';"
         )
 
         self.logger.info("MySQL - %s", mysql_command)
diff --git a/agfusion/utils.py b/agfusion/utils.py
index 08a7796..5bbecac 100644
--- a/agfusion/utils.py
+++ b/agfusion/utils.py
@@ -13,7 +13,7 @@
 # this is mostly contigent on the maximum ensembl release supported
 # by pyensembl
 
-MAX_ENSEMBL_RELEASE = 95
+MAX_ENSEMBL_RELEASE = 111
 
 GENOME_SHORTCUTS = {
     "GRCm38": ["mus_musculus", MAX_ENSEMBL_RELEASE],
diff --git a/requirements.txt b/requirements.txt
index 5589114..c994487 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,4 @@ matplotlib>=3.6.1
 pandas>=1.5.1
 biopython>=1.79
 future>=0.16.0
-pyensembl>=1.1.0
+pyensembl>=2.3.13