From f23a3f03e0bca0c548afd4d0ed1a60ca8228f59b Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sat, 15 Oct 2022 15:23:54 +0200
Subject: [PATCH 01/21] fetchPfamMSA from Interpro basic

---
 prody/database/pfam.py | 96 +++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 43 deletions(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 2b8b886b9..53f6ca93d 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -29,8 +29,10 @@
 SELEX = 'selex'
 STOCKHOLM = 'stockholm'
 
-DOWNLOAD_FORMATS = set(['seed', 'full', 'ncbi', 'metagenomics',
-                        'rp15', 'rp35', 'rp55', 'rp75', 'uniprot'])
+DOWNLOAD_FORMATS = set(['seed', 'full', 'uniprot', 
+                        #'ncbi', 'metagenomics',
+                        #'rp15', 'rp35', 'rp55', 'rp75'
+                        ])
 FORMAT_OPTIONS = ({'format': set([FASTA, SELEX, STOCKHOLM]),
                   'order': set(['tree', 'alphabetical']),
                   'inserts': set(['lower', 'upper']),
@@ -40,6 +42,7 @@
 
 old_prefix = 'https://pfam.xfam.org/'
 prefix = 'https://pfam-legacy.xfam.org/'
+new_prefix = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/'
 
 def searchPfam(query, **kwargs):
     """Returns Pfam search results in a dictionary.  Matching Pfam accession
@@ -357,50 +360,57 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
                          .format(repr(orig_acc)))
 
     if alignment not in DOWNLOAD_FORMATS:
-        raise ValueError('alignment must be one of full, seed, ncbi or'
-                         ' metagenomics')
-    if alignment == 'ncbi' or alignment == 'metagenomics' or alignment == 'uniprot':
-        url = (prefix + 'family/' + acc + '/alignment/' +
-               alignment + '/gzipped')
+        raise ValueError('alignment must be one of full, seed,'
+                         #' ncbi or'
+                         #' metagenomics'
+                         ' or uniprot')
+    # if alignment == 'ncbi' or alignment == 'metagenomics' or alignment == 'uniprot':
+    #     #url = (prefix + 'family/' + acc + '/alignment/' +
+    #     #       alignment + '/gzipped')
+    #     url = (new_prefix + acc + 
+    #            '/?annotation=alignment:' + alignment + '&download')
+    #     url_flag = True
+    #     extension = '.sth'
+    # else:
+    if not kwargs:
+        #url = (prefix + 'family/' + acc + '/alignment/' +
+        #       alignment + '/gzipped')
+        url = (new_prefix + acc + 
+                '/?annotation=alignment:' + alignment + '&download')
         url_flag = True
         extension = '.sth'
     else:
-        if not kwargs:
-            url = (prefix + 'family/' + acc + '/alignment/' +
-                   alignment + '/gzipped')
-            url_flag = True
-            extension = '.sth'
-        else:
-            align_format = kwargs.get('format', 'selex').lower()
-
-            if align_format not in FORMAT_OPTIONS['format']:
-                raise ValueError('alignment format must be of type selex'
-                                 ' stockholm or fasta. MSF not supported')
-
-            if align_format == SELEX:
-                align_format, extension = 'pfam', '.slx'
-            elif align_format == FASTA:
-                extension = '.fasta'
-            else:
-                extension = '.sth'
-
-            gaps = str(kwargs.get('gaps', 'dashes')).lower()
-            if gaps not in FORMAT_OPTIONS['gaps']:
-                raise ValueError('gaps must be of type mixed, dots, dashes, '
-                                 'or None')
-
-            inserts = kwargs.get('inserts', 'upper').lower()
-            if(inserts not in FORMAT_OPTIONS['inserts']):
-                raise ValueError('inserts must be of type lower or upper')
-
-            order = kwargs.get('order', 'tree').lower()
-            if order not in FORMAT_OPTIONS['order']:
-                raise ValueError('order must be of type tree or alphabetical')
-
-            url = (prefix + 'family/' + acc + '/alignment/'
-                   + alignment + '/format?format=' + align_format +
-                   '&alnType=' + alignment + '&order=' + order[0] +
-                   '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')
+        raise ValueError('kwargs are not supported for Interpro Pfam')
+    #     align_format = kwargs.get('format', 'selex').lower()
+
+    #     if align_format not in FORMAT_OPTIONS['format']:
+    #         raise ValueError('alignment format must be of type selex'
+    #                             ' stockholm or fasta. MSF not supported')
+
+    #     if align_format == SELEX:
+    #         align_format, extension = 'pfam', '.slx'
+    #     elif align_format == FASTA:
+    #         extension = '.fasta'
+    #     else:
+    #         extension = '.sth'
+
+    #     gaps = str(kwargs.get('gaps', 'dashes')).lower()
+    #     if gaps not in FORMAT_OPTIONS['gaps']:
+    #         raise ValueError('gaps must be of type mixed, dots, dashes, '
+    #                             'or None')
+
+    #     inserts = kwargs.get('inserts', 'upper').lower()
+    #     if(inserts not in FORMAT_OPTIONS['inserts']):
+    #         raise ValueError('inserts must be of type lower or upper')
+
+    #     order = kwargs.get('order', 'tree').lower()
+    #     if order not in FORMAT_OPTIONS['order']:
+    #         raise ValueError('order must be of type tree or alphabetical')
+
+    #     url = (prefix + 'family/' + acc + '/alignment/'
+    #             + alignment + '/format?format=' + align_format +
+    #             '&alnType=' + alignment + '&order=' + order[0] +
+    #             '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')
 
     LOGGER.timeit('_pfam')
     timeout = kwargs.get('timeout', 60)

From 4930f5c9c30aff069b01374b267f1ca6857b10fe Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sat, 15 Oct 2022 15:24:22 +0200
Subject: [PATCH 02/21] adjust tests to work with it

---
 prody/tests/database/test_pfam.py | 34 ++++++++++++++++---------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 0b83337fc..b4ccf6040 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -144,32 +144,34 @@ def testSeed(self):
         
     def testFormat(self):
         """Test the outcome of fetching the domain MSA for claudins
-        with keyword argument format set to fasta."""
+        with keyword argument format set to fasta. This should give a 
+        ValueError for now"""
 
-        b = fetchPfamMSA(self.query, format="fasta")
+        self.assertRaises(ValueError, fetchPfamMSA, self.query, format="fasta")
+        # b = fetchPfamMSA(self.query, format="fasta")
 
-        self.assertIsInstance(b, str,
-            'fetchPfamMSA failed to return a str instance')
+        # self.assertIsInstance(b, str,
+        #     'fetchPfamMSA failed to return a str instance')
         
-        self.assertEqual(b, 'PF00822_full.fasta')
+        # self.assertEqual(b, 'PF00822_full.fasta')
         
-        self.assertTrue(os.path.exists(b))
+        # self.assertTrue(os.path.exists(b))
 
 
-    def testFolder(self):
-        """Test the outcome of fetching the domain MSA for claudins
-        with keyword folder set to a folder that is made especially."""
+    # def testFolder(self):
+    #     """Test the outcome of fetching the domain MSA for claudins
+    #     with keyword folder set to a folder that is made especially."""
 
-        folder = "new_folder"
-        os.mkdir(folder)
-        b = fetchPfamMSA(self.query, folder=folder)
+    #     folder = "new_folder"
+    #     os.mkdir(folder)
+    #     b = fetchPfamMSA(self.query, folder=folder)
 
-        self.assertIsInstance(b, str,
-            'fetchPfamMSA failed to return a str instance')
+    #     self.assertIsInstance(b, str,
+    #         'fetchPfamMSA failed to return a str instance')
         
-        self.assertEqual(b, 'new_folder/PF00822_full.slx')
+    #     self.assertEqual(b, 'new_folder/PF00822_full.slx')
         
-        self.assertTrue(os.path.exists(b))
+    #     self.assertTrue(os.path.exists(b))
     
     @classmethod
     def tearDownClass(self):

From e444eb41d056d43fc3c5d44f72b3f8f59e1cfa3c Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sat, 15 Oct 2022 16:48:05 +0200
Subject: [PATCH 03/21] clean up searchPfam test

---
 prody/tests/database/test_pfam.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index b4ccf6040..f4f8c3a25 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -17,7 +17,14 @@
 LOGGER.verbosity = 'none'
 
 class TestSearchPfam(unittest.TestCase):
-    def setUp(self):
+    
+    @classmethod
+    def setUpClass(self):
+        self.workdir = 'pfam_search_tests'
+        if not os.path.exists(self.workdir):
+            os.mkdir(self.workdir)
+        os.chdir(self.workdir)
+
         self.queries = ['P19491', 'GRIA2_RAT', '6qkc', '6qkcB', '6qkcI', 
                         'VQVLLTTIGAFAAFGLMTIAISTDYWLYTRGLTHSGLWRICCLEGLK'\
                             'RGVCVKINHFAEYLLRVVRASSIFPILSAILLLLGGVCVAASR'\
@@ -101,6 +108,11 @@ def testSeqSingle(self):
         self.assertEqual(sorted(list(a.keys())), 
                            ['PF00822'],
                            'searchPfam failed to return the right domain family IDs for TARP')
+        
+    @classmethod
+    def tearDownClass(self):
+        os.chdir('..')
+        shutil.rmtree(self.workdir)
 
 
 class TestFetchPfamMSA(unittest.TestCase):

From 01a8a838c87a392e8e81b72aa119c35e2c221612 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 27 Oct 2022 18:31:08 +0200
Subject: [PATCH 04/21] fix pfam tests for py 3.5

---
 prody/tests/database/test_pfam.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index f4f8c3a25..14b186420 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -25,7 +25,7 @@ def setUpClass(self):
             os.mkdir(self.workdir)
         os.chdir(self.workdir)
 
-        self.queries = ['P19491', 'GRIA2_RAT', '6qkc', '6qkcB', '6qkcI', 
+        self.queries = ['P19491', 'GRIA2_RAT', '6qkcB', '6qkcI', 
                         'VQVLLTTIGAFAAFGLMTIAISTDYWLYTRGLTHSGLWRICCLEGLK'\
                             'RGVCVKINHFAEYLLRVVRASSIFPILSAILLLLGGVCVAASR'\
                             'VYKSKRNIILGAGILFVAAGLSNIIGVIVYISANAGKNHYSYG'\
@@ -57,24 +57,11 @@ def testUniprotIdMulti(self):
                            ['PF00060', 'PF01094', 'PF10613'],
                            'searchPfam failed to return the right domain family IDs')
         
-    def testPdbIdMulti(self):
-        """Test the outcome of a simple search scenario using a PDB ID
-        containing the same multi-domain protein from taking the first chain by default."""
-
-        a = searchPfam(self.queries[2])
-
-        self.assertIsInstance(a, dict,
-            'searchPfam failed to return a dict instance')
-        
-        self.assertEqual(sorted(list(a.keys())), 
-                           ['PF00060', 'PF01094', 'PF10613'],
-                           'searchPfam failed to return the right domain family IDs')
-        
     def testPdbIdChMulti(self):
         """Test the outcome of a simple search scenario using a PDB ID
         and chain ID for the same multi-domain protein from specifying chain B."""
 
-        a = searchPfam(self.queries[3])
+        a = searchPfam(self.queries[2])
 
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')
@@ -87,7 +74,7 @@ def testPdbIdChSingle(self):
         """Test the outcome of a simple search scenario using a PDB ID
         and chain ID to get the single domain protein TARP g8 from chain I."""
 
-        a = searchPfam(self.queries[4])
+        a = searchPfam(self.queries[3])
 
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')
@@ -100,7 +87,7 @@ def testSeqSingle(self):
         """Test the outcome of a simple search scenario using the sequence 
         of the single domain protein TARP g8 from 6qkc chain I."""
 
-        a = searchPfam(self.queries[5])
+        a = searchPfam(self.queries[4])
 
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')

From 56ef75fad8cb032691d364344646974117a542db Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 19 Jan 2023 18:37:07 +0100
Subject: [PATCH 05/21] add searchInterpro

---
 prody/database/__init__.py |  14 ++++-
 prody/database/interpro.py | 102 +++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 prody/database/interpro.py

diff --git a/prody/database/__init__.py b/prody/database/__init__.py
index fb523f789..87a736d32 100644
--- a/prody/database/__init__.py
+++ b/prody/database/__init__.py
@@ -10,8 +10,7 @@
   * :func:`.fetchPfamMSA` - download MSA files
   * :func:`.searchPfam` - search for domain families of a protein
 
-
-.. _Pfam: http://pfam.sanger.ac.uk/
+.. _Pfam: https://www.ebi.ac.uk/interpro/entry/pfam/
 
 UniProt
 ========
@@ -70,7 +69,14 @@
 
 .. _GOA: https://www.ebi.ac.uk/GOA/
 
+Interpro
+========
+
+The following functions can be used to search and retrieve Interpro_ data:
+
+  * :func:`.searchInterpro` - search for domain families of a protein
 
+.. _Interpro: https://www.ebi.ac.uk/interpro/
 """
 
 __all__ = []
@@ -98,3 +104,7 @@
 from . import quartataweb
 from .quartataweb import *
 __all__.extend(quartataweb.__all__)
+
+from . import interpro
+from .interpro import *
+__all__.extend(interpro.__all__)
diff --git a/prody/database/interpro.py b/prody/database/interpro.py
new file mode 100644
index 000000000..fe2d786af
--- /dev/null
+++ b/prody/database/interpro.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""This module defines functions for interfacing Interpro database."""
+
+__author__ = 'James Krieger'
+
+import json
+from os.path import isfile
+from prody import LOGGER, PY3K
+
+__all__ = ['searchInterpro']
+
+prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'
+
+def searchInterpro(query, **kwargs):
+    """Returns Interpro search results in a list of dictionaries.  
+    
+    Each dictionary describes one matching entry and its properties,
+    including start and end residue positions.
+
+    :arg query: UniProt accession or PDB identifier with or without a
+        chain identifier, e.g. ``'1mkp'`` or ``'1mkpA'``.  With a chain
+        identifier, the UniProt accession of that chain is searched;
+        a bare PDB identifier is searched as a structure
+    :type query: str
+
+    :arg timeout: timeout for blocking connection attempt in seconds, default
+        is 60
+    :type timeout: int
+    """
+    import requests
+    
+    LOGGER.timeit('_interpro')
+    timeout = int(kwargs.get('timeout', 60))
+
+    if len(query) == 4:
+        url = prefix + "all/structure/pdb/" + query
+        
+    elif len(query) == 5:
+        accession = None
+        
+        from prody import parsePDBHeader
+        try:
+            polymers = parsePDBHeader(query[:4], 'polymers')
+        except Exception as err:
+            raise ValueError('failed to parse header for {0} ({1})'
+                             .format(query[:4], str(err)))
+
+        chid = query[4:].upper()
+        
+        for poly in polymers:
+            if chid and poly.chid != chid:
+                continue
+            for dbref in poly.dbrefs:
+                if dbref.database != 'UniProt':
+                    continue
+                accession = dbref.accession
+                LOGGER.info('UniProt accession {0} for {1} chain '
+                            '{2} will be used.'
+                            .format(accession, query[:4], poly.chid))
+                break
+            if accession is not None:
+                break
+            
+        if accession is None:
+            raise ValueError('A UniProt accession for PDB {0} could not be '
+                             'parsed.'.format(repr(query)))
+        else:
+            url = prefix + "all/protein/uniprot/" + accession
+        
+    else:
+        url = prefix + "all/protein/uniprot/" + query
+
+    LOGGER.debug('Retrieving Interpro search results: ' + url)
+    result = None
+    sleep = 2
+    while LOGGER.timing('_interpro') < timeout:
+        try:
+            result = requests.get(url, verify=False).content
+        except Exception:
+            pass
+        else:
+            if result not in ['PEND','RUN']:
+                break
+        
+        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
+        LOGGER.sleep(int(sleep), '. Trying to reconnect...')
+
+    if not result:
+        raise IOError('Interpro search timed out or failed to parse results, '
+                      ' check URL: ' + url)
+    else:
+        LOGGER.report('Interpro search completed in %.2fs.', '_interpro')
+
+    if PY3K:
+        result = result.decode()
+        
+    try:
+        result = json.loads(result)
+    except Exception as err:
+        raise ValueError('failed to parse results as json, check URL: ' + url)
+
+    return result["results"]

From 768505f90e62243cdd7b78c9e7aec10fb9eb8e67 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 19 Jan 2023 19:04:42 +0100
Subject: [PATCH 06/21] working searchPfam

---
 prody/database/pfam.py | 230 ++++++++++-------------------------------
 1 file changed, 55 insertions(+), 175 deletions(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index b0a6bfe2e..230eca0f0 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """This module defines functions for interfacing Pfam database."""
 
-__author__ = 'Anindita Dutta, Ahmet Bakan, Cihan Kaya'
+__author__ = 'Anindita Dutta, Ahmet Bakan, Cihan Kaya, James Krieger'
 
 import re
 from numbers import Integral
@@ -22,6 +22,7 @@
     import urllib
     import urllib2
 
+import json
 
 __all__ = ['searchPfam', 'fetchPfamMSA', 'parsePfamPDBs']
 
@@ -38,180 +39,71 @@
                   'inserts': set(['lower', 'upper']),
                   'gaps': set(['mixed', 'dots', 'dashes', 'none'])})
 
-MINSEQLEN = 16
-
 old_prefix = 'https://pfam.xfam.org/'
 prefix = 'https://pfam-legacy.xfam.org/'
-new_prefix = 'https://www.ebi.ac.uk/interpro/wwwapi//entry/pfam/'
+new_prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'
 
 def searchPfam(query, **kwargs):
     """Returns Pfam search results in a dictionary.  Matching Pfam accession
     as keys will map to evalue, alignment start and end residue positions.
 
-    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
-        file. Sequence queries must not contain without gaps and must be at
-        least 16 characters long
+    :arg query: UniProt ID or PDB identifier with or without a
+        chain identifier, e.g. ``'1mkp'`` or ``'1mkpA'``.  
+        UniProt ID of the specified chain, or the first
+        protein chain will be used for searching the Pfam database
     :type query: str
 
     :arg timeout: timeout for blocking connection attempt in seconds, default
         is 60
     :type timeout: int
-
-    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
-    chain identifier.  UniProt ID of the specified chain, or the first
-    protein chain will be used for searching the Pfam database."""
+    """
 
     import requests
 
-    if isfile(query):
-        from prody.sequence import MSAFile
-        try:
-            seq = next(MSAFile(query))
-        except:
-            with openFile(query) as inp:
-                seq = ''.join(inp.read().split())
-        else:
-            seq = seq[0][1]
-        if not seq.isalpha():
-            raise ValueError('could not parse a sequence without gaps from ' +
-                             query)
-    else:
-        seq = ''.join(query.split())
+    seq = ''.join(query.split())
 
     import xml.etree.cElementTree as ET
     LOGGER.timeit('_pfam')
     timeout = int(kwargs.get('timeout', 60))
-    if len(seq) >= MINSEQLEN:
-        if not seq.isalpha():
-            raise ValueError(repr(seq) + ' is not a valid sequence')
-        fseq = '>Seq\n' + seq
-        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
-        enc_params = urllib.urlencode(parameters).encode('utf-8')
-        request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)
-
-        results_url = urllib2.urlopen(request).geturl()
-
-        #res_params = { 'output' : 'xml' }
-        res_params = { 'format' : 'tsv' }
-        enc_res_params = urllib.urlencode(res_params)
-        #modified_res_url = results_url + '?' + enc_res_params
-        modified_res_url = results_url.replace('results','download') + '?' + enc_res_params
-
-        result_request = urllib2.Request(modified_res_url) 
-        # url = ( urllib2.urlopen(request).geturl() + '?output=xml') 
-        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
-                     .format(seq[:MINSEQLEN]))
 
+    if len(seq) <= 5:
+        accession = None
+        from prody import parsePDBHeader
         try:
-            #xml = urllib2.urlopen(result_request).read()
-            tsv = urllib2.urlopen(result_request).read()
-            # openURL(url, timeout=timeout).read()
-        except:
-            raise ValueError('No matching Pfam domains were found.')
-        
-        # try:
-        #     root = ET.XML(xml)
-        # except Exception as err:
-        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)
-
-        matches = {}
-        #for child in root[0]:
-            #if child.tag == 'hits':
-                # accession = child.get('acc')
-                # pfam_id = accession.split('.')[0]
-                # matches[pfam_id]={}
-                # matches[pfam_id]['accession']=accession
-                # matches[pfam_id]['class']='Domain'
-                # matches[pfam_id]['id']=child.get('name')
-                # matches[pfam_id]['locations']={}
-                # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
-                # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
-                # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
-                # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
-                # matches[pfam_id]['locations']['evalue']=child.get('evalue')
-                # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
-                # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
-                # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
-                # matches[pfam_id]['locations']['significant']=child[0].get('significant')    
-                # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
-                # matches[pfam_id]['type']='Pfam-A'
-        # return matches
-
-        if PY3K:
-            tsv = tsv.decode()
-
-        lines = tsv.split('\n')
-        keys = lines[0].split('\t')
-        root = {}
-        for i, line in enumerate(lines[1:-1]):
-            root[i] = {}
-            for j, key in enumerate(keys):
-                root[i][key] = line.split('\t')[j]
-
-        for child in root.values():
-            accession = child['Family Accession']
-            pfam_id = accession.split('.')[0]
-            matches[pfam_id]={}
-            matches[pfam_id]['accession'] = accession
-            matches[pfam_id]['class'] = 'Domain'
-            matches[pfam_id]['id'] = child['Family id']
-            matches[pfam_id]['locations'] = {}
-            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
-            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
-            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
-            matches[pfam_id]['locations']['end'] = child['Env. End']
-            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
-            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
-            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
-            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
-            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
-            #matches[pfam_id]['locations']['significant'] = child['significant']   
-            matches[pfam_id]['locations']['start'] = child['Env. Start']
-            matches[pfam_id]['type'] = 'Pfam-A'
-        return matches
+            polymers = parsePDBHeader(seq[:4], 'polymers')
+        except Exception as err:
+            raise ValueError('failed to parse header for {0} ({1})'
+                                .format(seq[:4], str(err)))
+        else:
+            chid = seq[4:].upper()
 
-    else:
-        if len(seq) <= 5:
-            idcode = None
-            from prody import parsePDBHeader
-            try:
-                polymers = parsePDBHeader(seq[:4], 'polymers')
-            except Exception as err:
-                LOGGER.warn('failed to parse header for {0} ({1})'
-                            .format(seq[:4], str(err)))
-            else:
-                chid = seq[4:].upper()
- 
-            for poly in polymers:
-                if chid and poly.chid != chid:
+        for poly in polymers:
+            if chid and poly.chid != chid:
+                continue
+            for dbref in poly.dbrefs:
+                if dbref.database != 'UniProt':
                     continue
-                for dbref in poly.dbrefs:
-                    if dbref.database != 'UniProt':
-                        continue
-                    idcode = dbref.idcode
-                    accession = dbref.accession
-                    LOGGER.info('UniProt ID code {0} for {1} chain '
-                                '{2} will be used.'
-                                .format(idcode, seq[:4], poly.chid))
-                    break
-                if idcode is not None:
-                    break
-            if idcode is None:
-                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
-                            'parsed.'.format(repr(seq)))
-                url = prefix + 'protein/' + seq + '?output=xml'
-            else:
-                url = prefix + 'protein/' + idcode + '?output=xml'
-
+                accession = dbref.accession
+                LOGGER.info('UniProt accession {0} for {1} chain '
+                            '{2} will be used.'
+                            .format(accession, seq[:4], poly.chid))
+                break
+            if accession is not None:
+                break
+        if accession is None:
+            raise ValueError('A UniProt accession for PDB {0} could not be '
+                                'parsed.'.format(repr(seq)))
         else:
-            url = prefix + 'protein/' + seq + '?output=xml'
+            url = new_prefix + "all/protein/uniprot/" + accession
+
+    else:
+        url = new_prefix + "all/protein/uniprot/" + seq
 
     LOGGER.debug('Retrieving Pfam search results: ' + url)
     xml = None
     sleep = 2
     while LOGGER.timing('_pfam') < timeout:
         try:
-            # xml = openURL(url, timeout=timeout).read()
             xml = requests.get(url, verify=False).content
         except Exception:
             pass
@@ -264,44 +156,32 @@ def searchPfam(query, **kwargs):
                     raise ValueError('No valid UniProt accession or ID for: ' + seq)
 
     try:
-        root = ET.XML(xml)
+        root = json.loads(xml)
+        #return root
     except Exception as err:
         raise ValueError('failed to parse results XML, check URL: ' + url)
 
-    if len(seq) >= MINSEQLEN:
-        try:
-            xml_matches = root[0][0][0][0]
-        except IndexError:
-            raise ValueError('failed to parse results XML, check URL: ' + url)
-    else:
-        key = '{' + old_prefix + '}'
-        results = dictElement(root[0], key)
-        try:
-            xml_matches = results['matches']
-        except KeyError:
-            raise ValueError('failed to parse results XML, check URL: ' + url)
-
     matches = dict()
-    for child in xml_matches:
-
+    for entry in root["results"]:
         try:
-            accession = child.attrib['accession'][:7]
+            metadata = entry["metadata"]
+            accession = metadata["accession"]
         except KeyError:
-            raise ValueError('failed to parse results XML, check URL: ' + url)
+            raise ValueError('failed to parse accessions from results, check URL: ' + url)
 
-        if not re.search('^P(F|B)[0-9]{5}$', accession):
-            raise ValueError('{0} does not match pfam accession'
-                             ' format'.format(accession))
+        if not re.search('PF[0-9]{5}$', accession):
+            continue
 
-        match = matches.setdefault(accession, dict(child.items()))
-        locations = match.setdefault('locations', [])
-        for loc in child:
-            locations.append(dict(loc.items()))
+        match = matches.setdefault(accession, dict(metadata.items()))
+        
+        other_data = entry["proteins"]
+        locations = match.setdefault("locations", [])
+        for item1 in other_data:
+            for key, value in item1.items():
+                if key == "entry_protein_locations":
+                    locations.append(value)    
 
-    if len(seq) < MINSEQLEN:
-        query = 'Query ' + repr(query)
-    else:
-        query = 'Query sequence'
+    query = 'Query ' + repr(query)
 
     if matches:
         LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
@@ -375,7 +255,7 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
     if not kwargs:
         #url = (prefix + 'family/' + acc + '/alignment/' +
         #       alignment + '/gzipped')
-        url = (new_prefix + acc + 
+        url = (new_prefix + "/pfam/" + acc + 
                 '/?annotation=alignment:' + alignment + '&download')
         url_flag = True
         extension = '.sth'

From fcfa6075c8a698d1d376332530c634db7602b113 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 19 Jan 2023 20:00:39 +0100
Subject: [PATCH 07/21] restrict searchPfam tests

---
 prody/tests/database/test_pfam.py | 36 +++----------------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 14b186420..f93446ccd 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -25,11 +25,7 @@ def setUpClass(self):
             os.mkdir(self.workdir)
         os.chdir(self.workdir)
 
-        self.queries = ['P19491', 'GRIA2_RAT', '6qkcB', '6qkcI', 
-                        'VQVLLTTIGAFAAFGLMTIAISTDYWLYTRGLTHSGLWRICCLEGLK'\
-                            'RGVCVKINHFAEYLLRVVRASSIFPILSAILLLLGGVCVAASR'\
-                            'VYKSKRNIILGAGILFVAAGLSNIIGVIVYISANAGKNHYSYG'\
-                            'WSFYFGGLSFILAEVIGVLAVNIYIERSR']
+        self.queries = ['P19491', '6qkcB', '6qkcI']
 
     def testUniprotAccMulti(self):
         """Test the outcome of a simple search scenario using a Uniprot Accession 
@@ -43,25 +39,12 @@ def testUniprotAccMulti(self):
         self.assertEqual(sorted(list(a.keys())), 
                            ['PF00060', 'PF01094', 'PF10613'],
                            'searchPfam failed to return the right domain family IDs')
-
-    def testUniprotIdMulti(self):
-        """Test the outcome of a simple search scenario using a Uniprot ID
-        for a multi-domain protein, AMPAR GluA2."""
-
-        a = searchPfam(self.queries[1])
-
-        self.assertIsInstance(a, dict,
-            'searchPfam failed to return a dict instance')
-        
-        self.assertEqual(sorted(list(a.keys())), 
-                           ['PF00060', 'PF01094', 'PF10613'],
-                           'searchPfam failed to return the right domain family IDs')
         
     def testPdbIdChMulti(self):
         """Test the outcome of a simple search scenario using a PDB ID
         and chain ID for the same multi-domain protein from specifying chain B."""
 
-        a = searchPfam(self.queries[2])
+        a = searchPfam(self.queries[1])
 
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')
@@ -74,20 +57,7 @@ def testPdbIdChSingle(self):
         """Test the outcome of a simple search scenario using a PDB ID
         and chain ID to get the single domain protein TARP g8 from chain I."""
 
-        a = searchPfam(self.queries[3])
-
-        self.assertIsInstance(a, dict,
-            'searchPfam failed to return a dict instance')
-        
-        self.assertEqual(sorted(list(a.keys())), 
-                           ['PF00822'],
-                           'searchPfam failed to return the right domain family IDs for TARP')
-        
-    def testSeqSingle(self):
-        """Test the outcome of a simple search scenario using the sequence 
-        of the single domain protein TARP g8 from 6qkc chain I."""
-
-        a = searchPfam(self.queries[4])
+        a = searchPfam(self.queries[2])
 
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')

From eb8fac53410f0b1ebc002f219cf5968ed474ef9b Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 19 Jan 2023 20:32:40 +0100
Subject: [PATCH 08/21] fix parsePfamPDBs test

---
 prody/tests/database/test_pfam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index f93446ccd..70cc9b0bd 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -152,7 +152,7 @@ class TestParsePfamPDBs(unittest.TestCase):
     
     @classmethod
     def setUpClass(self):
-        self.queries = ['PF20446', 'Q57ZF2_TRYB2', 'VAS1_BOVIN']
+        self.queries = ['PF20446', 'Q57ZF2', 'P40682']
 
         self.workdir = 'pfam_pdb_tests'
         if not os.path.exists(self.workdir):

From 39758fb13826d109ab8f6401e2f988ea87a206b7 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Thu, 19 Jan 2023 20:33:07 +0100
Subject: [PATCH 09/21] simplify searchPfam output for parsePfamPDBs

---
 prody/database/pfam.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 230eca0f0..e3cd83eaa 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -179,7 +179,13 @@ def searchPfam(query, **kwargs):
         for item1 in other_data:
             for key, value in item1.items():
                 if key == "entry_protein_locations":
-                    locations.append(value)    
+                    for item2 in value:
+                        new_dict = {}
+                        for item3 in value[0]["fragments"]:
+                            new_dict["start"] = item3["start"]
+                            new_dict["end"] = item3["end"]
+                            new_dict["score"] = item2["score"]
+                            locations.append(new_dict)    
 
     query = 'Query ' + repr(query)
 

From 73f5e2081f44fd41be6233404b1e3cf7ce5ded50 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 16:45:41 +0100
Subject: [PATCH 10/21] tidy up fetchPfamMSA

---
 prody/database/pfam.py | 100 +++++------------------------------------
 1 file changed, 10 insertions(+), 90 deletions(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index e3cd83eaa..4009d3fa5 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -209,21 +209,6 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
 
     :arg compressed: gzip the downloaded MSA file, default is **False**
 
-    *Alignment Options*
-
-    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
-        (default), ``'stockholm'`` or ``'fasta'``
-
-    :arg order: ordering of sequences, ``'tree'`` (default) or
-        ``'alphabetical'``
-
-    :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'``
-
-    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
-        ``'mixed'`` or **None** for unaligned
-
-    *Other Options*
-
     :arg timeout: timeout for blocking connection attempt in seconds, default
         is 60
 
@@ -233,76 +218,21 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
     
     import requests
 
-    # url = prefix + 'family/acc?id=' + acc
-    # handle = openURL(url, timeout=int(kwargs.get('timeout', 60)))
-    orig_acc = acc
-    # acc = handle.readline().strip()
-    # if PY3K:
-    #     acc = acc.decode()
-    url_flag = False
-
     if not re.search('(?<=PF)[0-9]{5}$', acc):
         raise ValueError('{0} is not a valid Pfam ID or Accession Code'
-                         .format(repr(orig_acc)))
+                         .format(repr(acc)))
 
     if alignment not in DOWNLOAD_FORMATS:
-        raise ValueError('alignment must be one of full, seed,'
-                         #' ncbi or'
-                         #' metagenomics'
-                         ' or uniprot')
-    # if alignment == 'ncbi' or alignment == 'metagenomics' or alignment == 'uniprot':
-    #     #url = (prefix + 'family/' + acc + '/alignment/' +
-    #     #       alignment + '/gzipped')
-    #     url = (new_prefix + acc + 
-    #            '/?annotation=alignment:' + alignment + '&download')
-    #     url_flag = True
-    #     extension = '.sth'
-    # else:
-    if not kwargs:
-        #url = (prefix + 'family/' + acc + '/alignment/' +
-        #       alignment + '/gzipped')
-        url = (new_prefix + "/pfam/" + acc + 
-                '/?annotation=alignment:' + alignment + '&download')
-        url_flag = True
-        extension = '.sth'
-    else:
-        raise ValueError('kwargs are not supported for Interpro Pfam')
-    #     align_format = kwargs.get('format', 'selex').lower()
-
-    #     if align_format not in FORMAT_OPTIONS['format']:
-    #         raise ValueError('alignment format must be of type selex'
-    #                             ' stockholm or fasta. MSF not supported')
-
-    #     if align_format == SELEX:
-    #         align_format, extension = 'pfam', '.slx'
-    #     elif align_format == FASTA:
-    #         extension = '.fasta'
-    #     else:
-    #         extension = '.sth'
-
-    #     gaps = str(kwargs.get('gaps', 'dashes')).lower()
-    #     if gaps not in FORMAT_OPTIONS['gaps']:
-    #         raise ValueError('gaps must be of type mixed, dots, dashes, '
-    #                             'or None')
-
-    #     inserts = kwargs.get('inserts', 'upper').lower()
-    #     if(inserts not in FORMAT_OPTIONS['inserts']):
-    #         raise ValueError('inserts must be of type lower or upper')
-
-    #     order = kwargs.get('order', 'tree').lower()
-    #     if order not in FORMAT_OPTIONS['order']:
-    #         raise ValueError('order must be of type tree or alphabetical')
-
-    #     url = (prefix + 'family/' + acc + '/alignment/'
-    #             + alignment + '/format?format=' + align_format +
-    #             '&alnType=' + alignment + '&order=' + order[0] +
-    #             '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')
+        raise ValueError('alignment must be one of full, seed, or uniprot')
+
+    url = (new_prefix + "/pfam/" + acc + 
+            '/?annotation=alignment:' + alignment + '&download')
+    extension = '.sth'
 
     LOGGER.timeit('_pfam')
     timeout = kwargs.get('timeout', 60)
     response = None
     sleep = 2
-    try_error = 3
     while LOGGER.timing('_pfam') < timeout:
         try:
             response = requests.get(url, verify=False).content
@@ -314,32 +244,22 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
         sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
         LOGGER.sleep(int(sleep), '. Trying to reconnect...')
 
-    # response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
     outname = kwargs.get('outname', None)
     if not outname:
-        outname = orig_acc
+        outname = acc
     folder = str(kwargs.get('folder', '.'))
     filepath = join(makePath(folder), outname + '_' + alignment + extension)
     if compressed:
         filepath = filepath + '.gz'
-        if url_flag:
-            f_out = open(filepath, 'wb')
-        else:
-            f_out = openFile(filepath, 'wb')
-        # f_out.write(response.read())
+        f_out = open(filepath, 'wb')
         f_out.write(response)
         f_out.close()
     else:
-        if url_flag:
-            gunzip(response, filepath)
-        else:
-            with open(filepath, 'wb') as f_out:
-                # f_out.write(response.read())
-                f_out.write(response)
+        gunzip(response, filepath)
 
     filepath = relpath(filepath)
     LOGGER.info('Pfam MSA for {0} is written as {1}.'
-                .format(orig_acc, filepath))
+                .format(acc, filepath))
 
     return filepath
 

From ee6c61db6e480d908bb0c44dc62ac2fdb7ca4392 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 17:10:03 +0100
Subject: [PATCH 11/21] encode searchPfam xml for PY2

---
 prody/database/pfam.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 4009d3fa5..33d881950 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -122,6 +122,8 @@ def searchPfam(query, **kwargs):
 
     if PY3K:
         xml = xml.decode()
+    else:
+        xml = xml.encode()
 
     if xml.find('There was a system error on your last request.') > 0:
         LOGGER.warn('No Pfam matches found for: ' + seq)
@@ -294,6 +296,10 @@ def parsePfamPDBs(query, data=[], **kwargs):
     if len(query) > 4 and query.startswith('PF'):
         pfam_acc = query
     else:
+        if not isinstance(start, Integral) and not isinstance(end, Integral):
+            raise ValueError('Please provide an integer for start or end '
+                             'when using a UniProt ID or PDB ID.')
+
         pfam_matches = searchPfam(query, **kwargs)
         keys = list(pfam_matches.keys())
 
@@ -311,17 +317,13 @@ def parsePfamPDBs(query, data=[], **kwargs):
                 start_diff = np.array(start_diff)
                 pfam_acc = keys[np.where(abs(start_diff) == min(abs(start_diff)))[0][0]]
 
-        elif isinstance(end, Integral):
+        if isinstance(end, Integral):
             end_diff = []
             for i, key in enumerate(pfam_matches):
                 end_diff.append(int(pfam_matches[key]['locations'][0]['end']) - end)
             end_diff = np.array(end_diff)
             pfam_acc = keys[np.where(abs(end_diff) == min(abs(end_diff)))[0][0]]
 
-        else:
-            raise ValueError('Please provide an integer for start or end '
-                             'when using a UniProt ID or PDB ID.')
-
     from ftplib import FTP
     from .uniprot import queryUniprot
 

From d1f4fbbd35f3bbcd204b1140e94f1a8ffc5ccbae Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 17:30:10 +0100
Subject: [PATCH 12/21] fix parsePfamPDBs for upper case pdb id

---
 prody/database/pfam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 33d881950..5afc0166c 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -393,7 +393,7 @@ def parsePfamPDBs(query, data=[], **kwargs):
                 pdbid = value['PDB']
             except:
                 continue
-            if pdbid != data_dict['PDB_ID']:
+            if pdbid.lower() != data_dict['PDB_ID'].lower():
                 continue
             pdbchains = value['chains']
 

From 38a0e5a79abbe3422dafaa5d6c4cdb28ead68ac9 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 17:34:46 +0100
Subject: [PATCH 13/21] add extra LBD domain to second place

---
 prody/tests/database/test_pfam.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 70cc9b0bd..2bff14425 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -37,7 +37,7 @@ def testUniprotAccMulti(self):
             'searchPfam failed to return a dict instance')
         
         self.assertEqual(sorted(list(a.keys())), 
-                           ['PF00060', 'PF01094', 'PF10613'],
+                           ['PF00060', 'PF00497', 'PF01094', 'PF10613'],
                            'searchPfam failed to return the right domain family IDs')
         
     def testPdbIdChMulti(self):
@@ -49,9 +49,8 @@ def testPdbIdChMulti(self):
         self.assertIsInstance(a, dict,
             'searchPfam failed to return a dict instance')
         
-        self.assertEqual(sorted(list(a.keys())), 
-                           ['PF00060', 'PF01094', 'PF10613'],
-                           'searchPfam failed to return the right domain family IDs for AMPAR')
+        self.assertEqual(sorted(list(a.keys())), ['PF00060', 'PF00497', 'PF01094', 'PF10613'],
+                         'searchPfam failed to return the right domain family IDs for AMPAR')
         
     def testPdbIdChSingle(self):
         """Test the outcome of a simple search scenario using a PDB ID

From df0f3961b60a3d66290ca527d59a7fdeb20d8992 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 17:36:24 +0100
Subject: [PATCH 14/21] remove test format and add back test folder

---
 prody/tests/database/test_pfam.py | 35 ++++++++-----------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 2bff14425..ebfee6879 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -109,37 +109,20 @@ def testSeed(self):
         
         self.assertTrue(os.path.exists(b))
 
-        
-    def testFormat(self):
+    def testFolder(self):
         """Test the outcome of fetching the domain MSA for claudins
-        with keyword argument format set to fasta. This should give a 
-        ValueError for now"""
-
-        self.assertRaises(ValueError, fetchPfamMSA, self.query, format="fasta")
-        # b = fetchPfamMSA(self.query, format="fasta")
-
-        # self.assertIsInstance(b, str,
-        #     'fetchPfamMSA failed to return a str instance')
-        
-        # self.assertEqual(b, 'PF00822_full.fasta')
-        
-        # self.assertTrue(os.path.exists(b))
+        with keyword folder set to a folder that is made especially."""
 
+        folder = "new_folder"
+        os.mkdir(folder)
+        b = fetchPfamMSA(self.query, folder=folder)
 
-    # def testFolder(self):
-    #     """Test the outcome of fetching the domain MSA for claudins
-    #     with keyword folder set to a folder that is made especially."""
-
-    #     folder = "new_folder"
-    #     os.mkdir(folder)
-    #     b = fetchPfamMSA(self.query, folder=folder)
-
-    #     self.assertIsInstance(b, str,
-    #         'fetchPfamMSA failed to return a str instance')
+        self.assertIsInstance(b, str,
+            'fetchPfamMSA failed to return a str instance')
         
-    #     self.assertEqual(b, 'new_folder/PF00822_full.slx')
+        self.assertEqual(b, 'new_folder/PF00822_full.slx')
         
-    #     self.assertTrue(os.path.exists(b))
+        self.assertTrue(os.path.exists(b))
     
     @classmethod
     def tearDownClass(self):

From 73f01df137aeee63372e1856dcf1552a5993445b Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 17:52:30 +0100
Subject: [PATCH 15/21] remove backup search of legacy

---
 prody/database/pfam.py | 39 +++++----------------------------------
 1 file changed, 5 insertions(+), 34 deletions(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 5afc0166c..ee8b0aa66 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -39,9 +39,7 @@
                   'inserts': set(['lower', 'upper']),
                   'gaps': set(['mixed', 'dots', 'dashes', 'none'])})
 
-old_prefix = 'https://pfam.xfam.org/'
-prefix = 'https://pfam-legacy.xfam.org/'
-new_prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'
+prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'
 
 def searchPfam(query, **kwargs):
     """Returns Pfam search results in a dictionary.  Matching Pfam accession
@@ -94,10 +92,10 @@ def searchPfam(query, **kwargs):
             raise ValueError('A UniProt accession for PDB {0} could not be '
                                 'parsed.'.format(repr(seq)))
         else:
-            url = new_prefix + "all/protein/uniprot/" + accession
+            url = prefix + "all/protein/uniprot/" + accession
 
     else:
-        url = new_prefix + "all/protein/uniprot/" + seq
+        url = prefix + "all/protein/uniprot/" + seq
 
     LOGGER.debug('Retrieving Pfam search results: ' + url)
     xml = None
@@ -129,37 +127,10 @@ def searchPfam(query, **kwargs):
         LOGGER.warn('No Pfam matches found for: ' + seq)
         return None
     elif xml.find('No valid UniProt accession or ID') > 0:
-        try:
-            url = prefix + 'protein/' + accession + '?output=xml'
-            LOGGER.debug('Retrieving Pfam search results: ' + url)
-            xml = openURL(url, timeout=timeout).read()
-        except:
-            raise ValueError('No valid UniProt accession or ID for: ' + seq)
-        
-        if xml.find('No valid UniProt accession or ID') > 0:
-            try:
-                ag = parsePDB(seq, subset='ca')
-                ag_seq = ag.getSequence()
-                return searchPfam(ag_seq)
-            except:
-                try:
-                    url = 'https://uniprot.org/uniprot/' + accession + '.xml'
-                    xml = openURL(url, timeout=timeout).read()
-                    if len(xml) > 0:
-                        root = ET.XML(xml)
-                        accession = root[0][0].text
-
-                        url = prefix + 'protein/' + accession + '?output=xml'
-                        LOGGER.debug('Retrieving Pfam search results: ' + url)
-                        xml = openURL(url, timeout=timeout).read()
-                    else:
-                        raise ValueError('No valid UniProt accession or ID for: ' + seq)
-                except:
-                    raise ValueError('No valid UniProt accession or ID for: ' + seq)
+        raise ValueError('No valid UniProt accession or ID for: ' + seq)
 
     try:
         root = json.loads(xml)
-        #return root
     except Exception as err:
         raise ValueError('failed to parse results XML, check URL: ' + url)
 
@@ -227,7 +198,7 @@ def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
     if alignment not in DOWNLOAD_FORMATS:
         raise ValueError('alignment must be one of full, seed, or uniprot')
 
-    url = (new_prefix + "/pfam/" + acc + 
+    url = (prefix + "/pfam/" + acc + 
             '/?annotation=alignment:' + alignment + '&download')
     extension = '.sth'
 

From 2f304c07fc23101fdda3fd271954fcd8639ca754 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:04:50 +0100
Subject: [PATCH 16/21] skip None in parsePfamPDBs

---
 prody/database/pfam.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index ee8b0aa66..b25491a6e 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -260,6 +260,8 @@ def parsePfamPDBs(query, data=[], **kwargs):
         The PFAM domain that ends closest to this will be selected. 
     :type end: int
     """
+
+    only_parse = kwargs.pop('only_parse', False)
     
     start = kwargs.pop('start', 1)
     end = kwargs.pop('end', None)
@@ -341,10 +343,16 @@ def parsePfamPDBs(query, data=[], **kwargs):
     else:
         results = ags
 
+    if only_parse:
+        return results
+
     LOGGER.progress('Extracting Pfam domains...', len(ags))
     comma_splitter = re.compile(r'\s*,\s*').split
     no_info = []
     for i, ag in enumerate(ags):
+        if ag is None:
+            continue
+
         LOGGER.update(i)
         data_dict = data_dicts[i]
         pfamRange = data_dict['UniprotResnumRange'].split('-')

From 46470eb3ddd7926853d4aa5725d8c9729c992c7a Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:16:54 +0100
Subject: [PATCH 17/21] do not return None entries from parsePfamPDBs with only_parse

---
 prody/database/pfam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index b25491a6e..8c94632ca 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -344,7 +344,7 @@ def parsePfamPDBs(query, data=[], **kwargs):
         results = ags
 
     if only_parse:
-        return results
+        return [result for result in results if result is not None]
 
     LOGGER.progress('Extracting Pfam domains...', len(ags))
     comma_splitter = re.compile(r'\s*,\s*').split

From 42d90874ec87cd24fb319c63e8d076863169b7e0 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:28:36 +0100
Subject: [PATCH 18/21] do not return None entries at the end of parsePfamPDBs

---
 prody/database/pfam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prody/database/pfam.py b/prody/database/pfam.py
index 8c94632ca..f60654eac 100644
--- a/prody/database/pfam.py
+++ b/prody/database/pfam.py
@@ -432,5 +432,5 @@ def parsePfamPDBs(query, data=[], **kwargs):
     else:
         LOGGER.warn('data should be a list in order to get output')
     
-    return results
+    return [result for result in results if result is not None]
 

From 8f6eb9983adef6132628cbf72d8142209625b7a7 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:32:19 +0100
Subject: [PATCH 19/21] remove check for number of parsed PDBs in test

---
 prody/tests/database/test_pfam.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index ebfee6879..7b3a1316a 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -226,9 +226,6 @@ def testMultiDomainStart2(self):
         self.assertIsInstance(b[0], Selection,
             'fetchPfamMSA failed to return a list of Selection instances')
         
-        self.assertEqual(len(b), 23,
-            'fetchPfamMSA failed to return a list of length 23')
-        
         self.assertEqual(b[0].getResnums()[0], 418,
             'fetchPfamMSA failed to return a first Selection with first resnum 418')  
         

From 12b311f372de81d91fdf674be1eb4c74e58211fb Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:55:10 +0100
Subject: [PATCH 20/21] fix expected file extension in fetchPfamMSA folder test

---
 prody/tests/database/test_pfam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 7b3a1316a..33b1fb4a3 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -120,7 +120,7 @@ def testFolder(self):
         self.assertIsInstance(b, str,
             'fetchPfamMSA failed to return a str instance')
         
-        self.assertEqual(b, 'new_folder/PF00822_full.slx')
+        self.assertEqual(b, 'new_folder/PF00822_full.sth')
         
         self.assertTrue(os.path.exists(b))
     

From 239ee041711bf2edbf0f3b243966f7e1451494d5 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Sun, 5 Nov 2023 18:55:34 +0100
Subject: [PATCH 21/21] fix reporting for parsePfamPDBs

---
 prody/tests/database/test_pfam.py | 34 +++++++++++++++----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py
index 33b1fb4a3..adaef5276 100644
--- a/prody/tests/database/test_pfam.py
+++ b/prody/tests/database/test_pfam.py
@@ -149,13 +149,13 @@ def testPfamIdDefault(self):
         b = parsePfamPDBs(self.queries[0])
 
         self.assertIsInstance(b, list,
-            'fetchPfamMSA failed to return a list instance')
+            'parsePfamPDBs failed to return a list instance')
 
         self.assertIsInstance(b[0], Selection,
-            'fetchPfamMSA failed to return a list of Selection instances')
+            'parsePfamPDBs failed to return a list of Selection instances')
         
         self.assertEqual(len(b), 5,
-            'fetchPfamMSA failed to return a list of length 5')
+            'parsePfamPDBs failed to return a list of length 5')
 
 
     def testUniprotDefault(self):
@@ -166,13 +166,13 @@ def testUniprotDefault(self):
         b = parsePfamPDBs(self.queries[1])
 
         self.assertIsInstance(b, list,
-            'fetchPfamMSA failed to return a list instance')
+            'parsePfamPDBs failed to return a list instance')
 
         self.assertIsInstance(b[0], Selection,
-            'fetchPfamMSA failed to return a list of Selection instances')
+            'parsePfamPDBs failed to return a list of Selection instances')
         
         self.assertEqual(len(b), 5,
-            'fetchPfamMSA failed to return a list of length 5')
+            'parsePfamPDBs failed to return a list of length 5')
 
         
     def testMultiDomainDefault(self):
@@ -183,16 +183,16 @@ def testMultiDomainDefault(self):
         b = parsePfamPDBs(self.queries[2])
 
         self.assertIsInstance(b, list,
-            'fetchPfamMSA failed to return a list instance')
+            'parsePfamPDBs failed to return a list instance')
 
         self.assertIsInstance(b[0], Selection,
-            'fetchPfamMSA failed to return a list of Selection instances')
+            'parsePfamPDBs failed to return a list of Selection instances')
         
         self.assertEqual(len(b), 7,
-            'fetchPfamMSA failed to return a list of length 7')
+            'parsePfamPDBs failed to return a list of length 7')
         
         self.assertEqual(b[0].getResnums()[0], 262,
-            'fetchPfamMSA failed to return a first Selection with first resnum 262')        
+            'parsePfamPDBs failed to return a first Selection with first resnum 262')
 
     def testMultiDomainStart1(self):
         """Test the outcome of parsing PDBs using a V-type proton ATPase subunit S1, 
@@ -202,16 +202,16 @@ def testMultiDomainStart1(self):
         b = parsePfamPDBs(self.queries[2], start=1)
 
         self.assertIsInstance(b, list,
-            'fetchPfamMSA failed to return a list instance')
+            'parsePfamPDBs failed to return a list instance')
 
         self.assertIsInstance(b[0], Selection,
-            'fetchPfamMSA failed to return a list of Selection instances')
+            'parsePfamPDBs failed to return a list of Selection instances')
         
         self.assertEqual(len(b), 7,
-            'fetchPfamMSA failed to return a list of length 7')
+            'parsePfamPDBs failed to return a list of length 7')
         
         self.assertEqual(b[0].getResnums()[0], 262,
-            'fetchPfamMSA failed to return a first Selection with first resnum 262')  
+            'parsePfamPDBs failed to return a first Selection with first resnum 262')
         
     def testMultiDomainStart2(self):
         """Test the outcome of parsing PDBs using a V-type proton ATPase subunit S1, 
@@ -221,13 +221,13 @@ def testMultiDomainStart2(self):
         b = parsePfamPDBs(self.queries[2], start=418)
 
         self.assertIsInstance(b, list,
-            'fetchPfamMSA failed to return a list instance')
+            'parsePfamPDBs failed to return a list instance')
 
         self.assertIsInstance(b[0], Selection,
-            'fetchPfamMSA failed to return a list of Selection instances')
+            'parsePfamPDBs failed to return a list of Selection instances')
         
         self.assertEqual(b[0].getResnums()[0], 418,
-            'fetchPfamMSA failed to return a first Selection with first resnum 418')  
+            'parsePfamPDBs failed to return a first Selection with first resnum 418')
         
     @classmethod
     def tearDownClass(self):