Merge pull request #126 from pymzml/byteMzml

Byte mzml
pymzml · Apr 1, 2019 · 86cb9d6 · 86cb9d6
2 parents 0b0111c + 21b999a
commit 86cb9d6
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 70 deletions.
diff --git a/pymzml/file_classes/bytesMzml.py b/pymzml/file_classes/bytesMzml.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Interface for binary streams of uncompressed mzML.
+
+@author: Sylvain Le Bon
+"""
+from io import TextIOWrapper
+
+from .. import regex_patterns
+from .standardMzml import StandardMzml
+
+
+class BytesMzml(StandardMzml):
+    """
+    Wrapper exposing a binary stream of uncompressed mzML through the
+    StandardMzml interface.
+    """
+
+    def __init__(self, binary, encoding, build_index_from_scratch=False):
+        """
+        Initialize wrapper object for a binary stream of uncompressed mzML.
+
+        Arguments:
+            binary (io.BytesIO) : seekable binary stream holding the mzML data
+            encoding (str)      : encoding used to decode the stream
+
+        Keyword Arguments:
+            build_index_from_scratch (bool): if True, call
+                `_build_index_from_scratch` on the data right away
+        """
+        self.binary       = binary
+        self.file_handler = self.get_file_handler(encoding)
+        self.offset_dict = dict()
+        self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN
+        self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN
+        if build_index_from_scratch is True:
+            # The seeker is an independent copy, so closing it here leaves
+            # self.binary and self.file_handler usable.
+            with self.get_binary_file_handler() as seeker:
+                self._build_index_from_scratch(seeker)
+
+    def get_binary_file_handler(self):
+        """
+        Return an independent binary handle positioned at the start of the data.
+
+        Callers close the handle they receive (see `__init__` and the
+        `with self.get_binary_file_handler()` call sites in StandardMzml).
+        Handing out `self.binary` itself would therefore also close the
+        stream underneath `self.file_handler`, so a copy is returned instead.
+
+        Returns:
+            seeker (io.BytesIO): closable copy of the wrapped stream's content
+        """
+        from io import BytesIO
+
+        # Put the shared stream back where it was so the text wrapper built
+        # on top of it is not disturbed by taking the copy.
+        position = self.binary.tell()
+        self.binary.seek(0)
+        data = self.binary.read()
+        self.binary.seek(position)
+        return BytesIO(data)
+
+    def get_file_handler(self, encoding):
+        """
+        Wrap the binary stream in a text layer.
+
+        Arguments:
+            encoding (str): encoding used to decode the stream
+
+        Returns:
+            file_handler (io.TextIOWrapper): text view on `self.binary`
+        """
+        return TextIOWrapper(self.binary, encoding=encoding)
diff --git a/pymzml/file_classes/standardMzml.py b/pymzml/file_classes/standardMzml.py
@@ -27,19 +27,25 @@ def __init__(self, path, encoding, build_index_from_scratch=False):
             encoding (str) : encoding of the file
         """
         self.path         = path
-        self.file_handler = codecs.open(
-            path,
-            mode     = 'r',
-            encoding = encoding
-        )
+        self.file_handler = self.get_file_handler(encoding)
         self.offset_dict = dict()
         self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN
         self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN
         if build_index_from_scratch is True:
-            seeker = open(path, 'rb')
+            seeker = self.get_binary_file_handler()
             self._build_index_from_scratch(seeker)
             seeker.close()
 
+    def get_binary_file_handler(self):
+        """Open `self.path` in binary read mode and return the new handle."""
+        binary_handle = open(self.path, 'rb')
+        return binary_handle
+
+    def get_file_handler(self, encoding):
+        """
+        Open `self.path` for reading through `codecs.open`.
+
+        Arguments:
+            encoding (str): encoding passed on to `codecs.open`
+        """
+        return codecs.open(self.path, mode='r', encoding=encoding)
+
     def __getitem__(self, identifier):
         """
         Access the item with id 'identifier'.
@@ -80,7 +86,7 @@ def __getitem__(self, identifier):
         elif identifier in self.offset_dict:
 
             start = self.offset_dict[identifier]
-            with open(self.path, 'rb') as seeker:
+            with self.get_binary_file_handler() as seeker:
                 seeker.seek(start[0])
                 start, end = self._read_to_spec_end(seeker)
             self.file_handler.seek(start, 0)
@@ -122,7 +128,7 @@ def _build_index(self, from_scratch=False):
             seeking to a particular offset for the file.
         """
         # Declare the pre-seeker
-        seeker = open(self.path, 'rb')
+        seeker = self.get_binary_file_handler()
         # Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
         index_list_offset_pattern = re.compile(
             b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>'
@@ -324,7 +330,7 @@ def _interpol_search(
 
         """
         # print('target ', target_index)
-        seeker          = open(self.path, 'rb')
+        seeker          = self.get_binary_file_handler()
         seeker.seek(0, 2)
         chunk_size      = chunk_size * 512
         lower_bound     = 0
@@ -424,7 +430,7 @@ def _interpol_search(
                 except:
                     key = sorted_keys[pos]
                     spec_start_offset = self.offset_dict[key][0]
-                seeker = open(self.path, 'rb')
+                seeker = self.get_binary_file_handler()
                 seeker.seek(spec_start_offset)
                 spectrum = self._search_linear(seeker, target_index)
                 seeker.close()
@@ -533,7 +539,7 @@ def _search_linear(self, seeker, index, chunk_size=8):
                     )
 
     def _search_string_identifier(self, search_string, chunk_size=8):
-        with open(self.path, 'rb') as seeker:
+        with self.get_binary_file_handler() as seeker:
             data = None
             total_chunk_size = chunk_size * 512
             spec_start = None

diff --git a/pymzml/file_interface.py b/pymzml/file_interface.py
@@ -5,9 +5,8 @@
 
 @author: Manuel Koesters
 """
-from pymzml.file_classes import indexedGzip
-from pymzml.file_classes import standardGzip
-from pymzml.file_classes import standardMzml
+from io import BytesIO
+from pymzml.file_classes import indexedGzip, standardGzip, standardMzml, bytesMzml
 from pymzml.utils import GSGR
 
 
@@ -32,7 +31,7 @@ def close(self):
         """Close the internal file handler."""
         self.file_handler.close()
 
-    def _open(self, path):
+    def _open(self, path_or_file):
         """
         Open a file like object resp. a wrapper for a file like object.
 
@@ -46,24 +45,18 @@ def _open(self, path):
             :py:class:`~pymzml.file_classes.standardMzml.StandardMzml`,
             based on the file ending of 'path'
         """
-        if path.endswith('.gz'):
-            if self._indexed_gzip(path):
-                file_handler = indexedGzip.IndexedGzip(
-                    path,
-                    self.encoding
-                )
+        if isinstance(path_or_file, BytesIO):
+            return bytesMzml.BytesMzml(path_or_file, self.encoding, self.build_index_from_scratch)
+        if path_or_file.endswith('.gz'):
+            if self._indexed_gzip(path_or_file):
+                return indexedGzip.IndexedGzip(path_or_file, self.encoding)
             else:
-                file_handler = standardGzip.StandardGzip(
-                    path,
-                    self.encoding
-                )
-        else:
-            file_handler = standardMzml.StandardMzml(
-                path,
+                return standardGzip.StandardGzip(path_or_file, self.encoding)
+        return standardMzml.StandardMzml(
+                path_or_file,
                 self.encoding,
                 self.build_index_from_scratch,
             )
-        return file_handler
 
     def _indexed_gzip(self, path):
         """

diff --git a/pymzml/run.py b/pymzml/run.py
@@ -38,6 +38,7 @@
 import os
 import xml.etree.ElementTree as ElementTree
 from collections import defaultdict as ddict
+from io import BytesIO
 
 from . import spec
 from . import obo
@@ -72,7 +73,7 @@ class Reader(object):
 
     def __init__(
         self,
-        path,
+        path_or_file,
         MS_precisions = None,
         obo_version   = None,
         build_index_from_scratch=False,
@@ -100,21 +101,13 @@ def __init__(
 
         # File info
         self.info                = ddict()
-        self.info['file_name']   = path
-        self.info['encoding']    = self._determine_file_encoding(path)
-        self.info['file_object'] = self._open_file(self.info['file_name'])
-        # if build_index_from_scratch is True:
-        #     print(isinstance(self.info['file_object'], StandardMzml))
-        #     print(type(self.info['file_object']))
-        #     if isinstance(self.info['file_object'], StandardMzml):
-        #         self.info['offset_dict'] = \
-        #             self.info['file_object']._build_index_from_scratch()
-        #     else:
-        #         raise Exception(
-        #             'Can only build index from scratch '
-        #             'for standard mzML files.'
-        #         )
-        # else:
+        if isinstance(path_or_file, str):
+            self.info['file_name']   = path_or_file
+            self.info['encoding']    = self._determine_file_encoding(path_or_file)
+        else:
+            self.info['encoding']    = self._guess_encoding(path_or_file)
+
+        self.info['file_object'] = self._open_file(path_or_file)
         self.info['offset_dict'] = self.info['file_object'].offset_dict
         self.info['obo_version'] = obo_version
 
@@ -192,7 +185,7 @@ def file_class(self):
         """Return file object in use."""
         return type(self.info['file_object'].file_handler)
 
-    def _open_file(self, path):
+    def _open_file(self, path_or_file):
         """
         Open the path using the FileInterface class as a wrapper.
 
@@ -204,11 +197,27 @@ def _open_file(self, path):
                 mzml files
         """
         return FileInterface(
-            path,
+            path_or_file,
             self.info['encoding'],
             build_index_from_scratch=self.build_index_from_scratch
         )
 
+    def _guess_encoding(self, mzml_file):
+        """
+        Look for an encoding declaration in the next line of an open file.
+
+        One line is read from `mzml_file` and searched with
+        `regex_patterns.FILE_ENCODING_PATTERN`. The file position is not
+        reset afterwards.
+
+        Arguments:
+            mzml_file (IOBase): open mzml file object; the matched group is
+                decoded from bytes, so presumably opened in binary mode
+
+        Returns:
+            mzml_encoding (str): the matched encoding, or 'utf-8' when the
+                line does not match
+        """
+        header = mzml_file.readline()
+        declared = regex_patterns.FILE_ENCODING_PATTERN.search(header)
+        if not declared:
+            return 'utf-8'
+        return bytes.decode(declared.group('encoding'))
+
     def _determine_file_encoding(self, path):
         """
         Determine the encoding used for the file in path.
@@ -219,17 +228,15 @@ def _determine_file_encoding(self, path):
         Returns:
             mzml_encoding (str): encoding type of the file
         """
-        mzml_encoding = 'utf-8'
         if os.path.exists(path):
-            with open(path, 'rb') as sniffer:
-                header = sniffer.readline()
-                encoding_pattern = regex_patterns.FILE_ENCODING_PATTERN
-                match = encoding_pattern.search(header)
-                if match:
-                    mzml_encoding = bytes.decode(
-                        match.group('encoding')
-                    )
-        return mzml_encoding
+            print(path)
+            if path.endswith('.gz') or path.endswith('.igz'):
+                import gzip
+                _open = gzip.open
+            else:
+                _open = open
+            with _open(path, 'rb') as sniffer:
+                return self._guess_encoding(sniffer)
 
     def _init_obo_translator(self):
         """

diff --git a/tests/main_reader_test.py b/tests/main_reader_test.py
@@ -16,12 +16,12 @@ class runTest(unittest.TestCase):
     def setUp(self):
         """
         """
-        paths = test_file_paths.paths
+        self.paths = test_file_paths.paths
 
-        file_compressed_indexed     = paths[2]
-        file_compressed_unindexed   = paths[1]
-        file_uncompressed_indexed   = paths[0]
-        file_uncompressed_unindexed = paths[0]
+        file_compressed_indexed     = self.paths[2]
+        file_compressed_unindexed   = self.paths[1]
+        file_uncompressed_indexed   = self.paths[0]
+        file_uncompressed_unindexed = self.paths[0]
         self.reader_compressed_indexed     = run.Reader(file_compressed_indexed)
         self.reader_compressed_unindexed   = run.Reader(file_compressed_unindexed)
         self.reader_uncompressed_indexed   = run.Reader(file_uncompressed_indexed)
@@ -30,14 +30,14 @@ def setUp(self):
     def test_determine_file_encoding(self):
         """
         """
-        encoding = self.reader_compressed_indexed._determine_file_encoding(self.reader_compressed_indexed.info['encoding'])
-        self.assertEqual(encoding, 'utf-8')
-        encoding = self.reader_compressed_unindexed._determine_file_encoding(self.reader_compressed_unindexed.info['encoding'])
-        self.assertEqual(encoding, 'utf-8')
-        encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.reader_uncompressed_indexed.info['encoding'])
-        self.assertEqual(encoding, 'utf-8')
-        encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.reader_uncompressed_unindexed.info['encoding'])
-        self.assertEqual(encoding, 'utf-8')
+        encoding = self.reader_compressed_indexed._determine_file_encoding(self.paths[2])
+        self.assertEqual(encoding, 'ISO-8859-1')
+        encoding = self.reader_compressed_unindexed._determine_file_encoding(self.paths[1])
+        self.assertEqual(encoding, 'ISO-8859-1')
+        encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.paths[3])
+        self.assertEqual(encoding, 'ISO-8859-1')
+        encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.paths[0])
+        self.assertEqual(encoding, 'ISO-8859-1')
 
     def test_init_iter(self):
         """