From 94f838554f15e4946ac5c732b4a90889a911c999 Mon Sep 17 00:00:00 2001 From: Sylvain Le Bon Date: Sun, 10 Mar 2019 15:04:43 +0100 Subject: [PATCH 1/2] feature: handle in memory binary streams --- pymzml/file_classes/bytesMzml.py | 38 ++++++++++++++++++++ pymzml/file_classes/standardMzml.py | 28 +++++++++------ pymzml/file_interface.py | 29 ++++++---------- pymzml/run.py | 54 +++++++++++++++-------------- 4 files changed, 94 insertions(+), 55 deletions(-) create mode 100644 pymzml/file_classes/bytesMzml.py diff --git a/pymzml/file_classes/bytesMzml.py b/pymzml/file_classes/bytesMzml.py new file mode 100644 index 00000000..f8c0bdd6 --- /dev/null +++ b/pymzml/file_classes/bytesMzml.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Interface for binary streams of uncompressed mzML. + +@author: Sylvain Le Bon +""" +from io import TextIOWrapper + +from .. import regex_patterns +from .standardMzml import StandardMzml + + +class BytesMzml(StandardMzml): + def __init__(self, binary, encoding, build_index_from_scratch=False): + """ + Initialize Wrapper object for binary streams of uncompressed mzML. 
+ + Arguments: + binary (io.BytesIO) : in-memory binary stream of uncompressed mzML + encoding (str) : encoding of the file + """ + self.binary = binary + self.file_handler = self.get_file_handler(encoding) + self.offset_dict = dict() + self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN + self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN + if build_index_from_scratch is True: + seeker = self.get_binary_file_handler() + self._build_index_from_scratch(seeker) + seeker.close() + + def get_binary_file_handler(self): + self.binary.seek(0) + return self.binary + + def get_file_handler(self, encoding): + return TextIOWrapper(self.binary, encoding=encoding) diff --git a/pymzml/file_classes/standardMzml.py b/pymzml/file_classes/standardMzml.py index 3e4c980d..48847e2b 100755 --- a/pymzml/file_classes/standardMzml.py +++ b/pymzml/file_classes/standardMzml.py @@ -27,19 +27,25 @@ def __init__(self, path, encoding, build_index_from_scratch=False): encoding (str) : encoding of the file """ self.path = path - self.file_handler = codecs.open( - path, - mode = 'r', - encoding = encoding - ) + self.file_handler = self.get_file_handler(encoding) self.offset_dict = dict() self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN if build_index_from_scratch is True: - seeker = open(path, 'rb') + seeker = self.get_binary_file_handler() self._build_index_from_scratch(seeker) seeker.close() + def get_binary_file_handler(self): + return open(self.path, 'rb') + + def get_file_handler(self, encoding): + return codecs.open( + self.path, + mode = 'r', + encoding = encoding + ) + def __getitem__(self, identifier): """ Access the item with id 'identifier'. 
@@ -80,7 +86,7 @@ def __getitem__(self, identifier): elif identifier in self.offset_dict: start = self.offset_dict[identifier] - with open(self.path, 'rb') as seeker: + with self.get_binary_file_handler() as seeker: seeker.seek(start[0]) start, end = self._read_to_spec_end(seeker) self.file_handler.seek(start, 0) @@ -122,7 +128,7 @@ def _build_index(self, from_scratch=False): seeking to a particular offset for the file. """ # Declare the pre-seeker - seeker = open(self.path, 'rb') + seeker = self.get_binary_file_handler() # Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos index_list_offset_pattern = re.compile( b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>' @@ -324,7 +330,7 @@ def _interpol_search( """ # print('target ', target_index) - seeker = open(self.path, 'rb') + seeker = self.get_binary_file_handler() seeker.seek(0, 2) chunk_size = chunk_size * 512 lower_bound = 0 @@ -424,7 +430,7 @@ def _interpol_search( except: key = sorted_keys[pos] spec_start_offset = self.offset_dict[key][0] - seeker = open(self.path, 'rb') + seeker = self.get_binary_file_handler() seeker.seek(spec_start_offset) spectrum = self._search_linear(seeker, target_index) seeker.close() @@ -533,7 +539,7 @@ def _search_linear(self, seeker, index, chunk_size=8): ) def _search_string_identifier(self, search_string, chunk_size=8): - with open(self.path, 'rb') as seeker: + with self.get_binary_file_handler() as seeker: data = None total_chunk_size = chunk_size * 512 spec_start = None diff --git a/pymzml/file_interface.py b/pymzml/file_interface.py index 38278a22..0b039e71 100755 --- a/pymzml/file_interface.py +++ b/pymzml/file_interface.py @@ -5,9 +5,8 @@ @author: Manuel Koesters """ -from pymzml.file_classes import indexedGzip -from pymzml.file_classes import standardGzip -from pymzml.file_classes import standardMzml +from io import BytesIO +from pymzml.file_classes import indexedGzip, standardGzip, standardMzml, bytesMzml from pymzml.utils import GSGR @@ -32,7 +31,7 @@ def close(self): """Close the internal 
file handler.""" self.file_handler.close() - def _open(self, path): + def _open(self, path_or_file): """ Open a file like object resp. a wrapper for a file like object. @@ -46,24 +45,18 @@ def _open(self, path): :py:class:`~pymzml.file_classes.standardMzml.StandardMzml`, based on the file ending of 'path' """ - if path.endswith('.gz'): - if self._indexed_gzip(path): - file_handler = indexedGzip.IndexedGzip( - path, - self.encoding - ) + if isinstance(path_or_file, BytesIO): + return bytesMzml.BytesMzml(path_or_file, self.encoding, self.build_index_from_scratch) + if path_or_file.endswith('.gz'): + if self._indexed_gzip(path_or_file): + return indexedGzip.IndexedGzip(path_or_file, self.encoding) else: - file_handler = standardGzip.StandardGzip( - path, - self.encoding - ) - else: - file_handler = standardMzml.StandardMzml( - path, + return standardGzip.StandardGzip(path_or_file, self.encoding) + return standardMzml.StandardMzml( + path_or_file, self.encoding, self.build_index_from_scratch, ) - return file_handler def _indexed_gzip(self, path): """ diff --git a/pymzml/run.py b/pymzml/run.py index 2f68b8fa..82d57022 100755 --- a/pymzml/run.py +++ b/pymzml/run.py @@ -38,6 +38,7 @@ import os import xml.etree.ElementTree as ElementTree from collections import defaultdict as ddict +from io import BytesIO from . import spec from . 
import obo @@ -72,7 +73,7 @@ class Reader(object): def __init__( self, - path, + path_or_file, MS_precisions = None, obo_version = None, build_index_from_scratch=False, @@ -98,21 +99,13 @@ def __init__( # File info self.info = ddict() - self.info['file_name'] = path - self.info['encoding'] = self._determine_file_encoding(path) - self.info['file_object'] = self._open_file(self.info['file_name']) - # if build_index_from_scratch is True: - # print(isinstance(self.info['file_object'], StandardMzml)) - # print(type(self.info['file_object'])) - # if isinstance(self.info['file_object'], StandardMzml): - # self.info['offset_dict'] = \ - # self.info['file_object']._build_index_from_scratch() - # else: - # raise Exception( - # 'Can only build index from scratch ' - # 'for standard mzML files.' - # ) - # else: + if isinstance(path_or_file, str): + self.info['file_name'] = path_or_file + self.info['encoding'] = self._determine_file_encoding(path_or_file) + else: + self.info['encoding'] = self._guess_encoding(path_or_file) + + self.info['file_object'] = self._open_file(path_or_file) self.info['offset_dict'] = self.info['file_object'].offset_dict self.info['obo_version'] = obo_version @@ -190,7 +183,7 @@ def file_class(self): """Return file object in use.""" return type(self.info['file_object'].file_handler) - def _open_file(self, path): + def _open_file(self, path_or_file): """ Open the path using the FileInterface class as a wrapper. @@ -202,11 +195,27 @@ def _open_file(self, path): mzml files """ return FileInterface( - path, + path_or_file, self.info['encoding'], build_index_from_scratch=self.build_index_from_scratch ) + def _guess_encoding(self, mzml_file): + """ + Determine the encoding used for the file. 
+ + Arguments: + mzml_file (IOBase): an mzml file + + Returns: + mzml_encoding (str): encoding type of the file + """ + match = regex_patterns.FILE_ENCODING_PATTERN.search(mzml_file.readline()) + if match: + return bytes.decode(match.group('encoding')) + else: + return 'utf-8' + def _determine_file_encoding(self, path): """ Determine the encoding used for the file in path. @@ -220,14 +229,7 @@ def _determine_file_encoding(self, path): mzml_encoding = 'utf-8' if os.path.exists(path): with open(path, 'rb') as sniffer: - header = sniffer.readline() - encoding_pattern = regex_patterns.FILE_ENCODING_PATTERN - match = encoding_pattern.search(header) - if match: - mzml_encoding = bytes.decode( - match.group('encoding') - ) - return mzml_encoding + return self._guess_encoding(sniffer) def _init_obo_translator(self): """ From 21b999abe43e1fadd97c4b3fd6c16987cb49bd53 Mon Sep 17 00:00:00 2001 From: MKoesters Date: Mon, 1 Apr 2019 13:53:15 +0200 Subject: [PATCH 2/2] fix test in byteMzml pull request --- pymzml/run.py | 9 +++++++-- tests/main_reader_test.py | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/pymzml/run.py b/pymzml/run.py index 82d57022..d67e20f3 100755 --- a/pymzml/run.py +++ b/pymzml/run.py @@ -226,9 +226,14 @@ def _determine_file_encoding(self, path): Returns: mzml_encoding (str): encoding type of the file """ - mzml_encoding = 'utf-8' if os.path.exists(path): - with open(path, 'rb') as sniffer: + print(path) + if path.endswith('.gz') or path.endswith('.igz'): + import gzip + _open = gzip.open + else: + _open = open + with _open(path, 'rb') as sniffer: return self._guess_encoding(sniffer) def _init_obo_translator(self): diff --git a/tests/main_reader_test.py b/tests/main_reader_test.py index f099b712..51c87827 100755 --- a/tests/main_reader_test.py +++ b/tests/main_reader_test.py @@ -16,12 +16,12 @@ class runTest(unittest.TestCase): def setUp(self): """ """ - paths = test_file_paths.paths + self.paths = 
test_file_paths.paths - file_compressed_indexed = paths[2] - file_compressed_unindexed = paths[1] - file_uncompressed_indexed = paths[0] - file_uncompressed_unindexed = paths[0] + file_compressed_indexed = self.paths[2] + file_compressed_unindexed = self.paths[1] + file_uncompressed_indexed = self.paths[0] + file_uncompressed_unindexed = self.paths[0] self.reader_compressed_indexed = run.Reader(file_compressed_indexed) self.reader_compressed_unindexed = run.Reader(file_compressed_unindexed) self.reader_uncompressed_indexed = run.Reader(file_uncompressed_indexed) @@ -30,14 +30,14 @@ def setUp(self): def test_determine_file_encoding(self): """ """ - encoding = self.reader_compressed_indexed._determine_file_encoding(self.reader_compressed_indexed.info['encoding']) - self.assertEqual(encoding, 'utf-8') - encoding = self.reader_compressed_unindexed._determine_file_encoding(self.reader_compressed_unindexed.info['encoding']) - self.assertEqual(encoding, 'utf-8') - encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.reader_uncompressed_indexed.info['encoding']) - self.assertEqual(encoding, 'utf-8') - encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.reader_uncompressed_unindexed.info['encoding']) - self.assertEqual(encoding, 'utf-8') + encoding = self.reader_compressed_indexed._determine_file_encoding(self.paths[2]) + self.assertEqual(encoding, 'ISO-8859-1') + encoding = self.reader_compressed_unindexed._determine_file_encoding(self.paths[1]) + self.assertEqual(encoding, 'ISO-8859-1') + encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.paths[3]) + self.assertEqual(encoding, 'ISO-8859-1') + encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.paths[0]) + self.assertEqual(encoding, 'ISO-8859-1') def test_init_iter(self): """