Skip to content

Commit

Permalink
Merge pull request #126 from pymzml/byteMzml
Browse files Browse the repository at this point in the history
Byte mzml
  • Loading branch information
MKoesters authored Apr 1, 2019
2 parents 0b0111c + 21b999a commit 86cb9d6
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 70 deletions.
38 changes: 38 additions & 0 deletions pymzml/file_classes/bytesMzml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Interface for binary streams of uncompressed mzML.
@author: Sylvain Le Bon
"""
from io import BytesIO, TextIOWrapper

from .. import regex_patterns
from .standardMzml import StandardMzml


class BytesMzml(StandardMzml):
    """
    File wrapper for uncompressed mzML held in a binary stream.

    The text handler wraps the supplied stream, while every binary seeker is
    an independent copy, so closing a seeker never closes the supplied stream.
    """

    def __init__(self, binary, encoding, build_index_from_scratch=False):
        """
        Initialize the wrapper object for a binary mzML stream.

        Arguments:
            binary (io.BytesIO) : seekable binary stream holding the mzML data
            encoding (str) : encoding of the mzML data

        Keyword Arguments:
            build_index_from_scratch (bool) : if True, run
                ``_build_index_from_scratch`` on a fresh binary seeker
        """
        self.binary = binary
        self.file_handler = self.get_file_handler(encoding)
        self.offset_dict = dict()
        self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN
        self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN
        if build_index_from_scratch is True:
            # The seeker is a private copy; closing it leaves self.binary open.
            with self.get_binary_file_handler() as seeker:
                self._build_index_from_scratch(seeker)

    def get_binary_file_handler(self):
        """
        Return an independent binary handle positioned at the start of the data.

        Callers close the handle they receive (explicitly or through a
        ``with`` block). Returning ``self.binary`` itself would therefore
        close the stream wrapped by ``self.file_handler``, so a copy of the
        data is returned instead. The whole stream is copied on every call.

        Returns:
            seeker (io.BytesIO): fresh stream over a copy of the data
        """
        # Read the full content without disturbing the position that the
        # text handler relies on.
        position = self.binary.tell()
        self.binary.seek(0)
        try:
            data = self.binary.read()
        finally:
            self.binary.seek(position)
        return BytesIO(data)

    def get_file_handler(self, encoding):
        """
        Return a text wrapper around the binary stream.

        Arguments:
            encoding (str) : encoding used to decode the stream

        Returns:
            file_handler (io.TextIOWrapper): text view of ``self.binary``
        """
        return TextIOWrapper(self.binary, encoding=encoding)
28 changes: 17 additions & 11 deletions pymzml/file_classes/standardMzml.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,25 @@ def __init__(self, path, encoding, build_index_from_scratch=False):
encoding (str) : encoding of the file
"""
self.path = path
self.file_handler = codecs.open(
path,
mode = 'r',
encoding = encoding
)
self.file_handler = self.get_file_handler(encoding)
self.offset_dict = dict()
self.spec_open = regex_patterns.SPECTRUM_OPEN_PATTERN
self.spec_close = regex_patterns.SPECTRUM_CLOSE_PATTERN
if build_index_from_scratch is True:
seeker = open(path, 'rb')
seeker = self.get_binary_file_handler()
self._build_index_from_scratch(seeker)
seeker.close()

def get_binary_file_handler(self):
    """
    Open the file at ``self.path`` for binary reading.

    Returns:
        seeker (file object): newly opened handle in ``'rb'`` mode
    """
    binary_handle = open(self.path, 'rb')
    return binary_handle

def get_file_handler(self, encoding):
    """
    Open the file at ``self.path`` for reading text with the given encoding.

    Arguments:
        encoding (str) : encoding passed on to ``codecs.open``

    Returns:
        file_handler: handle returned by ``codecs.open`` in ``'r'`` mode
    """
    open_options = {'mode': 'r', 'encoding': encoding}
    return codecs.open(self.path, **open_options)

def __getitem__(self, identifier):
"""
Access the item with id 'identifier'.
Expand Down Expand Up @@ -80,7 +86,7 @@ def __getitem__(self, identifier):
elif identifier in self.offset_dict:

start = self.offset_dict[identifier]
with open(self.path, 'rb') as seeker:
with self.get_binary_file_handler() as seeker:
seeker.seek(start[0])
start, end = self._read_to_spec_end(seeker)
self.file_handler.seek(start, 0)
Expand Down Expand Up @@ -122,7 +128,7 @@ def _build_index(self, from_scratch=False):
seeking to a particular offset for the file.
"""
# Declare the pre-seeker
seeker = open(self.path, 'rb')
seeker = self.get_binary_file_handler()
# Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
index_list_offset_pattern = re.compile(
b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>'
Expand Down Expand Up @@ -324,7 +330,7 @@ def _interpol_search(
"""
# print('target ', target_index)
seeker = open(self.path, 'rb')
seeker = self.get_binary_file_handler()
seeker.seek(0, 2)
chunk_size = chunk_size * 512
lower_bound = 0
Expand Down Expand Up @@ -424,7 +430,7 @@ def _interpol_search(
except:
key = sorted_keys[pos]
spec_start_offset = self.offset_dict[key][0]
seeker = open(self.path, 'rb')
seeker = self.get_binary_file_handler()
seeker.seek(spec_start_offset)
spectrum = self._search_linear(seeker, target_index)
seeker.close()
Expand Down Expand Up @@ -533,7 +539,7 @@ def _search_linear(self, seeker, index, chunk_size=8):
)

def _search_string_identifier(self, search_string, chunk_size=8):
with open(self.path, 'rb') as seeker:
with self.get_binary_file_handler() as seeker:
data = None
total_chunk_size = chunk_size * 512
spec_start = None
Expand Down
29 changes: 11 additions & 18 deletions pymzml/file_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
@author: Manuel Koesters
"""
from pymzml.file_classes import indexedGzip
from pymzml.file_classes import standardGzip
from pymzml.file_classes import standardMzml
from io import BytesIO
from pymzml.file_classes import indexedGzip, standardGzip, standardMzml, bytesMzml
from pymzml.utils import GSGR


Expand All @@ -32,7 +31,7 @@ def close(self):
"""Close the internal file handler."""
self.file_handler.close()

def _open(self, path):
def _open(self, path_or_file):
"""
Open a file-like object or a wrapper for a file-like object.
Expand All @@ -46,24 +45,18 @@ def _open(self, path):
:py:class:`~pymzml.file_classes.standardMzml.StandardMzml`,
based on the file ending of 'path'
"""
if path.endswith('.gz'):
if self._indexed_gzip(path):
file_handler = indexedGzip.IndexedGzip(
path,
self.encoding
)
if isinstance(path_or_file, BytesIO):
return bytesMzml.BytesMzml(path_or_file, self.encoding, self.build_index_from_scratch)
if path_or_file.endswith('.gz'):
if self._indexed_gzip(path_or_file):
return indexedGzip.IndexedGzip(path_or_file, self.encoding)
else:
file_handler = standardGzip.StandardGzip(
path,
self.encoding
)
else:
file_handler = standardMzml.StandardMzml(
path,
return standardGzip.StandardGzip(path_or_file, self.encoding)
return standardMzml.StandardMzml(
path_or_file,
self.encoding,
self.build_index_from_scratch,
)
return file_handler

def _indexed_gzip(self, path):
"""
Expand Down
63 changes: 35 additions & 28 deletions pymzml/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import os
import xml.etree.ElementTree as ElementTree
from collections import defaultdict as ddict
from io import BytesIO

from . import spec
from . import obo
Expand Down Expand Up @@ -72,7 +73,7 @@ class Reader(object):

def __init__(
self,
path,
path_or_file,
MS_precisions = None,
obo_version = None,
build_index_from_scratch=False,
Expand Down Expand Up @@ -100,21 +101,13 @@ def __init__(

# File info
self.info = ddict()
self.info['file_name'] = path
self.info['encoding'] = self._determine_file_encoding(path)
self.info['file_object'] = self._open_file(self.info['file_name'])
# if build_index_from_scratch is True:
# print(isinstance(self.info['file_object'], StandardMzml))
# print(type(self.info['file_object']))
# if isinstance(self.info['file_object'], StandardMzml):
# self.info['offset_dict'] = \
# self.info['file_object']._build_index_from_scratch()
# else:
# raise Exception(
# 'Can only build index from scratch '
# 'for standard mzML files.'
# )
# else:
if isinstance(path_or_file, str):
self.info['file_name'] = path_or_file
self.info['encoding'] = self._determine_file_encoding(path_or_file)
else:
self.info['encoding'] = self._guess_encoding(path_or_file)

self.info['file_object'] = self._open_file(path_or_file)
self.info['offset_dict'] = self.info['file_object'].offset_dict
self.info['obo_version'] = obo_version

Expand Down Expand Up @@ -192,7 +185,7 @@ def file_class(self):
"""Return file object in use."""
return type(self.info['file_object'].file_handler)

def _open_file(self, path):
def _open_file(self, path_or_file):
"""
Open the path using the FileInterface class as a wrapper.
Expand All @@ -204,11 +197,27 @@ def _open_file(self, path):
mzml files
"""
return FileInterface(
path,
path_or_file,
self.info['encoding'],
build_index_from_scratch=self.build_index_from_scratch
)

def _guess_encoding(self, mzml_file):
    """
    Determine the encoding declared in the first line of the file.

    The stream position is restored afterwards when the stream is seekable,
    so sniffing the header does not consume data from the caller's stream.

    Arguments:
        mzml_file (IOBase): an mzml file opened in binary mode

    Returns:
        mzml_encoding (str): encoding type of the file, 'utf-8' if the
            first line declares none
    """
    seekable = getattr(mzml_file, 'seekable', None)
    can_rewind = callable(seekable) and seekable()
    start = mzml_file.tell() if can_rewind else None
    try:
        header = mzml_file.readline()
    finally:
        # Leave the stream where it was found; a text wrapper may be
        # created on top of this very stream afterwards.
        if can_rewind:
            mzml_file.seek(start)
    match = regex_patterns.FILE_ENCODING_PATTERN.search(header)
    if match:
        return bytes.decode(match.group('encoding'))
    return 'utf-8'

def _determine_file_encoding(self, path):
"""
Determine the encoding used for the file in path.
Expand All @@ -219,17 +228,15 @@ def _determine_file_encoding(self, path):
Returns:
mzml_encoding (str): encoding type of the file
"""
mzml_encoding = 'utf-8'
if os.path.exists(path):
with open(path, 'rb') as sniffer:
header = sniffer.readline()
encoding_pattern = regex_patterns.FILE_ENCODING_PATTERN
match = encoding_pattern.search(header)
if match:
mzml_encoding = bytes.decode(
match.group('encoding')
)
return mzml_encoding
print(path)
if path.endswith('.gz') or path.endswith('.igz'):
import gzip
_open = gzip.open
else:
_open = open
with _open(path, 'rb') as sniffer:
return self._guess_encoding(sniffer)

def _init_obo_translator(self):
"""
Expand Down
26 changes: 13 additions & 13 deletions tests/main_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ class runTest(unittest.TestCase):
def setUp(self):
"""
"""
paths = test_file_paths.paths
self.paths = test_file_paths.paths

file_compressed_indexed = paths[2]
file_compressed_unindexed = paths[1]
file_uncompressed_indexed = paths[0]
file_uncompressed_unindexed = paths[0]
file_compressed_indexed = self.paths[2]
file_compressed_unindexed = self.paths[1]
file_uncompressed_indexed = self.paths[0]
file_uncompressed_unindexed = self.paths[0]
self.reader_compressed_indexed = run.Reader(file_compressed_indexed)
self.reader_compressed_unindexed = run.Reader(file_compressed_unindexed)
self.reader_uncompressed_indexed = run.Reader(file_uncompressed_indexed)
Expand All @@ -30,14 +30,14 @@ def setUp(self):
def test_determine_file_encoding(self):
"""
"""
encoding = self.reader_compressed_indexed._determine_file_encoding(self.reader_compressed_indexed.info['encoding'])
self.assertEqual(encoding, 'utf-8')
encoding = self.reader_compressed_unindexed._determine_file_encoding(self.reader_compressed_unindexed.info['encoding'])
self.assertEqual(encoding, 'utf-8')
encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.reader_uncompressed_indexed.info['encoding'])
self.assertEqual(encoding, 'utf-8')
encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.reader_uncompressed_unindexed.info['encoding'])
self.assertEqual(encoding, 'utf-8')
encoding = self.reader_compressed_indexed._determine_file_encoding(self.paths[2])
self.assertEqual(encoding, 'ISO-8859-1')
encoding = self.reader_compressed_unindexed._determine_file_encoding(self.paths[1])
self.assertEqual(encoding, 'ISO-8859-1')
encoding = self.reader_uncompressed_indexed._determine_file_encoding(self.paths[3])
self.assertEqual(encoding, 'ISO-8859-1')
encoding = self.reader_uncompressed_unindexed._determine_file_encoding(self.paths[0])
self.assertEqual(encoding, 'ISO-8859-1')

def test_init_iter(self):
"""
Expand Down

0 comments on commit 86cb9d6

Please sign in to comment.