From c34aa5c626eb919cc131ff43b9e01d453b5ac728 Mon Sep 17 00:00:00 2001 From: Matthew Chang Date: Mon, 1 Jul 2024 17:35:55 -0700 Subject: [PATCH 1/3] Updates readline logic for azure to match s3 Loosely copies the readline buffer management from s3 to azure, improving performance. --- smart_open/azure.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/smart_open/azure.py b/smart_open/azure.py index 5cac221b..1c991f05 100644 --- a/smart_open/azure.py +++ b/smart_open/azure.py @@ -325,24 +325,22 @@ def readline(self, limit=-1): """Read up to and including the next newline. Returns the bytes read.""" if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') - the_line = io.BytesIO() + + # + # A single line may span multiple buffers. + # + line = io.BytesIO() while not (self._position == self._size and len(self._current_part) == 0): - # - # In the worst case, we're reading the unread part of self._current_part - # twice here, once in the if condition and once when calling index. - # - # This is sub-optimal, but better than the alternative: wrapping - # .index in a try..except, because that is slower. - # - remaining_buffer = self._current_part.peek() - if self._line_terminator in remaining_buffer: - next_newline = remaining_buffer.index(self._line_terminator) - the_line.write(self._read_from_buffer(next_newline + 1)) + line_part = self._current_part.readline(self._line_terminator) + line.write(line_part) + self._position += len(line_part) + + if line_part.endswith(self._line_terminator): break else: - the_line.write(self._read_from_buffer()) self._fill_buffer() - return the_line.getvalue() + + return line.getvalue() # # Internal methods. From 35a8c5e6ba96fc5f7be93e46a85fcbaec0e6c032 Mon Sep 17 00:00:00 2001 From: Matthew Chang Date: Wed, 21 Aug 2024 15:12:32 -0700 Subject: [PATCH 2/3] Adds unittest for readlines with azure --- smart_open/tests/test_azure.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/smart_open/tests/test_azure.py b/smart_open/tests/test_azure.py index a82dbf17..a65a9186 100644 --- a/smart_open/tests/test_azure.py +++ b/smart_open/tests/test_azure.py @@ -373,6 +373,24 @@ def test_read(self): self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest + def test_readline(self): + """Are Azure Blob Storage files read correctly with readlines?""" + content = u"hello wořld\nhow are you?".encode('utf8') + blob_name = "test_read_%s" % BLOB_NAME + put_to_container(blob_name, contents=content) + logger.debug('content: %r len: %r', content, len(content)) + + with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: + line = fin.readline() + self.assertEqual(line, 'hello wořld\n') + + fin.seek(0) + actual = list(fin) + self.assertEqual(fin.tell(), len(content)) + + expected = ['hello wořld\n', 'how are you?'] + self.assertEqual(expected, actual) + def test_read_max_concurrency(self): """Are Azure Blob Storage files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') From 93e3d8e2926eee81bf59382dd572c39006c2dc10 Mon Sep 17 00:00:00 2001 From: Matthew Chang Date: Fri, 23 Aug 2024 17:39:46 -0700 Subject: [PATCH 3/3] Revert "Adds unittest for readlines with azure" Did not need to add test, already exists further down in the file. This reverts commit 35a8c5e6ba96fc5f7be93e46a85fcbaec0e6c032. --- smart_open/tests/test_azure.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/smart_open/tests/test_azure.py b/smart_open/tests/test_azure.py index a65a9186..a82dbf17 100644 --- a/smart_open/tests/test_azure.py +++ b/smart_open/tests/test_azure.py @@ -373,24 +373,6 @@ def test_read(self): self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest - def test_readline(self): - """Are Azure Blob Storage files read correctly with readlines?""" - content = u"hello wořld\nhow are you?".encode('utf8') - blob_name = "test_read_%s" % BLOB_NAME - put_to_container(blob_name, contents=content) - logger.debug('content: %r len: %r', content, len(content)) - - with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: - line = fin.readline() - self.assertEqual(line, 'hello wořld\n') - - fin.seek(0) - actual = list(fin) - self.assertEqual(fin.tell(), len(content)) - - expected = ['hello wořld\n', 'how are you?'] - self.assertEqual(expected, actual) - def test_read_max_concurrency(self): """Are Azure Blob Storage files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8')