Merge remote-tracking branch 'origin/main'

dhdaines · Sep 19, 2024 · e137f20 · e137f20
2 parents e314efc + 025fa8a
commit e137f20
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 23 deletions.
diff --git a/playa/pdfparser.py b/playa/pdfparser.py
@@ -80,16 +80,17 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 except KeyError:
                     if settings.STRICT:
                         raise PDFSyntaxError("/Length is undefined: %r" % dic)
+            # back up and read the entire line including 'stream' as
+            # the data starts after the trailing newline
             self.seek(pos)
             try:
-                (_, line) = self.nextline()  # 'stream'
+                (_, line) = self.nextline()  # 'stream\n'
             except PSEOF:
                 if settings.STRICT:
                     raise PDFSyntaxError("Unexpected EOF")
                 return
-            pos += len(line)
-            data = bytearray(self.read(pos, objlen))
-            self.seek(pos + objlen)
+            pos = self.tell()
+            data = self.read(objlen)
             while True:
                 try:
                     (linepos, line) = self.nextline()

diff --git a/playa/psparser.py b/playa/psparser.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import io
 import logging
+import mmap
 import re
 from binascii import unhexlify
 from collections import deque
@@ -188,10 +189,9 @@ def tell(self) -> int:
         """Get the current position in the file."""
         return self.fp.tell()
 
-    def read(self, pos: int, objlen: int) -> bytes:
+    def read(self, objlen: int) -> bytes:
         """Read data from a specified position, moving the current
         position to the end of this data."""
-        self.fp.seek(pos)
         return self.fp.read(objlen)
 
     def nextline(self) -> Tuple[int, bytes]:
@@ -610,6 +610,7 @@ def _parse_hexstring(self) -> bytes:
     | (?P<escape> \\.)
     | (?P<parenleft> \()
     | (?P<parenright> \))
+    | (?P<newline> \r\n?|\n)
     | (?P<other> .)
 )""",
     re.VERBOSE,
@@ -624,7 +625,7 @@ class PSInMemoryParser:
     Parser for in-memory data streams.
     """
 
-    def __init__(self, data: bytes) -> None:
+    def __init__(self, data: Union[bytes, mmap.mmap]) -> None:
         self.data = data
         self.pos = 0
         self.end = len(data)
@@ -646,9 +647,10 @@ def tell(self) -> int:
         """Get the current position in the buffer."""
         return self.pos
 
-    def read(self, pos: int, objlen: int) -> bytes:
-        """Read data from a specified position, moving the current
-        position to the end of this data."""
+    def read(self, objlen: int) -> bytes:
+        """Read data from current position, advancing to the end of
+        this data."""
+        pos = self.pos
         self.pos = min(pos + objlen, len(self.data))
         return self.data[pos : self.pos]
 
@@ -767,7 +769,8 @@ def __next__(self) -> Tuple[int, PSBaseParserToken]:
 
     def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]:
         """Parse the remainder of a string."""
-        parts = [start]
+        # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
+        parts = [EOLR.sub(b"\n", start)]
         paren = 1
         for m in STRLEXER.finditer(self.data, pos):
             self.pos = m.end()
@@ -795,14 +798,17 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]
                     log.warning("Invalid octal %r (%d)", m[0][1:], chrcode)
                 else:
                     parts.append(bytes((chrcode,)))
+            elif m.lastgroup == "newline":  # type: ignore
+                # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
+                parts.append(b"\n")
             elif m.lastgroup == "linebreak":  # type: ignore
                 pass
             else:
                 parts.append(m[0])
         if paren != 0:
             log.warning("Unterminated string at %d", pos)
             raise StopIteration
-        return (self._curtokenpos, b"".join(EOLR.sub(b"\n", part) for part in parts))
+        return (self._curtokenpos, b"".join(parts))
 
 
 # Stack slots may be occupied by any of:
@@ -830,7 +836,14 @@ def reinit(self, reader: Union[BinaryIO, bytes]) -> None:
                 reader
             )
         else:
-            self._parser = PSFileParser(reader)
+            try:
+                self._mmap = mmap.mmap(reader.fileno(), 0, access=mmap.ACCESS_READ)
+                self._parser = PSInMemoryParser(self._mmap)
+            except io.UnsupportedOperation:
+                log.warning(
+                    "mmap not supported on %r, falling back to file parser", reader
+                )
+                self._parser = PSFileParser(reader)
         self.reset()
 
     def reset(self) -> None:
@@ -845,6 +858,10 @@ def seek(self, pos: int) -> None:
         self._parser.seek(pos)
         self.reset()
 
+    def tell(self) -> int:
+        """Get the current position in the file."""
+        return self._parser.tell()
+
     def push(self, *objs: PSStackEntry[ExtraT]) -> None:
         """Push some objects onto the stack."""
         self.curstack.extend(objs)
@@ -985,10 +1002,10 @@ def revreadlines(self) -> Iterator[bytes]:
         """
         return self._parser.revreadlines()
 
-    def read(self, pos: int, objlen: int) -> bytes:
+    def read(self, objlen: int) -> bytes:
         """Read data from a specified position, moving the current
         position to the end of this data."""
-        return self._parser.read(pos, objlen)
+        return self._parser.read(objlen)
 
     def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
         """Get the next token in iteration, raising PSEOF when done."""

diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py
@@ -283,6 +283,26 @@ def bench_bytes():
     )
 
 
+def bench_mmap():
+    import mmap
+
+    from playa.psparser import PSInMemoryParser
+
+    with tempfile.NamedTemporaryFile() as tf:
+        runs = 100
+        with open(tf.name, "wb") as outfh:
+            outfh.write(DATA * runs)
+        with open(tf.name, "rb") as infh:
+            start = time.time()
+            mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ)
+            parser = PSInMemoryParser(mapping)
+            _ = list(parser)
+            print(
+                "PLAYA Parser (mmap): %fms / run"
+                % ((time.time() - start) / runs * 1000),
+            )
+
+
 def bench_bytesio():
     from pdfminer.psparser import PSEOF, PSBaseParser
 
@@ -305,7 +325,7 @@ def bench_playa():
     from playa.pdfdocument import PDFDocument
     from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager
     from playa.pdfpage import PDFPage
-    from playa.psparser import PSFileParser, PSInMemoryParser
+    from playa.psparser import PSFileParser
 
     runs = 100
     start = time.time()
@@ -314,12 +334,6 @@ def bench_playa():
     print(
         "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000),
     )
-    start = time.time()
-    parser = PSInMemoryParser(DATA * runs)
-    _ = list(parser)
-    print(
-        "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000),
-    )
     with tempfile.NamedTemporaryFile() as tf:
         runs = 100
         with open(tf.name, "wb") as outfh:
@@ -332,6 +346,8 @@ def bench_playa():
                 "PLAYA Parser (BinaryIO): %fms / run"
                 % ((time.time() - start) / runs * 1000),
             )
+    bench_bytes()
+    bench_mmap()
 
     runs = 20
     start = time.time()
@@ -405,3 +421,9 @@ def bench_pdfminer():
         bench_pdfminer()
     if len(sys.argv) < 2 or sys.argv[1] == "playa":
         bench_playa()
+    if len(sys.argv) > 1 and sys.argv[1] == "bytes":
+        bench_bytes()
+    if len(sys.argv) > 1 and sys.argv[1] == "bytesio":
+        bench_bytesio()
+    if len(sys.argv) > 1 and sys.argv[1] == "mmap":
+        bench_mmap()
diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py
@@ -330,8 +330,10 @@ def test_new_parser_strings() -> None:
     )
     list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")])
     list_parsers(b"(foo\r)", [(0, b"foo\n")])
-    list_parsers(b"(foo\r\nbaz)", [(0, b"foo\nbaz")])
+    list_parsers(b"(foo\r\nbar\r\nbaz)", [(0, b"foo\nbar\nbaz")])
     list_parsers(b"(foo\n)", [(0, b"foo\n")])
+    list_parsers(br"(foo\r\nbaz)", [(0, b"foo\r\nbaz")])
+    list_parsers(br"(foo\r\nbar\r\nbaz)", [(0, b"foo\r\nbar\r\nbaz")])
     list_parsers(
         rb"( This string contains \245two octal characters\307 . )",
         [(0, b" This string contains \245two octal characters\307 . ")],