diff --git a/playa/parser.py b/playa/parser.py index 88da36f..0ff4c44 100644 --- a/playa/parser.py +++ b/playa/parser.py @@ -170,7 +170,7 @@ def nextline(self) -> Tuple[int, bytes]: return (linepos, self.data[linepos : self.pos]) def get_inline_data( - self, target: bytes = b"EI", blocksize: int = -1 + self, target: bytes = b"\nEI", blocksize: int = -1 ) -> Tuple[int, bytes]: """Get the data for an inline image up to the target end-of-stream marker. @@ -349,6 +349,7 @@ def __next__(self) -> StackEntry: if self.stack and top is None: return self.stack.pop() (pos, token) = self.nexttoken() + log.debug("token at %d: %r", pos, token) if token is KEYWORD_ARRAY_BEGIN: if top is None: top = pos @@ -429,7 +430,7 @@ def __next__(self) -> StackEntry: dic = { literal_name(k): v for (k, v) in choplist(2, objs) if v is not None } - eos = b"EI" + eos = b"\nEI" filter = dic.get("F") if filter is not None: if not isinstance(filter, list): @@ -441,16 +442,19 @@ def __next__(self) -> StackEntry: # operator shall be followed by a single white-space # character, and the next character shall be # interpreted as the first byte of image data. - if eos == b"EI": + if eos == b"\nEI": self.seek(idpos + len(KEYWORD_ID.name) + 1) (eipos, data) = self.get_inline_data(target=eos) - # FIXME: it is totally unspecified what to do with - # a newline between the end of the data and "EI", - # since there is no explicit stream length. (PDF - # 1.7 p. 756: There should be an end-of-line - # marker after the data and before endstream; this - # marker shall not be included in the stream - # length.) We will include it, which might be wrong. + log.debug("data at %d: %r", eipos, data) + # It is totally unspecified what to do with a + # newline between the end of the data and "EI", + # since there is no explicit stream length, but in + # practice, there is always a newline, since "EI" + # may occur in the stream data even when it is not + # ASCII85, so we will not include it. (PDF 1.7 + # p. 756: There *should* be an end-of-line marker + # after the data and before endstream; this marker + # shall not be included in the stream length.) data = data[: -len(eos)] else: # Note absence of + 1 here (the "Unless" above) @@ -468,7 +472,9 @@ def __next__(self) -> StackEntry: log.debug("InlineImage @ %d: %r", pos, obj) # Inline images must occur at the top level, otherwise # something is wrong (probably a corrupt file) - assert pos == top, f"Inline image {obj} not at top level of stream" + assert ( + pos == top + ), f"Inline image {obj} not at top level of stream ({pos} != {top}, {self.stack})" top = None return pos, obj else: diff --git a/tests/test_object_parser.py b/tests/test_object_parser.py index 23eea93..eda8513 100644 --- a/tests/test_object_parser.py +++ b/tests/test_object_parser.py @@ -394,7 +394,7 @@ def test_inline_images(): pos, img = next(parser) assert isinstance(img, InlineImage) assert img.attrs["Foo"] == b"bar" - assert img.rawdata == b"VARIOUS UTTER NONSENSE\n" + assert img.rawdata == b"VARIOUS UTTER NONSENSE" pos, img = next(parser) assert isinstance(img, InlineImage) assert img.buffer == b"VARIOUS UTTER NONSENSE"