More (internal) API shrinkification (#12)

* refactor!: remove no longer used things
* refactor: use dataclasses for graphic/text state
* feat: maintain pdfplumber compatibility in to_dict
* chore: cleanup tests
* refactor!: useless information hiding
* feat: safer colours as named tuples
* fix: no there is no RGBA and also just use tuples everywhere
* feat!: flatten out layout items (FIXME: actually slower)
* fix: optimize a bit but not enough!
* fix: mandate named arguments in some places and remove useless copy
* fix: remove unhelpful asserts
* refactor: refactor away LTChar and render_string_foobar
* refactor!: swallow up playa.layout entirely
* fix: cache a thing, now we are faster again
* fix: remove useless accessors
* refactor!: remove most bespoke exceptions
* refactor!: remove more LTStuff
* refactor!: remove all LTStuff except one (to be done separately)
dhdaines · Nov 5, 2024 · 0b4aa6d · 0b4aa6d
1 parent 9c0e217
commit 0b4aa6d
Show file tree

Hide file tree

Showing 19 changed files with 587 additions and 1,024 deletions.
diff --git a/playa/_saslprep.py b/playa/_saslprep.py
@@ -24,8 +24,6 @@
 import unicodedata
 from typing import Callable, Tuple
 
-from playa.exceptions import PDFValueError
-
 # RFC4013 section 2.3 prohibited output.
 _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
     # A strict reading of RFC 4013 requires table c12 here, but
@@ -83,7 +81,7 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
             # RFC3454, Section 6, #3. If a string contains any
             # RandALCat character, the first and last characters
             # MUST be RandALCat characters.
-            raise PDFValueError("SASLprep: failed bidirectional check")
+            raise ValueError("SASLprep: failed bidirectional check")
         # RFC3454, Section 6, #2. If a string contains any RandALCat
         # character, it MUST NOT contain any LCat character.
         prohibited = prohibited + (stringprep.in_table_d2,)
@@ -96,6 +94,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
     # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
     for char in data:
         if any(in_table(char) for in_table in prohibited):
-            raise PDFValueError("SASLprep: failed prohibited character check")
+            raise ValueError("SASLprep: failed prohibited character check")
 
     return data
diff --git a/playa/ccitt.py b/playa/ccitt.py
@@ -25,9 +25,6 @@
     cast,
 )
 
-from playa.exceptions import PDFException, PDFValueError
-
-
 def get_bytes(data: bytes) -> Iterator[int]:
     yield from data
 
@@ -83,6 +80,18 @@ def _parse_bit(self, x: object) -> None:
             self._state = self._accept(v)
 
 
+class CCITTException(Exception):
+    pass
+
+class EOFB(CCITTException):
+    pass
+
+class InvalidData(CCITTException):
+    pass
+
+class ByteSkip(CCITTException):
+    pass
+
 class CCITTG4Parser(BitParser):
     MODE = [None, None]
     BitParser.add(MODE, 0, "1")
@@ -332,18 +341,6 @@ class CCITTG4Parser(BitParser):
     BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
     BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
 
-    class CCITTException(PDFException):
-        pass
-
-    class EOFB(CCITTException):
-        pass
-
-    class InvalidData(CCITTException):
-        pass
-
-    class ByteSkip(CCITTException):
-        pass
-
     _color: int
 
     def __init__(self, width: int, bytealign: bool = False) -> None:
@@ -357,10 +354,10 @@ def feedbytes(self, data: bytes) -> None:
             try:
                 for m in (128, 64, 32, 16, 8, 4, 2, 1):
                     self._parse_bit(byte & m)
-            except self.ByteSkip:
+            except ByteSkip:
                 self._accept = self._parse_mode
                 self._state = self.MODE
-            except self.EOFB:
+            except EOFB:
                 break
 
     def _parse_mode(self, mode: object) -> BitParserState:
@@ -379,17 +376,17 @@ def _parse_mode(self, mode: object) -> BitParserState:
             self._accept = self._parse_uncompressed
             return self.UNCOMPRESSED
         elif mode == "e":
-            raise self.EOFB
+            raise EOFB
         elif isinstance(mode, int):
             self._do_vertical(mode)
             self._flush_line()
             return self.MODE
         else:
-            raise self.InvalidData(mode)
+            raise InvalidData(mode)
 
     def _parse_horiz1(self, n: Any) -> BitParserState:
         if n is None:
-            raise self.InvalidData
+            raise InvalidData
         self._n1 += n
         if n < 64:
             self._n2 = 0
@@ -402,7 +399,7 @@ def _parse_horiz1(self, n: Any) -> BitParserState:
 
     def _parse_horiz2(self, n: Any) -> BitParserState:
         if n is None:
-            raise self.InvalidData
+            raise InvalidData
         self._n2 += n
         if n < 64:
             self._color = 1 - self._color
@@ -417,7 +414,7 @@ def _parse_horiz2(self, n: Any) -> BitParserState:
 
     def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
         if not bits:
-            raise self.InvalidData
+            raise InvalidData
         if bits.startswith("T"):
             self._accept = self._parse_mode
             self._color = int(bits[1])
@@ -466,7 +463,7 @@ def _flush_line(self) -> None:
             self._y += 1
             self._reset_line()
             if self.bytealign:
-                raise self.ByteSkip
+                raise ByteSkip
 
     def _do_vertical(self, dx: int) -> None:
         x1 = self._curpos + 1
@@ -573,7 +570,7 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
         reversed = cast(bool, params.get("BlackIs1"))
         parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
     else:
-        raise PDFValueError(K)
+        raise ValueError(K)
     parser.feedbytes(data)
     return parser.close()
 

diff --git a/playa/cmapdb.py b/playa/cmapdb.py
@@ -32,14 +32,14 @@
 )
 
 from playa.encodingdb import name2unicode
-from playa.exceptions import PDFException, PDFTypeError, PSSyntaxError
+from playa.exceptions import PDFSyntaxError
 from playa.parser import KWD, Parser, PSKeyword, PSLiteral, literal_name
 from playa.utils import choplist, nunpack
 
 log = logging.getLogger(__name__)
 
 
-class CMapError(PDFException):
+class CMapError(Exception):
     pass
 
 
@@ -194,7 +194,7 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
         elif isinstance(code, int):
             unichr = chr(code)
         else:
-            raise PDFTypeError(code)
+            raise TypeError(code)
 
         # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
         if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
@@ -224,9 +224,6 @@ class CMapDB:
     _cmap_cache: Dict[str, PyCMap] = {}
     _umap_cache: Dict[str, List[PyUnicodeMap]] = {}
 
-    class CMapNotFound(CMapError):
-        pass
-
     @classmethod
     def _load_data(cls, name: str) -> Any:
         name = name.replace("\0", "")
@@ -244,7 +241,7 @@ def _load_data(cls, name: str) -> Any:
                     return type(str(name), (), pickle.loads(gzfile.read()))
                 finally:
                     gzfile.close()
-        raise CMapDB.CMapNotFound(name)
+        raise KeyError(f"CMap {name!r} not found in CMapDB")
 
     @classmethod
     def get_cmap(cls, name: str) -> CMapBase:
@@ -324,17 +321,17 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
             try:
                 ((_, k), (_, v)) = self.pop(2)
                 self.cmap.set_attr(literal_name(k), v)
-            except PSSyntaxError:
+            except PDFSyntaxError:
                 pass
             return
 
         if token is self.KEYWORD_USECMAP:
             try:
                 ((_, cmapname),) = self.pop(1)
                 self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
-            except PSSyntaxError:
+            except PDFSyntaxError:
                 pass
-            except CMapDB.CMapNotFound:
+            except KeyError:
                 pass
             return
 

diff --git a/playa/color.py b/playa/color.py
@@ -1,6 +1,9 @@
 import collections
-from typing import Dict
+from typing import Dict, NamedTuple, Union
 
+from playa import settings
+from playa.casting import safe_float
+from playa.exceptions import PDFInterpreterError
 from playa.parser import LIT
 
 LITERAL_DEVICE_GRAY = LIT("DeviceGray")
@@ -12,16 +15,61 @@
 LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
 
 
-class PDFColorSpace:
+class ColorGray(NamedTuple):
+    k: float
+
+
+class ColorRGB(NamedTuple):
+    r: float
+    g: float
+    b: float
+
+
+class ColorCMYK(NamedTuple):
+    c: float
+    m: float
+    y: float
+    k: float
+
+
+Color = Union[
+    ColorGray,
+    ColorRGB,
+    ColorCMYK,
+]
+
+
+class ColorSpace:
     def __init__(self, name: str, ncomponents: int) -> None:
         self.name = name
         self.ncomponents = ncomponents
 
+    def make_color(self, *components) -> Color:
+        if settings.STRICT and len(components) != self.ncomponents:
+            raise PDFInterpreterError(
+                "%s requires %d components, got %d!"
+                % (self.name, self.ncomponents, len(components))
+            )
+        cc = [safe_float(x) or 0.0 for x in components[0 : self.ncomponents]]
+        while len(cc) < self.ncomponents:
+            cc.append(0.0)
+        if self.ncomponents == 1:
+            return ColorGray(*cc)
+        elif self.ncomponents == 3:
+            return ColorRGB(*cc)
+        elif self.ncomponents == 4:
+            return ColorCMYK(*cc)
+        else:
+            raise PDFInterpreterError(
+                "unknown color space %s with %d components"
+                % (self.name, self.ncomponents)
+            )
+
     def __repr__(self) -> str:
-        return "<PDFColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
+        return "<ColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
 
 
-PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
+PREDEFINED_COLORSPACE: Dict[str, ColorSpace] = collections.OrderedDict()
 
 for name, n in [
     ("DeviceGray", 1),  # default value first
@@ -34,4 +82,4 @@ def __repr__(self) -> str:
     ("Indexed", 1),
     ("Pattern", 1),
 ]:
-    PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
+    PREDEFINED_COLORSPACE[name] = ColorSpace(name, n)