Skip to content

Commit

Permalink
More (internal) API shrinkification (#12)
Browse files Browse the repository at this point in the history
* refactor!: remove no longer used things

* refactor: use dataclasses for graphic/text state

* feat: maintain pdfplumber compatibility in to_dict

* chore: cleanup tests

* refactor!: useless information hiding

* feat: safer colours as named tuples

* fix: no, there is no RGBA; also just use tuples everywhere

* feat!: flatten out layout items (FIXME: actually slower)

* fix: optimize a bit but not enough!

* fix: mandate named arguments in some places and remove useless copy

* fix: remove unhelpful asserts

* refactor: refactor away LTChar and render_string_foobar

* refactor!: swallow up playa.layout entirely

* fix: cache a thing, now we are faster again

* fix: remove useless accessors

* refactor!: remove most bespoke exceptions

* refactor!: remove more LTStuff

* refactor!: remove all LTStuff except one (to be done separately)
  • Loading branch information
dhdaines authored Nov 5, 2024
1 parent 9c0e217 commit 0b4aa6d
Show file tree
Hide file tree
Showing 19 changed files with 587 additions and 1,024 deletions.
6 changes: 2 additions & 4 deletions playa/_saslprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import unicodedata
from typing import Callable, Tuple

from playa.exceptions import PDFValueError

# RFC4013 section 2.3 prohibited output.
_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
# A strict reading of RFC 4013 requires table c12 here, but
Expand Down Expand Up @@ -83,7 +81,7 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454, Section 6, #3. If a string contains any
# RandALCat character, the first and last characters
# MUST be RandALCat characters.
raise PDFValueError("SASLprep: failed bidirectional check")
raise ValueError("SASLprep: failed bidirectional check")
# RFC3454, Section 6, #2. If a string contains any RandALCat
# character, it MUST NOT contain any LCat character.
prohibited = prohibited + (stringprep.in_table_d2,)
Expand All @@ -96,6 +94,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
for char in data:
if any(in_table(char) for in_table in prohibited):
raise PDFValueError("SASLprep: failed prohibited character check")
raise ValueError("SASLprep: failed prohibited character check")

return data
45 changes: 21 additions & 24 deletions playa/ccitt.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@
cast,
)

from playa.exceptions import PDFException, PDFValueError


def get_bytes(data: bytes) -> Iterator[int]:
    """Yield each byte of `data` in order, as an integer."""
    for value in data:
        yield value

Expand Down Expand Up @@ -83,6 +80,18 @@ def _parse_bit(self, x: object) -> None:
self._state = self._accept(v)


class CCITTException(Exception):
    """Common base class for the exceptions used by the CCITT parser."""


class EOFB(CCITTException):
    """Raised when the mode parser reads the "e" code; ends `feedbytes`."""


class InvalidData(CCITTException):
    """Raised when a parser state receives a mode or value it cannot accept."""


class ByteSkip(CCITTException):
    """Raised after flushing a line when byte alignment is enabled.

    `feedbytes` catches it and returns the parser to mode parsing.
    """

class CCITTG4Parser(BitParser):
MODE = [None, None]
BitParser.add(MODE, 0, "1")
Expand Down Expand Up @@ -332,18 +341,6 @@ class CCITTG4Parser(BitParser):
BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
BitParser.add(UNCOMPRESSED, "T10000", "00000000010")

class CCITTException(PDFException):
pass

class EOFB(CCITTException):
pass

class InvalidData(CCITTException):
pass

class ByteSkip(CCITTException):
pass

_color: int

def __init__(self, width: int, bytealign: bool = False) -> None:
Expand All @@ -357,10 +354,10 @@ def feedbytes(self, data: bytes) -> None:
try:
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(byte & m)
except self.ByteSkip:
except ByteSkip:
self._accept = self._parse_mode
self._state = self.MODE
except self.EOFB:
except EOFB:
break

def _parse_mode(self, mode: object) -> BitParserState:
Expand All @@ -379,17 +376,17 @@ def _parse_mode(self, mode: object) -> BitParserState:
self._accept = self._parse_uncompressed
return self.UNCOMPRESSED
elif mode == "e":
raise self.EOFB
raise EOFB
elif isinstance(mode, int):
self._do_vertical(mode)
self._flush_line()
return self.MODE
else:
raise self.InvalidData(mode)
raise InvalidData(mode)

def _parse_horiz1(self, n: Any) -> BitParserState:
if n is None:
raise self.InvalidData
raise InvalidData
self._n1 += n
if n < 64:
self._n2 = 0
Expand All @@ -402,7 +399,7 @@ def _parse_horiz1(self, n: Any) -> BitParserState:

def _parse_horiz2(self, n: Any) -> BitParserState:
if n is None:
raise self.InvalidData
raise InvalidData
self._n2 += n
if n < 64:
self._color = 1 - self._color
Expand All @@ -417,7 +414,7 @@ def _parse_horiz2(self, n: Any) -> BitParserState:

def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
if not bits:
raise self.InvalidData
raise InvalidData
if bits.startswith("T"):
self._accept = self._parse_mode
self._color = int(bits[1])
Expand Down Expand Up @@ -466,7 +463,7 @@ def _flush_line(self) -> None:
self._y += 1
self._reset_line()
if self.bytealign:
raise self.ByteSkip
raise ByteSkip

def _do_vertical(self, dx: int) -> None:
x1 = self._curpos + 1
Expand Down Expand Up @@ -573,7 +570,7 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
reversed = cast(bool, params.get("BlackIs1"))
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
else:
raise PDFValueError(K)
raise ValueError(K)
parser.feedbytes(data)
return parser.close()

Expand Down
17 changes: 7 additions & 10 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@
)

from playa.encodingdb import name2unicode
from playa.exceptions import PDFException, PDFTypeError, PSSyntaxError
from playa.exceptions import PDFSyntaxError
from playa.parser import KWD, Parser, PSKeyword, PSLiteral, literal_name
from playa.utils import choplist, nunpack

log = logging.getLogger(__name__)


class CMapError(Exception):
    """CMap-specific error type.

    Derives directly from the built-in `Exception`; the former
    `PDFException` base is no longer imported by this module.
    """


Expand Down Expand Up @@ -194,7 +194,7 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
elif isinstance(code, int):
unichr = chr(code)
else:
raise PDFTypeError(code)
raise TypeError(code)

# A0 = non-breaking space, some weird fonts can have a collision on a cid here.
if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
Expand Down Expand Up @@ -224,9 +224,6 @@ class CMapDB:
_cmap_cache: Dict[str, PyCMap] = {}
_umap_cache: Dict[str, List[PyUnicodeMap]] = {}

class CMapNotFound(CMapError):
pass

@classmethod
def _load_data(cls, name: str) -> Any:
name = name.replace("\0", "")
Expand All @@ -244,7 +241,7 @@ def _load_data(cls, name: str) -> Any:
return type(str(name), (), pickle.loads(gzfile.read()))
finally:
gzfile.close()
raise CMapDB.CMapNotFound(name)
raise KeyError(f"CMap {name!r} not found in CMapDB")

@classmethod
def get_cmap(cls, name: str) -> CMapBase:
Expand Down Expand Up @@ -324,17 +321,17 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
try:
((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError:
except PDFSyntaxError:
pass
return

if token is self.KEYWORD_USECMAP:
try:
((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
except PDFSyntaxError:
pass
except CMapDB.CMapNotFound:
except KeyError:
pass
return

Expand Down
58 changes: 53 additions & 5 deletions playa/color.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import collections
from typing import Dict
from typing import Dict, NamedTuple, Union

from playa import settings
from playa.casting import safe_float
from playa.exceptions import PDFInterpreterError
from playa.parser import LIT

LITERAL_DEVICE_GRAY = LIT("DeviceGray")
Expand All @@ -12,16 +15,61 @@
LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")


class PDFColorSpace:
class ColorGray(NamedTuple):
    """Color value with a single component."""

    k: float  # the only component of a gray color


class ColorRGB(NamedTuple):
    """Color value with three components, in the order r, g, b."""

    r: float
    g: float
    b: float


class ColorCMYK(NamedTuple):
    """Color value with four components, in the order c, m, y, k."""

    c: float
    m: float
    y: float
    k: float


# Type alias covering every color tuple type defined in this module.
Color = Union[ColorGray, ColorRGB, ColorCMYK]


class ColorSpace:
    """A named color space with a fixed number of components."""

    def __init__(self, name: str, ncomponents: int) -> None:
        """Store the color space name and its component count."""
        self.name = name
        self.ncomponents = ncomponents

    # The return annotation is quoted so it is not evaluated when the
    # class body is executed.
    def make_color(self, *components) -> "Color":
        """Build a color tuple from raw component values.

        Each value is passed through `safe_float`; a falsy result becomes
        0.0.  Surplus values are dropped and missing ones are padded with
        0.0, so the result always has `ncomponents` entries.

        Raises:
            PDFInterpreterError: if `settings.STRICT` is set and the
                number of values does not match `ncomponents`, or if
                `ncomponents` is not 1, 3 or 4.
        """
        if settings.STRICT and len(components) != self.ncomponents:
            raise PDFInterpreterError(
                "%s requires %d components, got %d!"
                % (self.name, self.ncomponents, len(components))
            )
        cc = [safe_float(x) or 0.0 for x in components[: self.ncomponents]]
        # Pad short input with zeros rather than failing in non-strict mode.
        cc.extend([0.0] * (self.ncomponents - len(cc)))
        if self.ncomponents == 1:
            return ColorGray(*cc)
        elif self.ncomponents == 3:
            return ColorRGB(*cc)
        elif self.ncomponents == 4:
            return ColorCMYK(*cc)
        else:
            raise PDFInterpreterError(
                "unknown color space %s with %d components"
                % (self.name, self.ncomponents)
            )

    def __repr__(self) -> str:
        """Return a debugging representation using the current class name."""
        return "<ColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)


PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
PREDEFINED_COLORSPACE: Dict[str, ColorSpace] = collections.OrderedDict()

for name, n in [
("DeviceGray", 1), # default value first
Expand All @@ -34,4 +82,4 @@ def __repr__(self) -> str:
("Indexed", 1),
("Pattern", 1),
]:
PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
PREDEFINED_COLORSPACE[name] = ColorSpace(name, n)
Loading

0 comments on commit 0b4aa6d

Please sign in to comment.