Skip to content

Commit

Permalink
refactor!: remove the utterly useless PDFResourceManager
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Oct 28, 2024
1 parent 9b3d352 commit f5bbaca
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 91 deletions.
119 changes: 45 additions & 74 deletions playa/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,78 +692,6 @@ class OutlineItem(NamedTuple):
se: Union[PDFObjRef, None]


class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        """Create an empty manager.

        Args:
            caching: when true, fonts built by `get_font` for a truthy
                object id are remembered and returned on later requests.
        """
        self.caching = caching
        # Font objects keyed by the object id passed to get_font().
        self._cached_fonts: Dict[object, PDFFont] = {}

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk a ProcSet list without acting on any entry.

        No procedure set (including /PDF and /Text) triggers any
        behaviour; the list is only iterated.
        """
        for _proc in procs:
            pass

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        Args:
            cmapname: name handed to `CMapDB.get_cmap`.
            strict: when true, re-raise `CMapDB.CMapNotFound` instead of
                falling back to an empty `CMap()`.
        """
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            return CMap()

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font object for a font dictionary, reusing cached ones.

        Args:
            objid: object id of the font dictionary, or a falsy value
                (e.g. None) for dictionaries that should not be cached.
            spec: the font dictionary.

        Returns:
            A font object chosen by the dictionary's "Subtype" entry.

        Raises:
            PDFFontError: only when `settings.STRICT` is set, if "Type" is
                absent or is not /Font, if "Subtype" is absent, or if the
                subtype is not recognised.
        """
        if objid and objid in self._cached_fonts:
            font = self._cached_fonts[objid]
        else:
            log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
            # Use .get() so that a dictionary with no /Type entry is
            # reported as a font error rather than leaking a bare KeyError.
            if settings.STRICT and spec.get("Type") is not LITERAL_FONT:
                raise PDFFontError("Type is not /Font")
            # Create a Font object.
            if "Subtype" in spec:
                subtype = literal_name(spec["Subtype"])
            else:
                if settings.STRICT:
                    raise PDFFontError("Font Subtype is not specified.")
                # Non-strict fallback when no subtype is given.
                subtype = "Type1"
            if subtype in ("Type1", "MMType1"):
                # Type1 Font
                font = PDFType1Font(spec)
            elif subtype == "TrueType":
                # TrueType Font
                font = PDFTrueTypeFont(spec)
            elif subtype == "Type3":
                # Type3 Font
                font = PDFType3Font(spec)
            elif subtype in ("CIDFontType0", "CIDFontType2"):
                # CID Font
                font = PDFCIDFont(spec)
            elif subtype == "Type0":
                # Type0 Font: build the font from its first descendant,
                # carrying over the parent's Encoding and ToUnicode entries.
                dfonts = list_value(spec["DescendantFonts"])
                assert dfonts
                subspec = dict_value(dfonts[0]).copy()
                for k in ("Encoding", "ToUnicode"):
                    if k in spec:
                        subspec[k] = resolve1(spec[k])
                # The descendant is requested with no object id, so it is
                # never cached under its own key.
                font = self.get_font(None, subspec)
            else:
                if settings.STRICT:
                    raise PDFFontError("Invalid Font spec: %r" % spec)
                font = PDFType1Font(spec)  # FIXME: this is so wrong!
            if objid and self.caching:
                self._cached_fonts[objid] = font
        return font


class PDFDocument:
"""Representation of a PDF document on disk.
Expand Down Expand Up @@ -808,6 +736,7 @@ def __init__(
self.decipher: Optional[DecipherCallable] = None
self._cached_objs: Dict[int, Tuple[object, int]] = {}
self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
self._cached_fonts: Dict[object, PDFFont] = {}
if isinstance(fp, io.TextIOBase):
raise PSException("fp is not a binary file")
self.pdf_version = read_header(fp)
Expand Down Expand Up @@ -860,8 +789,6 @@ def __init__(
if self.catalog.get("Type") is not LITERAL_CATALOG:
if settings.STRICT:
raise PDFSyntaxError("Catalog not found!")
# NOTE: This does nearly nothing at all
self.rsrcmgr = PDFResourceManager(True)

def _initialize_password(self, password: str = "") -> None:
"""Initialize the decryption handler with a given password, if any.
Expand Down Expand Up @@ -993,6 +920,50 @@ def __getitem__(self, objid: int) -> object:
self._cached_objs[objid] = (obj, genno)
return obj

def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
    """Return the font object for a font dictionary, reusing cached ones.

    Args:
        objid: object id of the font dictionary, or a falsy value
            (e.g. None) for dictionaries that should not be cached.
        spec: the font dictionary.

    Returns:
        A font object chosen by the dictionary's "Subtype" entry.

    Raises:
        PDFFontError: only when `settings.STRICT` is set, if "Type" is
            absent or is not /Font, if "Subtype" is absent, or if the
            subtype is not recognised.
    """
    if objid and objid in self._cached_fonts:
        return self._cached_fonts[objid]

    log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
    # Use .get() so that a dictionary with no /Type entry is reported as
    # a font error rather than leaking a bare KeyError.
    if settings.STRICT and spec.get("Type") is not LITERAL_FONT:
        raise PDFFontError("Type is not /Font")

    # Create a Font object.
    if "Subtype" in spec:
        subtype = literal_name(spec["Subtype"])
    else:
        if settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        # Non-strict fallback when no subtype is given.
        subtype = "Type1"

    font: PDFFont
    if subtype in ("Type1", "MMType1"):
        # Type1 Font
        font = PDFType1Font(spec)
    elif subtype == "TrueType":
        # TrueType Font
        font = PDFTrueTypeFont(spec)
    elif subtype == "Type3":
        # Type3 Font
        font = PDFType3Font(spec)
    elif subtype in ("CIDFontType0", "CIDFontType2"):
        # CID Font
        font = PDFCIDFont(spec)
    elif subtype == "Type0":
        # Type0 Font: build the font from its first descendant, carrying
        # over the parent's Encoding and ToUnicode entries.
        dfonts = list_value(spec["DescendantFonts"])
        assert dfonts
        subspec = dict_value(dfonts[0]).copy()
        for k in ("Encoding", "ToUnicode"):
            if k in spec:
                subspec[k] = resolve1(spec[k])
        # The descendant is requested with no object id, so it is never
        # cached under its own key.
        font = self.get_font(None, subspec)
    else:
        if settings.STRICT:
            raise PDFFontError("Invalid Font spec: %r" % spec)
        font = PDFType1Font(spec)  # FIXME: this is so wrong!

    if objid:
        self._cached_fonts[objid] = font
    return font

@property
def outlines(self) -> Iterator[OutlineItem]:
if "Outlines" not in self.catalog:
Expand Down
32 changes: 15 additions & 17 deletions playa/pdfpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
)

if TYPE_CHECKING:
from playa.pdfdocument import PDFDocument, PDFResourceManager
from playa.pdfdocument import PDFDocument

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -167,17 +167,10 @@ def __init__(
def layout(self) -> "LTPage":
if self._layout is not None:
return self._layout

doc = self.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")
# Q: How many classes does it take a Java programmer to
# install a lightbulb?
device = PDFLayoutAnalyzer(
doc.rsrcmgr,
pageno=self.page_number,
)
interpreter = PDFPageInterpreter(doc.rsrcmgr, device)
interpreter = PDFPageInterpreter(self.doc, device)
interpreter.process_page(self)
assert device.result is not None
self._layout = device.result
Expand Down Expand Up @@ -343,10 +336,8 @@ class PDFLayoutAnalyzer:

def __init__(
self,
rsrcmgr: "PDFResourceManager",
pageno: int = 1,
) -> None:
self.rsrcmgr = rsrcmgr
self.pageno = pageno
self._stack: List[LTLayoutContainer] = []
self.result: Optional[LTPage] = None
Expand Down Expand Up @@ -732,13 +723,13 @@ class PDFPageInterpreter:
"""

def __init__(
self, rsrcmgr: "PDFResourceManager", device: "PDFLayoutAnalyzer"
self, doc: weakref.ReferenceType["PDFDocument"], device: "PDFLayoutAnalyzer"
) -> None:
self.rsrcmgr = rsrcmgr
self.doc = doc
self.device = device

def dup(self) -> "PDFPageInterpreter":
return self.__class__(self.rsrcmgr, self.device)
return self.__class__(self.doc, self.device)

def init_resources(self, resources: Dict[object, object]) -> None:
"""Prepare the fonts and XObjects listed in the Resource attribute."""
Expand All @@ -748,6 +739,9 @@ def init_resources(self, resources: Dict[object, object]) -> None:
self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
if not resources:
return
doc = self.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")

def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
if isinstance(spec, list):
Expand All @@ -769,14 +763,15 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
self.fontmap[fontid] = doc.get_font(objid, spec)
elif k == "ColorSpace":
for csid, spec in dict_value(v).items():
colorspace = get_colorspace(resolve1(spec))
if colorspace is not None:
self.csmap[csid] = colorspace
elif k == "ProcSet":
self.rsrcmgr.get_procset(list_value(v))
pass # called get_procset which did exactly
# nothing. perhaps we want to do something?
elif k == "XObject":
for xobjid, xobjstrm in dict_value(v).items():
self.xobjmap[xobjid] = xobjstrm
Expand Down Expand Up @@ -1167,7 +1162,10 @@ def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
except KeyError:
if settings.STRICT:
raise PDFInterpreterError("Undefined Font id: %r" % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {})
doc = self.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")
self.textstate.font = doc.get_font(None, {})
self.textstate.fontsize = cast(float, fontsize)

def do_Tr(self, render: PDFStackT) -> None:
Expand Down

0 comments on commit f5bbaca

Please sign in to comment.