refactor!: remove the utterly useless PDFResourceManager

dhdaines · Oct 28, 2024 · f5bbaca · f5bbaca
1 parent 9b3d352
commit f5bbaca
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 91 deletions.
diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
@@ -692,78 +692,6 @@ class OutlineItem(NamedTuple):
     se: Union[PDFObjRef, None]
 
 
-class PDFResourceManager:
-    """Repository of shared resources.
-
-    ResourceManager facilitates reuse of shared resources
-    such as fonts and images so that large objects are not
-    allocated multiple times.
-    """
-
-    def __init__(self, caching: bool = True) -> None:
-        self.caching = caching
-        self._cached_fonts: Dict[object, PDFFont] = {}
-
-    def get_procset(self, procs: Sequence[object]) -> None:
-        for proc in procs:
-            if proc is LITERAL_PDF or proc is LITERAL_TEXT:
-                pass
-            else:
-                pass
-
-    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
-        try:
-            return CMapDB.get_cmap(cmapname)
-        except CMapDB.CMapNotFound:
-            if strict:
-                raise
-            return CMap()
-
-    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
-        if objid and objid in self._cached_fonts:
-            font = self._cached_fonts[objid]
-        else:
-            log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
-            if settings.STRICT:
-                if spec["Type"] is not LITERAL_FONT:
-                    raise PDFFontError("Type is not /Font")
-            # Create a Font object.
-            if "Subtype" in spec:
-                subtype = literal_name(spec["Subtype"])
-            else:
-                if settings.STRICT:
-                    raise PDFFontError("Font Subtype is not specified.")
-                subtype = "Type1"
-            if subtype in ("Type1", "MMType1"):
-                # Type1 Font
-                font = PDFType1Font(spec)
-            elif subtype == "TrueType":
-                # TrueType Font
-                font = PDFTrueTypeFont(spec)
-            elif subtype == "Type3":
-                # Type3 Font
-                font = PDFType3Font(spec)
-            elif subtype in ("CIDFontType0", "CIDFontType2"):
-                # CID Font
-                font = PDFCIDFont(spec)
-            elif subtype == "Type0":
-                # Type0 Font
-                dfonts = list_value(spec["DescendantFonts"])
-                assert dfonts
-                subspec = dict_value(dfonts[0]).copy()
-                for k in ("Encoding", "ToUnicode"):
-                    if k in spec:
-                        subspec[k] = resolve1(spec[k])
-                font = self.get_font(None, subspec)
-            else:
-                if settings.STRICT:
-                    raise PDFFontError("Invalid Font spec: %r" % spec)
-                font = PDFType1Font(spec)  # FIXME: this is so wrong!
-            if objid and self.caching:
-                self._cached_fonts[objid] = font
-        return font
-
-
 class PDFDocument:
     """Representation of a PDF document on disk.
 
@@ -808,6 +736,7 @@ def __init__(
         self.decipher: Optional[DecipherCallable] = None
         self._cached_objs: Dict[int, Tuple[object, int]] = {}
         self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
+        self._cached_fonts: Dict[object, PDFFont] = {}
         if isinstance(fp, io.TextIOBase):
             raise PSException("fp is not a binary file")
         self.pdf_version = read_header(fp)
@@ -860,8 +789,6 @@ def __init__(
         if self.catalog.get("Type") is not LITERAL_CATALOG:
             if settings.STRICT:
                 raise PDFSyntaxError("Catalog not found!")
-        # NOTE: This does nearly nothing at all
-        self.rsrcmgr = PDFResourceManager(True)
 
     def _initialize_password(self, password: str = "") -> None:
         """Initialize the decryption handler with a given password, if any.
@@ -993,6 +920,50 @@ def __getitem__(self, objid: int) -> object:
             self._cached_objs[objid] = (obj, genno)
         return obj
 
+    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
+        if objid and objid in self._cached_fonts:
+            font = self._cached_fonts[objid]
+        else:
+            log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
+            if settings.STRICT:
+                if spec["Type"] is not LITERAL_FONT:
+                    raise PDFFontError("Type is not /Font")
+            # Create a Font object.
+            if "Subtype" in spec:
+                subtype = literal_name(spec["Subtype"])
+            else:
+                if settings.STRICT:
+                    raise PDFFontError("Font Subtype is not specified.")
+                subtype = "Type1"
+            if subtype in ("Type1", "MMType1"):
+                # Type1 Font
+                font = PDFType1Font(spec)
+            elif subtype == "TrueType":
+                # TrueType Font
+                font = PDFTrueTypeFont(spec)
+            elif subtype == "Type3":
+                # Type3 Font
+                font = PDFType3Font(spec)
+            elif subtype in ("CIDFontType0", "CIDFontType2"):
+                # CID Font
+                font = PDFCIDFont(spec)
+            elif subtype == "Type0":
+                # Type0 Font
+                dfonts = list_value(spec["DescendantFonts"])
+                assert dfonts
+                subspec = dict_value(dfonts[0]).copy()
+                for k in ("Encoding", "ToUnicode"):
+                    if k in spec:
+                        subspec[k] = resolve1(spec[k])
+                font = self.get_font(None, subspec)
+            else:
+                if settings.STRICT:
+                    raise PDFFontError("Invalid Font spec: %r" % spec)
+                font = PDFType1Font(spec)  # FIXME: this is so wrong!
+            if objid:
+                self._cached_fonts[objid] = font
+        return font
+
     @property
     def outlines(self) -> Iterator[OutlineItem]:
         if "Outlines" not in self.catalog:

diff --git a/playa/pdfpage.py b/playa/pdfpage.py
@@ -76,7 +76,7 @@
 )
 
 if TYPE_CHECKING:
-    from playa.pdfdocument import PDFDocument, PDFResourceManager
+    from playa.pdfdocument import PDFDocument
 
 log = logging.getLogger(__name__)
 
@@ -167,17 +167,10 @@ def __init__(
     def layout(self) -> "LTPage":
         if self._layout is not None:
             return self._layout
-
-        doc = self.doc()
-        if doc is None:
-            raise RuntimeError("Document no longer exists!")
-        # Q: How many classes does does it take a Java programmer to
-        # install a lightbulb?
         device = PDFLayoutAnalyzer(
-            doc.rsrcmgr,
             pageno=self.page_number,
         )
-        interpreter = PDFPageInterpreter(doc.rsrcmgr, device)
+        interpreter = PDFPageInterpreter(self.doc, device)
         interpreter.process_page(self)
         assert device.result is not None
         self._layout = device.result
@@ -343,10 +336,8 @@ class PDFLayoutAnalyzer:
 
     def __init__(
         self,
-        rsrcmgr: "PDFResourceManager",
         pageno: int = 1,
     ) -> None:
-        self.rsrcmgr = rsrcmgr
         self.pageno = pageno
         self._stack: List[LTLayoutContainer] = []
         self.result: Optional[LTPage] = None
@@ -732,13 +723,13 @@ class PDFPageInterpreter:
     """
 
     def __init__(
-        self, rsrcmgr: "PDFResourceManager", device: "PDFLayoutAnalyzer"
+        self, doc: weakref.ReferenceType["PDFDocument"], device: "PDFLayoutAnalyzer"
     ) -> None:
-        self.rsrcmgr = rsrcmgr
+        self.doc = doc
         self.device = device
 
     def dup(self) -> "PDFPageInterpreter":
-        return self.__class__(self.rsrcmgr, self.device)
+        return self.__class__(self.doc, self.device)
 
     def init_resources(self, resources: Dict[object, object]) -> None:
         """Prepare the fonts and XObjects listed in the Resource attribute."""
@@ -748,6 +739,9 @@ def init_resources(self, resources: Dict[object, object]) -> None:
         self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
         if not resources:
             return
+        doc = self.doc()
+        if doc is None:
+            raise RuntimeError("Document no longer exists!")
 
         def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
             if isinstance(spec, list):
@@ -769,14 +763,15 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
                     if isinstance(spec, PDFObjRef):
                         objid = spec.objid
                     spec = dict_value(spec)
-                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
+                    self.fontmap[fontid] = doc.get_font(objid, spec)
             elif k == "ColorSpace":
                 for csid, spec in dict_value(v).items():
                     colorspace = get_colorspace(resolve1(spec))
                     if colorspace is not None:
                         self.csmap[csid] = colorspace
             elif k == "ProcSet":
-                self.rsrcmgr.get_procset(list_value(v))
+                pass  # Previously called get_procset, which did exactly
+                      # nothing. Perhaps we want to do something here?
             elif k == "XObject":
                 for xobjid, xobjstrm in dict_value(v).items():
                     self.xobjmap[xobjid] = xobjstrm
@@ -1167,7 +1162,10 @@ def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
         except KeyError:
             if settings.STRICT:
                 raise PDFInterpreterError("Undefined Font id: %r" % fontid)
-            self.textstate.font = self.rsrcmgr.get_font(None, {})
+            doc = self.doc()
+            if doc is None:
+                raise RuntimeError("Document no longer exists!")
+            self.textstate.font = doc.get_font(None, {})
         self.textstate.fontsize = cast(float, fontsize)
 
     def do_Tr(self, render: PDFStackT) -> None: