stumpylog · stumpylog · Dec 10, 2024 · Nov 16, 2024 · Nov 16, 2024 · Dec 10, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Official support and testing for Python 3.13 ([#25](https://github.com/stumpylog/tika-client/pull/25))
+- Support for setting PDF metadata ([#42](https://github.com/stumpylog/tika-client/pull/42))
+  - Initial work by @spechtx in ([#39](https://github.com/stumpylog/tika-client/pull/39))
 
 ### Changed
 

diff --git a/README.md b/README.md
@@ -102,6 +102,35 @@ with GotenbergClient("http://localhost:3000") as client:
       response.to_file(Path("my-world.pdf"))
 ```
 
+Adding metadata to a PDF:
+
+This example shows how to add metadata to your generated PDF. All metadata fields are optional and include:
+
+- Document info (title, author, subject, keywords)
+- Dates (creation, modification)
+- Technical details (pdf version, creator, producer)
+- PDF standards (trapped status, marked status)
+
+```python
+from gotenberg_client import GotenbergClient
+from datetime import datetime
+from pathlib import Path
+
+with GotenbergClient("http://localhost:3000") as client:
+    with client.chromium.html_to_pdf() as route:
+        response = (route
+            .index("my-index.html")
+            .metadata(
+                title="My Document",
+                author="John Doe",
+                subject="Example PDF",
+                keywords=["sample", "document", "test"],
+                creation_date=datetime.now(),
+                trapped="Unknown"
+            )
+            .run())
+        response.to_file(Path("my-index.pdf"))
+```
+
 To ensure the proper clean up of all used resources, both the client and the route(s) should be
 used as context manager. If for some reason you cannot, you should `.close` the client and any
 routes:

diff --git a/docs/routes.md b/docs/routes.md
@@ -95,11 +95,45 @@ These options are not yet implemented
 | `pdfa`           | `.pdf_format()`                                                               | `PdfAFormat` |       |
 | `pdfua`          | <ul><li>`enable_universal_access()`<li>`disable_universal_access()`</li></ul> | N/A          |       |
 
-#### Metadata
+#### PDF Metadata Support
 
 [Gotenberg Documentation](https://gotenberg.dev/docs/routes#metadata-chromium)
 
-These options are not yet implemented
+Add metadata to your PDFs:
+
+```python
+from gotenberg_client import GotenbergClient
+from datetime import datetime
+
+with GotenbergClient("http://localhost:3000") as client:
+    with client.chromium.html_to_pdf() as route:
+        response = (route
+            .index("my-index.html")
+            .metadata(
+                title="My Document",
+                author="John Doe",
+                creation_date=datetime.now(),
+                keywords=["sample", "document"],
+                subject="Sample PDF Generation",
+                trapped="Unknown"
+            )
+            .run())
+```
+
+Supported metadata fields:
+
+- `title`: Document title
+- `author`: Document author
+- `subject`: Document subject
+- `keywords`: List of keywords
+- `creator`: Creating application
+- `creation_date`: Creation datetime
+- `modification_date`: Last modification datetime
+- `producer`: PDF producer
+- `trapped`: Trapping status ('True', 'False', 'Unknown')
+- `pdf_copyright`: Copyright information
+- `marked`: PDF marked status
+- `pdf_version`: PDF version number
 
 ## LibreOffice
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -65,7 +65,7 @@ installer = "uv"
 
 [tool.hatch.envs.hatch-static-analysis]
 # https://hatch.pypa.io/latest/config/internal/static-analysis/
-dependencies = [ "ruff ~= 0.6" ]
+dependencies = [ "ruff ~= 0.8" ]
 config-path = "none"
 
 [tool.hatch.envs.hatch-test]
@@ -246,6 +246,7 @@ lint.ignore = [
 ]
 # Tests can use magic values, assertions, and relative imports
 lint.per-file-ignores."tests/**/*" = [ "PLR2004", "S101", "TID252" ]
+lint.per-file-ignores."tests/utils.py" = [ "S603" ]
 # No relative imports
 lint.flake8-tidy-imports.ban-relative-imports = "all"
 # One import per line

diff --git a/src/gotenberg_client/__init__.py b/src/gotenberg_client/__init__.py
@@ -4,15 +4,19 @@
 from gotenberg_client._client import GotenbergClient
 from gotenberg_client._errors import BaseClientError
 from gotenberg_client._errors import CannotExtractHereError
+from gotenberg_client._errors import InvalidKeywordError
+from gotenberg_client._errors import InvalidPdfRevisionError
 from gotenberg_client._errors import MaxRetriesExceededError
 from gotenberg_client.responses import SingleFileResponse
 from gotenberg_client.responses import ZipFileResponse
 
 __all__ = [
-    "GotenbergClient",
-    "SingleFileResponse",
-    "ZipFileResponse",
     "BaseClientError",
     "CannotExtractHereError",
+    "GotenbergClient",
+    "InvalidKeywordError",
+    "InvalidPdfRevisionError",
     "MaxRetriesExceededError",
+    "SingleFileResponse",
+    "ZipFileResponse",
 ]
diff --git a/src/gotenberg_client/_base.py b/src/gotenberg_client/_base.py
@@ -111,6 +111,7 @@ def _base_run(self) -> Response:
         Executes the configured route against the server and returns the resulting
         Response.
         """
+
         resp = self._client.post(
             url=self._route,
             headers=self._headers,

diff --git a/src/gotenberg_client/_convert/chromium.py b/src/gotenberg_client/_convert/chromium.py
@@ -17,6 +17,7 @@
 from gotenberg_client._convert.common import EmulatedMediaMixin
 from gotenberg_client._convert.common import HeaderFooterMixin
 from gotenberg_client._convert.common import InvalidStatusCodesMixin
+from gotenberg_client._convert.common import MetadataMixin
 from gotenberg_client._convert.common import PageOrientMixin
 from gotenberg_client._convert.common import PagePropertiesMixin
 from gotenberg_client._convert.common import PerformanceModeMixin
@@ -125,6 +126,7 @@ class HtmlRoute(
     HeaderFooterMixin,
     RenderControlMixin,
     PageOrientMixin,
+    MetadataMixin,
     _RouteWithResources,
     _FileBasedRoute,
 ):
@@ -141,6 +143,7 @@ class UrlRoute(
     EmulatedMediaMixin,
     CustomHTTPHeaderMixin,
     PageOrientMixin,
+    MetadataMixin,
     BaseSingleFileResponseRoute,
 ):
     """
@@ -183,7 +186,7 @@ def _get_all_resources(self) -> ForceMultipartDict:
         return FORCE_MULTIPART
 
 
-class MarkdownRoute(PagePropertiesMixin, HeaderFooterMixin, _RouteWithResources, _FileBasedRoute):
+class MarkdownRoute(PagePropertiesMixin, HeaderFooterMixin, MetadataMixin, _RouteWithResources, _FileBasedRoute):
     """
     Represents the Gotenberg route for converting Markdown files to a PDF.
 

diff --git a/src/gotenberg_client/_convert/common.py b/src/gotenberg_client/_convert/common.py
@@ -3,19 +3,27 @@
 # SPDX-License-Identifier: MPL-2.0
 import json
 import logging
+from datetime import datetime
 from pathlib import Path
 from typing import Dict
+from typing import Final
 from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Union
 from warnings import warn
 
 from gotenberg_client._base import BaseSingleFileResponseRoute
+from gotenberg_client._errors import InvalidKeywordError
+from gotenberg_client._errors import InvalidPdfRevisionError
 from gotenberg_client._types import PageScaleType
 from gotenberg_client._types import Self
 from gotenberg_client._types import WaitTimeType
 from gotenberg_client.options import EmulatedMediaType
 from gotenberg_client.options import PageMarginsType
 from gotenberg_client.options import PageOrientation
 from gotenberg_client.options import PageSize
+from gotenberg_client.options import TrappedStatus
 
 logger = logging.getLogger()
 
@@ -233,3 +241,143 @@ def skip_network_idle(self) -> Self:
     def use_network_idle(self) -> Self:
         self._form_data.update({"skipNetworkIdleEvent": "false"})  # type: ignore[attr-defined,misc]
         return self
+
+
+class MetadataMixin:
+    """
+    Mixin for PDF metadata support.
+
+    This mixin provides functionality to set PDF metadata for documents processed through
+    the Gotenberg API (https://gotenberg.dev/docs/routes#metadata-chromium).
+
+    The metadata is stored as a JSON object under the "metadata" key of the route's
+    form data. Calling `.metadata()` several times merges the values; on a key
+    collision the most recent call wins.
+
+    Important Notes:
+    - Gotenberg will use the current date/time for creation_date and modification_date,
+      even if custom dates are provided.
+    - Gotenberg will use its own pdf_version, even if a custom version is provided.
+
+    Example:
+        from gotenberg_client import GotenbergClient
+        from datetime import datetime
+        from zoneinfo import ZoneInfo
+        from pathlib import Path
+
+        with GotenbergClient('http://localhost:3000') as client:
+            with client.chromium.url_to_pdf() as route:
+
+                response = (
+                    route.url('https://hello.world')
+                    .metadata(
+                        author='John Doe',
+                        pdf_copyright='© 2024 My Company',
+                        creation_date=datetime.now(tz=ZoneInfo("Europe/Berlin")),
+                        creator='My Application',
+                        keywords=['keyword', 'example'],
+                        marked=True,
+                        modification_date=datetime.now(tz=ZoneInfo("Europe/Berlin")),
+                        pdf_version=1.7,
+                        producer='PDF Producer',
+                        subject='My Subject',
+                        title='My Title',
+                        trapped=True,
+                    )
+                    .run()
+                )
+
+                response.to_file(Path('my-world.pdf'))
+    """
+
+    MIN_PDF_VERSION: Final[float] = 1.0
+    MAX_PDF_VERSION: Final[float] = 2.0
+
+    def metadata(
+        self,
+        author: Optional[str] = None,
+        pdf_copyright: Optional[str] = None,
+        creation_date: Optional[datetime] = None,
+        creator: Optional[str] = None,
+        keywords: Optional[List[str]] = None,
+        marked: Optional[bool] = None,
+        modification_date: Optional[datetime] = None,
+        pdf_version: Optional[float] = None,
+        producer: Optional[str] = None,
+        subject: Optional[str] = None,
+        title: Optional[str] = None,
+        trapped: Optional[Union[bool, str, TrappedStatus]] = None,
+    ) -> Self:
+        """
+        Sets PDF metadata for the document.
+
+        Only values that are actually provided are written. String and list values
+        that are empty are treated as "not provided" and skipped, while `marked` and
+        `trapped` are written whenever they are not None.
+
+        Args:
+            author: Document author name
+            pdf_copyright: Copyright information
+            creation_date: Document creation date (Note: Gotenberg will override this)
+            creator: Name of the creating application
+            keywords: List of keywords/tags for the document
+            marked: Whether the PDF is marked for structure
+            modification_date: Last modification date (Note: Gotenberg will override this)
+            pdf_version: PDF version number (Note: Gotenberg will override this)
+            producer: Name of the PDF producer
+            subject: Document subject/description
+            title: Document title
+            trapped: Trapping status (bool, a TrappedStatus member, or one of the
+                strings 'True', 'False', 'Unknown')
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            InvalidPdfRevisionError: If the provided PDF revision is outside the valid range
+            InvalidKeywordError: If a keyword is not a string or contains a comma
+            ValueError: If trapped is a string that does not match a TrappedStatus value
+        """
+
+        # Validate metadata values
+        if pdf_version is not None and not (self.MIN_PDF_VERSION <= pdf_version <= self.MAX_PDF_VERSION):
+            msg = f"PDF version must be between {self.MIN_PDF_VERSION} and {self.MAX_PDF_VERSION}"
+            raise InvalidPdfRevisionError(msg)
+
+        # Normalize every accepted spelling of the trapped status to the enum, so the
+        # ".value" access below is always valid.
+        if isinstance(trapped, bool):
+            trapped = TrappedStatus.TRUE if trapped else TrappedStatus.FALSE
+        elif isinstance(trapped, str):
+            # The documentation advertises plain strings; look them up by value.
+            # NOTE(review): assumes the TrappedStatus values are exactly
+            # 'True' / 'False' / 'Unknown' - confirm against gotenberg_client.options.
+            trapped = TrappedStatus(trapped)
+
+        if keywords is not None:
+            if not all(isinstance(k, str) for k in keywords):
+                msg = "All keywords must be strings"
+                raise InvalidKeywordError(msg)
+            # Keywords are sent as one comma separated string, so a comma inside a
+            # keyword could not be told apart from a separator.
+            if any("," in k for k in keywords):
+                msg = "Keywords cannot contain commas"
+                raise InvalidKeywordError(msg)
+
+        # Get existing metadata if any, so repeated calls accumulate
+        existing_metadata: Dict[str, Union[str, bool, float]] = {}
+        if "metadata" in self._form_data:  # type: ignore[attr-defined,misc]
+            existing_metadata = json.loads(self._form_data["metadata"])  # type: ignore[attr-defined,misc]
+
+        # Convert validated metadata to dictionary
+        metadata: Dict[str, Union[str, bool, float]] = {}
+
+        if author:
+            metadata["Author"] = author
+        if pdf_copyright:
+            metadata["Copyright"] = pdf_copyright
+        if creation_date:
+            metadata["CreationDate"] = creation_date.isoformat()
+        if creator:
+            metadata["Creator"] = creator
+        if keywords:
+            metadata["Keywords"] = ", ".join(keywords)
+        if marked is not None:
+            metadata["Marked"] = marked
+        if modification_date:
+            metadata["ModDate"] = modification_date.isoformat()
+        if pdf_version:
+            metadata["PDFVersion"] = pdf_version
+        if producer:
+            metadata["Producer"] = producer
+        if subject:
+            metadata["Subject"] = subject
+        if title:
+            metadata["Title"] = title
+        if trapped is not None:
+            metadata["Trapped"] = trapped.value
+
+        # Merge existing and new metadata; new values win on key collisions.
+        # Nothing is written when this call supplied no usable value.
+        if metadata:
+            self._form_data.update({"metadata": json.dumps({**existing_metadata, **metadata})})  # type: ignore[attr-defined,misc]
+
+        return self
diff --git a/src/gotenberg_client/_convert/libre_office.py b/src/gotenberg_client/_convert/libre_office.py
@@ -9,6 +9,7 @@
 
 from gotenberg_client._base import BaseApi
 from gotenberg_client._base import BaseSingleFileResponseRoute
+from gotenberg_client._convert.common import MetadataMixin
 from gotenberg_client._convert.common import PageOrientMixin
 from gotenberg_client._convert.common import PageRangeMixin
 from gotenberg_client._types import Self
@@ -17,7 +18,7 @@
 from gotenberg_client.responses import ZipFileResponse
 
 
-class LibreOfficeConvertRoute(PageOrientMixin, PageRangeMixin, BaseSingleFileResponseRoute):
+class LibreOfficeConvertRoute(PageOrientMixin, PageRangeMixin, MetadataMixin, BaseSingleFileResponseRoute):
     """
     Represents the Gotenberg route for converting documents to PDF using LibreOffice.
 

diff --git a/src/gotenberg_client/_convert/pdfa.py b/src/gotenberg_client/_convert/pdfa.py
@@ -6,10 +6,11 @@
 
 from gotenberg_client._base import BaseApi
 from gotenberg_client._base import BaseSingleFileResponseRoute
+from gotenberg_client._convert.common import MetadataMixin
 from gotenberg_client._types import Self
 
 
-class PdfAConvertRoute(BaseSingleFileResponseRoute):
+class PdfAConvertRoute(MetadataMixin, BaseSingleFileResponseRoute):
     """
     Represents the Gotenberg route for converting PDFs to PDF/A format.
 

diff --git a/src/gotenberg_client/_errors.py b/src/gotenberg_client/_errors.py
@@ -23,3 +23,11 @@ def __init__(self, *, response: Response) -> None:
 
 class CannotExtractHereError(BaseClientError):
     pass
+
+
+class InvalidPdfRevisionError(BaseClientError):
+    """Raised when a requested PDF version lies outside the supported range."""
+
+
+class InvalidKeywordError(BaseClientError):
+    """Raised when a metadata keyword is not a string or contains a comma."""