From d077d28ef6e00dd5e19edfcbecee33309985c990 Mon Sep 17 00:00:00 2001 From: Toby Fleming Date: Tue, 18 Feb 2020 21:48:35 -0800 Subject: [PATCH] Type hinting bonanza (#1) --- README.md | 154 ++++++++++++++++----------- functional/container_test.py | 19 ++-- functional/utils.py | 8 +- lint | 2 + pyproject.toml | 2 +- src/xml_dataclasses/__init__.py | 21 +++- src/xml_dataclasses/modifiers.py | 14 ++- src/xml_dataclasses/resolve_types.py | 17 ++- src/xml_dataclasses/serde.py | 16 ++- tests/resolve_types_other_test.py | 21 +++- 10 files changed, 185 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 7429197..3158f77 100644 --- a/README.md +++ b/README.md @@ -2,70 +2,15 @@ [![License: MPL 2.0](https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg)](https://opensource.org/licenses/MPL-2.0) -This is a prototype of how a library might look like for (de)serialising XML into Python dataclasses. XML dataclasses build on normal dataclasses from the standard library and [`lxml`](https://pypi.org/project/lxml/) elements. Loading and saving these elements is left to the consumer for flexibility of the desired output. +This library enables (de)serialising XML into Python dataclasses. XML dataclasses build on normal dataclasses from the standard library and [`lxml`](https://pypi.org/project/lxml/) elements. Loading and saving these elements is left to the consumer for flexibility of the desired output. -It isn't ready for production if you aren't willing to do your own evaluation/quality assurance. I don't recommend using this library with untrusted content. It inherits all of `lxml`'s flaws with regards to XML attacks, and recursively resolves data structures. Because deserialisation is driven from the dataclass definitions, it shouldn't be possible to execute arbitrary Python code. But denial of service attacks would very likely be feasible. +It's currently in alpha. It isn't ready for production if you aren't willing to do your own evaluation/quality assurance. I don't recommend using this library with untrusted content. It inherits all of `lxml`'s flaws with regards to XML attacks, and recursively resolves data structures. Because deserialisation is driven from the dataclass definitions, it shouldn't be possible to execute arbitrary Python code (not a guarantee, see license). Denial of service attacks would very likely be feasible. One workaround may be to [use `lxml` to validate](https://lxml.de/validation.html) untrusted content with a strict schema. Requires Python 3.7 or higher. -## Example - -(This is a simplified real world example - the container can also include optional `links` child elements.) - -```xml - - - - - - -``` - -```python -from lxml import etree -from typing import List -from xml_dataclasses import xml_dataclass, rename, load, dump - -CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container" - -@xml_dataclass -class RootFile: - __ns__ = CONTAINER_NS - full_path: str = rename(name="full-path") - media_type: str = rename(name="media-type") - - -@xml_dataclass -class RootFiles: - __ns__ = CONTAINER_NS - rootfile: List[RootFile] - - -@xml_dataclass -class Container: - __ns__ = CONTAINER_NS - version: str - rootfiles: RootFiles - # WARNING: this is an incomplete implementation of an OPF container - - def xml_validate(self): - if self.version != "1.0": - raise ValueError(f"Unknown container version '{self.version}'") - - -if __name__ == "__main__": - nsmap = {None: CONTAINER_NS} - # see Gotchas, stripping whitespace is highly recommended - parser = etree.XMLParser(remove_blank_text=True) - lxml_el_in = etree.parse("container.xml", parser).getroot() - container = load(Container, lxml_el_in, "container") - lxml_el_out = dump(container, "container", nsmap) - print(etree.tostring(lxml_el_out, encoding="unicode", pretty_print=True)) -``` - ## Features -* XML dataclasses are also dataclasses, and only require a single decorator +* XML dataclasses are also dataclasses, and only require a single decorator to work (but see type hinting section for issues) * Convert XML documents to well-defined dataclasses, which should work with IDE auto-completion * Loading and dumping of attributes, child elements, and text content * Required and optional attributes and child elements @@ -91,7 +36,7 @@ class Foo: existing_field: str = rename(field(...), name="existing-field") ``` -I would like to add support for validation in future, which might also make it easier to support other types. For now, you can work around this limitation with properties that do the conversion. +For now, you can work around this limitation with properties that do the conversion, and perform post-load validation. ### Defining text @@ -122,10 +67,10 @@ Children must ultimately be other XML dataclasses. However, they can also be `Op * Next, `List` should be defined (if multiple child elements are allowed). Valid: `List[Union[XmlDataclass1, XmlDataclass2]]`. Invalid: `Union[List[XmlDataclass1], XmlDataclass2]` * Finally, if `Optional` or `List` were used, a union type should be the inner-most (again, if needed) -Children can be renamed via the `rename` function, however attempting to set a namespace is invalid, since the namespace is provided by the child type's XML dataclass. Also, unions of XML dataclasses must have the same namespace (you can use different fields if they have different namespaces). - If a class has children, it cannot have text content. +Children can be renamed via the `rename` function. However, attempting to set a namespace is invalid, since the namespace is provided by the child type's XML dataclass. Also, unions of XML dataclasses must have the same namespace (you can use different fields with renaming if they have different namespaces, since the XML names will be resolved as a combination of namespace and name). + ### Defining post-load validation Simply implement an instance method called `xml_validate` with no parameters, and no return value (if you're using type hints): @@ -137,8 +82,89 @@ def xml_validate(self) -> None: If defined, the `load` function will call it after all values have been loaded and assigned to the XML dataclass. You can validate the fields you want inside this method. Return values are ignored; instead raise and catch exceptions. +## Example (fully type hinted) + +(This is a simplified real world example - the container can also include optional `links` child elements.) + +```xml + + + + + + +``` + +```python +from dataclasses import dataclass +from typing import List +from lxml import etree # type: ignore +from xml_dataclasses import xml_dataclass, rename, load, dump, NsMap, XmlDataclass + +CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container" + + +@xml_dataclass +@dataclass +class RootFile: + __ns__ = CONTAINER_NS + full_path: str = rename(name="full-path") + media_type: str = rename(name="media-type") + + +@xml_dataclass +@dataclass +class RootFiles: + __ns__ = CONTAINER_NS + rootfile: List[RootFile] + + +# see Gotchas, this workaround is required for type hinting +@xml_dataclass +@dataclass +class Container(XmlDataclass): + __ns__ = CONTAINER_NS + version: str + rootfiles: RootFiles + # WARNING: this is an incomplete implementation of an OPF container + + def xml_validate(self) -> None: + if self.version != "1.0": + raise ValueError(f"Unknown container version '{self.version}'") + + +if __name__ == "__main__": + nsmap: NsMap = {None: CONTAINER_NS} + # see Gotchas, stripping whitespace is highly recommended + parser = etree.XMLParser(remove_blank_text=True) + lxml_el_in = etree.parse("container.xml", parser).getroot() + container = load(Container, lxml_el_in, "container") + lxml_el_out = dump(container, "container", nsmap) + print(etree.tostring(lxml_el_out, encoding="unicode", pretty_print=True)) +``` + ## Gotchas +### Type hinting + +This can be a real pain to get right. Unfortunately, if you need this, you may have to resort to: + +```python +@xml_dataclass +@dataclass +class Child: + __ns__ = None + pass + +@xml_dataclass +@dataclass +class Parent(XmlDataclass): + __ns__ = None + children: Child +``` + +It's important that `@dataclass` be the *last* decorator, i.e. the closest to the class definition (and so the first to be applied). Luckily, only the root class you intend to pass to `load`/`dump` has to inherit from `XmlDataclass`, but all classes should have the `@dataclass` decorator applied. + ### Whitespace If you are able to, it is strongly recommended you strip whitespace from the input via `lxml`: @@ -151,7 +177,7 @@ By default, `lxml` preserves whitespace. This can cause a problem when checking ### Optional vs required -On dataclasses, optional fields also usually have a default value to be useful. But this isn't required; `Optional` is just a type hint to say `None` is allowed. +On dataclasses, optional fields also usually have a default value to be useful. But this isn't required; `Optional` is just a type hint to say `None` is allowed. This would occur e.g. if an element has no children. For XML dataclasses, on loading/deserialisation, whether or not a field is required is determined by if it has a `default`/`default_factory` defined. If so, and it's missing, that default is used. Otherwise, an error is raised. @@ -163,8 +189,8 @@ This makes sense in many cases, but possibly not every case. Most of these limitations/assumptions are enforced. They may make this project unsuitable for your use-case. -* It isn't possible to pass any parameters to the wrapped `@dataclass` decorator -* Setting the `init` parameter of a dataclass' `field` will lead to bad things happening, this isn't supported +* If you need to pass any parameters to the wrapped `@dataclass` decorator, apply it before the `@xml_dataclass` decorator +* Setting the `init` parameter of a dataclass' `field` will lead to bad things happening, this isn't supported. * Deserialisation is strict; missing required attributes and child elements will cause an error. I want this to be the default behaviour, but it should be straightforward to add a parameter to `load` for lenient operation * Dataclasses must be written by hand, no tools are provided to generate these from, DTDs, XML schema definitions, or RELAX NG schemas diff --git a/functional/container_test.py b/functional/container_test.py index 6cca389..0aecb79 100644 --- a/functional/container_test.py +++ b/functional/container_test.py @@ -1,10 +1,11 @@ +from dataclasses import dataclass from pathlib import Path from typing import List -import pytest -from lxml import etree +import pytest # type: ignore +from lxml import etree # type: ignore -from xml_dataclasses import dump, load, rename, xml_dataclass +from xml_dataclasses import NsMap, XmlDataclass, dump, load, rename, xml_dataclass from .utils import lmxl_dump @@ -14,6 +15,7 @@ @xml_dataclass +@dataclass class RootFile: __ns__ = CONTAINER_NS full_path: str = rename(name="full-path") @@ -21,25 +23,28 @@ class RootFile: @xml_dataclass +@dataclass class RootFiles: __ns__ = CONTAINER_NS rootfile: List[RootFile] @xml_dataclass -class Container: +@dataclass +class Container(XmlDataclass): __ns__ = CONTAINER_NS version: str rootfiles: RootFiles # WARNING: this is an incomplete implementation of an OPF container - def xml_validate(self): + def xml_validate(self) -> None: if self.version != "1.0": raise ValueError(f"Unknown container version '{self.version}'") @pytest.mark.parametrize("remove_blank_text", [True, False]) -def test_functional_container_no_whitespace(remove_blank_text): +def test_functional_container_no_whitespace(remove_blank_text): # type: ignore + nsmap: NsMap = {None: CONTAINER_NS} parser = etree.XMLParser(remove_blank_text=remove_blank_text) el = etree.parse(str(BASE / "container.xml"), parser).getroot() original = lmxl_dump(el) @@ -55,6 +60,6 @@ def test_functional_container_no_whitespace(remove_blank_text): ], ), ) - el = dump(container, "container", {None: CONTAINER_NS}) + el = dump(container, "container", nsmap) roundtrip = lmxl_dump(el) assert original == roundtrip diff --git a/functional/utils.py b/functional/utils.py index 7800186..6cc897c 100644 --- a/functional/utils.py +++ b/functional/utils.py @@ -1,8 +1,10 @@ -from lxml import etree +from typing import Any +from lxml import etree # type: ignore -def lmxl_dump(el): - encoded = etree.tostring( + +def lmxl_dump(el: Any) -> str: + encoded: bytes = etree.tostring( el, encoding="utf-8", pretty_print=True, xml_declaration=True ) return encoded.decode("utf-8") diff --git a/lint b/lint index 870d20a..50b45ea 100755 --- a/lint +++ b/lint @@ -22,4 +22,6 @@ else coverage html exit 1 fi + +mypy functional/container_test.py --strict pytest functional/ --random-order $PYTEST_DEBUG diff --git a/pyproject.toml b/pyproject.toml index 10b37ef..b2c8d04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "xml_dataclasses" -version = "0.0.4" +version = "0.0.5" description = "(De)serialize XML documents into specially-annotated dataclasses" authors = ["Toby Fleming "] license = "MPL-2.0" diff --git a/src/xml_dataclasses/__init__.py b/src/xml_dataclasses/__init__.py index 475151e..f9d0825 100644 --- a/src/xml_dataclasses/__init__.py +++ b/src/xml_dataclasses/__init__.py @@ -3,5 +3,24 @@ logging.getLogger(__name__).addHandler(logging.NullHandler()) from .modifiers import rename, text # isort:skip -from .resolve_types import xml_dataclass # isort:skip +from .resolve_types import ( # isort:skip + is_xml_dataclass, + xml_dataclass, + NsMap, + XmlDataclass, +) from .serde import dump, load # isort:skip + + +# __all__ is required for mypy to pick up the imports +# for errors, use `from xml_dataclasses.errors import ...` +__all__ = [ + "rename", + "text", + "dump", + "load", + "is_xml_dataclass", + "xml_dataclass", + "NsMap", + "XmlDataclass", +] diff --git a/src/xml_dataclasses/modifiers.py b/src/xml_dataclasses/modifiers.py index efd201b..2ffffb2 100644 --- a/src/xml_dataclasses/modifiers.py +++ b/src/xml_dataclasses/modifiers.py @@ -14,12 +14,15 @@ def make_field(default: Union[_T, _MISSING_TYPE]) -> Field[_T]: return field(default=default) +# NOTE: Actual return type is 'Field[_T]', but we want to help type checkers +# to understand the magic that happens at runtime. +# see https://github.com/python/typeshed/blob/master/stdlib/3.7/dataclasses.pyi def rename( f: Optional[Field[_T]] = None, default: Union[_T, _MISSING_TYPE] = MISSING, name: Optional[str] = None, ns: Optional[str] = None, -) -> Field[_T]: +) -> _T: if f is None: f = make_field(default=default) metadata = dict(f.metadata) @@ -28,15 +31,18 @@ def rename( if ns: metadata["xml:ns"] = ns f.metadata = metadata - return f + return f # type: ignore +# NOTE: Actual return type is 'Field[_T]', but we want to help type checkers +# to understand the magic that happens at runtime. +# see https://github.com/python/typeshed/blob/master/stdlib/3.7/dataclasses.pyi def text( f: Optional[Field[_T]] = None, default: Union[_T, _MISSING_TYPE] = MISSING -) -> Field[_T]: +) -> _T: if f is None: f = make_field(default=default) metadata = dict(f.metadata) metadata["xml:text"] = True f.metadata = metadata - return f + return f # type: ignore diff --git a/src/xml_dataclasses/resolve_types.py b/src/xml_dataclasses/resolve_types.py index 1839e12..f0e90a7 100644 --- a/src/xml_dataclasses/resolve_types.py +++ b/src/xml_dataclasses/resolve_types.py @@ -14,6 +14,7 @@ Optional, Tuple, Type, + TypeVar, Union, ) @@ -28,6 +29,8 @@ from .lxml_utils import format_ns NoneType: Type[Any] = type(None) +# Union here is a hack to make this a valid alias +NsMap = Union[Mapping[Optional[str], str]] class XmlDataclass: @@ -35,7 +38,10 @@ class XmlDataclass: __attributes__: Collection[AttrInfo] __children__: Collection[ChildInfo] __text_field__: Optional[TextInfo] - __nsmap__: Optional[Mapping[Optional[str], str]] + __nsmap__: Optional[NsMap] + + +XmlDataclassInstance = TypeVar("XmlDataclassInstance", bound=XmlDataclass) def is_xml_dataclass(tp: Type[Any]) -> bool: @@ -228,8 +234,13 @@ def add(self, xml_name: str, field_name: str) -> None: raise XmlDataclassDuplicateFieldError(msg) -def xml_dataclass(cls: Type[Any]) -> Type[XmlDataclass]: - new_cls = dataclass()(cls) +def xml_dataclass(cls: Type[Any]) -> Type[XmlDataclassInstance]: + # if a dataclass is doubly decorated, metadata seems to disappear... + if is_dataclass(cls): + new_cls = cls + else: + new_cls = dataclass()(cls) + try: new_cls.__ns__ except AttributeError: diff --git a/src/xml_dataclasses/serde.py b/src/xml_dataclasses/serde.py index 5c7af89..5f87c91 100644 --- a/src/xml_dataclasses/serde.py +++ b/src/xml_dataclasses/serde.py @@ -6,7 +6,13 @@ from lxml.builder import ElementMaker # type: ignore from .lxml_utils import strip_ns -from .resolve_types import ChildInfo, TextInfo, XmlDataclass, is_xml_dataclass +from .resolve_types import ( + ChildInfo, + TextInfo, + XmlDataclass, + XmlDataclassInstance, + is_xml_dataclass, +) _T = TypeVar("_T") @@ -132,7 +138,9 @@ def _validate_name(cls: Type[XmlDataclass], el: Any, name: str) -> None: ) -def load(cls: Type[XmlDataclass], el: Any, name: Optional[str] = None) -> XmlDataclass: +def load( + cls: Type[XmlDataclassInstance], el: Any, name: Optional[str] = None +) -> XmlDataclassInstance: if not is_xml_dataclass(cls): raise ValueError(f"Class '{cls!r}' is not an XML dataclass") @@ -164,7 +172,9 @@ def load(cls: Type[XmlDataclass], el: Any, name: Optional[str] = None) -> XmlDat return instance -def dump(instance: XmlDataclass, name: str, nsmap: Mapping[Optional[str], str]) -> Any: +def dump( + instance: XmlDataclassInstance, name: str, nsmap: Mapping[Optional[str], str] +) -> Any: cls = type(instance) if not is_xml_dataclass(cls): raise ValueError(f"Class '{cls!r}' is not an XML dataclass") diff --git a/tests/resolve_types_other_test.py b/tests/resolve_types_other_test.py index 826f682..197825d 100644 --- a/tests/resolve_types_other_test.py +++ b/tests/resolve_types_other_test.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields from typing import Optional from unittest.mock import patch @@ -10,7 +10,6 @@ XmlDataclassInternalError, XmlDataclassModelError, XmlDataclassNoNamespaceError, - XmlTypeError, ) from xml_dataclasses.modifiers import rename, text from xml_dataclasses.resolve_types import FieldInfo, is_xml_dataclass, xml_dataclass @@ -35,7 +34,7 @@ class XmlDt2: @pytest.mark.parametrize( "cls,result", - [(int, False), (str, False), (object, False), (Dt, False), (XmlDt1, True),], + [(int, False), (str, False), (object, False), (Dt, False), (XmlDt1, True)], ) def test_is_xml_dataclass(cls, result): assert is_xml_dataclass(cls) is result @@ -207,3 +206,19 @@ class Foo: assert exc_info.value.__cause__ is None else: assert attr_info.get_default() is None + + +def test_xml_dataclass_already_a_dataclass_stays_a_dataclass(): + @dataclass + class Foo: + __ns__ = None + bar: str = rename(name="baz") + + Bar = xml_dataclass(Foo) + dt_fields = fields(Bar) + assert len(dt_fields) == 1 + assert dt_fields[0].metadata == {"xml:name": "baz"} + + assert len(Bar.__attributes__) == 1 + attr_info = Bar.__attributes__[0] + assert attr_info.xml_name == "baz"