Merge pull request #69 from kba/xsd-generateds

Generate PAGE API from XSD
OCR-D · Apr 26, 2018 · 9b914f6 · 9b914f6
2 parents 79bae4e + 0a108a3
commit 9b914f6
Show file tree

Hide file tree

Showing 17 changed files with 7,509 additions and 372 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -1,9 +1,12 @@
 [MASTER]
 extension-pkg-whitelist=lxml
-ignored-modules=cv2,tesserocr
+ignored-modules=cv2,tesserocr,ocrd.model
 
 [MESSAGES CONTROL]
+ignore-patterns='.*generateds.*'
 disable =
+    inconsistent-return-statements,
+    ungrouped-imports,
     missing-docstring,
     no-self-use,
     too-many-arguments,

diff --git a/Makefile b/Makefile
@@ -56,6 +56,15 @@ deps-pip-test:
 install: spec
 	$(PIP) install .
 
+# Regenerate Python code from PAGE XSD
+generate-page: repo/assets
+	generateDS \
+		-f \
+		--no-namespace-defs \
+		--root-element='PcGts' \
+		-o ocrd/model/ocrd_page_generateds.py \
+		repo/assets/data/schema/2017-07-15.xsd
+
 #
 # Repos
 #

diff --git a/ocrd/__init__.py b/ocrd/__init__.py
@@ -1,5 +1,5 @@
 from ocrd.processor.base import run_processor, run_cli, Processor
-from ocrd.model import OcrdPage, OcrdMets, OcrdExif, OcrdFile, OcrdSwagger
+from ocrd.model import OcrdMets, OcrdExif, OcrdFile, OcrdSwagger
 from ocrd.constants import * # pylint: disable=wildcard-import
 from ocrd.resolver import Resolver
 from ocrd.validator import WorkspaceValidator, OcrdToolValidator

diff --git a/ocrd/constants.py b/ocrd/constants.py
@@ -1,6 +1,8 @@
 import yaml
 from pkg_resources import resource_string
 
+VERSION = '0.0.8-pre1'
+
 TMP_PREFIX = 'pyocrd-'
 
 NAMESPACES = {
@@ -11,13 +13,6 @@
     'xsl': 'http://www.w3.org/1999/XSL/Transform#',
 }
 
-PAGE_XML_EMPTY = '''<?xml version="1.0" encoding="UTF-8"?>
-<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd">
-        <Page>
-        </Page>
-</PcGts>
-'''
-
 MIMETYPE_PAGE = 'text/page+xml'
 
 DEFAULT_UPLOAD_FOLDER = '/tmp/uploads-pyocrd'

diff --git a/ocrd/model/__init__.py b/ocrd/model/__init__.py
@@ -1,6 +1,5 @@
 from .ocrd_xml_base import OcrdXmlDocument
 from .ocrd_mets import OcrdMets
-from .ocrd_page import OcrdPage
 from .ocrd_exif import OcrdExif
 from .ocrd_file import OcrdFile
 from .ocrd_swagger import OcrdSwagger
diff --git a/ocrd/model/ocrd_file.py b/ocrd/model/ocrd_file.py
@@ -1,5 +1,5 @@
 import os
-from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT
+from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE
 
 from .ocrd_xml_base import ET
 
@@ -12,9 +12,13 @@ class OcrdFile(object):
     #  def create(mimetype, ID, url, local_filename):
     #      el_fileGrp.SubElement('file')
 
-    def __init__(self, el, instance=None, local_filename=None, workspace=None):
+    def __init__(self, el, mimetype=None, instance=None, local_filename=None, workspace=None):
+        if el is None:
+            el = ET.Element(TAG_METS_FILE)
         self._el = el
+        self.mimetype = mimetype
         self.local_filename = local_filename
+
         self._instance = instance
         self.workspace = workspace
 

diff --git a/ocrd/model/ocrd_page.py b/ocrd/model/ocrd_page.py
@@ -1,172 +1,57 @@
-from ocrd.constants import (
-    PAGE_XML_EMPTY,
-    NAMESPACES,
-    TAG_PAGE_READINGORDER,
-    TAG_PAGE_REGIONREFINDEXED,
+from io import StringIO
+from datetime import datetime
+
+# pylint: disable=unused-import
+from ocrd.model.ocrd_page_generateds import (
+    parse,
+    parseString,
+    CoordsType,
+    OrderedGroupType,
+    PcGtsType,
+    PageType,
+    MetadataType,
+    ReadingOrderType,
+    RegionRefIndexedType,
+    TextEquivType,
+    TextRegionType,
+    TextLineType,
+    WordType,
 )
-from ocrd.utils import getLogger
-
-from .ocrd_xml_base import OcrdXmlDocument, ET
-from .ocrd_page_textregion import OcrdPageTextRegion
-from .ocrd_page_textline import OcrdPageTextLine
-
-log = getLogger('ocrd.model.ocrd_page')
-
-class OcrdPage(OcrdXmlDocument):
-
-    def __init__(self, *args, **kwargs):
-        super(OcrdPage, self).__init__(*args, **kwargs)
-        self._image_el = ET.Element('file')
-        self._image_file = None
-
-    @staticmethod
-    def from_file(input_file):
-        """
-        Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.
-        """
-        if input_file.mimetype.startswith('image'):
-            content = PAGE_XML_EMPTY.replace('<Page>', '<Page imageFileName="%s">' % (input_file.url))
-            return OcrdPage(content=content)
-        elif input_file.mimetype == 'text/page+xml':
-            return OcrdPage(filename=input_file.local_filename)
-
-    def __str__(self):
-        return '''
-        <OcrdPage>
-            imageFileName = %s
-        </OcrdPage>
-        ''' % (
-            self._tree.find('page:Page', NAMESPACES).get('imageFileName')
+from ocrd.constants import NAMESPACES, VERSION
+from ocrd.model.ocrd_exif import OcrdExif
+
+def to_xml(el):
+    """
+    Serialize ``el`` to an XML string.
+
+    ``el`` must expose an ``export`` method (presumably one of the generated
+    PAGE types such as ``PcGtsType`` -- confirm against callers). The export
+    is written with the element name ``PcGts`` and an ``xmlns:pc``
+    declaration built from ``NAMESPACES['page']``; an XML declaration line
+    is prepended to the exported text.
+    """
+    buf = StringIO()
+    page_namespace_def = 'xmlns:pc="%s"' % NAMESPACES['page']
+    el.export(buf, 0, name_='PcGts', namespacedef_=page_namespace_def)
+    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
+    return xml_declaration + buf.getvalue()
+
+def page_from_image(input_file):
+    """
+    Build a new ``PcGtsType`` describing the image behind ``input_file``.
+
+    ``OcrdExif.from_filename`` is called on ``input_file.local_filename`` and
+    the resulting ``width`` and ``height`` become ``imageWidth`` and
+    ``imageHeight`` of the ``Page``. ``Metadata`` carries a creator string
+    containing ``VERSION`` and the same ``datetime.now()`` value for both
+    ``Created`` and ``LastChange``.
+
+    Raises ``Exception`` if ``input_file.local_filename`` is ``None``.
+    """
+    if input_file.local_filename is None:
+        raise Exception("input_file must have 'local_filename' property")
+    exif = OcrdExif.from_filename(input_file.local_filename)
+    # Take one timestamp so that Created and LastChange are identical.
+    now = datetime.now()
+    return PcGtsType(
+        Metadata=MetadataType(
+            Creator="OCR-D/core %s" % VERSION,
+            Created=now,
+            LastChange=now
+        ),
+        Page=PageType(
+            imageWidth=exif.width,
+            imageHeight=exif.height,
+            # XXX brittle: without a url, fall back to 'file://' + local_filename
+            imageFilename=input_file.url if input_file.url is not None else 'file://' + input_file.local_filename
         )
-
-    @property
-    def page(self):
-        """
-        The Page element
-        """
-        return self._tree.find('.//page:Page', NAMESPACES)
-
-    @property
-    def pcGtsId(self):
-        """
-        The pcGtsId of the root element
-        """
-        return self._tree.getroot().get('pcGtsId')
-
-    @property
-    def imageFileName(self):
-        return self.page.get('imageFileName')
-
-    @imageFileName.setter
-    def imageFileName(self, v):
-        self.page.set('imageFileName', v)
-
-    @property
-    def imageWidth(self):
-        return self.page.get('imageWidth')
-
-    @imageWidth.setter
-    def imageWidth(self, v):
-        self.page.set('imageWidth', v)
-
-    @property
-    def imageHeight(self):
-        return self.page.get('imageHeight')
-
-    @imageHeight.setter
-    def imageHeight(self, v):
-        self.page.set('imageHeight', v)
-
-    @property
-    def imageXResolution(self):
-        return self.page.get('imageXResolution')
-
-    @imageXResolution.setter
-    def imageXResolution(self, v):
-        self.page.set('imageXResolution', v)
-
-    @property
-    def imageYResolution(self):
-        return self.page.get('imageYResolution')
-
-    @imageYResolution.setter
-    def imageYResolution(self, v):
-        self.page.set('imageYResolution', v)
-
-    @property
-    def imageCompression(self):
-        return self.page.get('imageCompression')
-
-    @imageCompression.setter
-    def imageCompression(self, v):
-        self.page.set('imageCompression', v)
-
-    @property
-    def imagePhotometricInterpretation(self):
-        return self.page.get('imagePhotometricInterpretation')
-
-    @imagePhotometricInterpretation.setter
-    def imagePhotometricInterpretation(self, v):
-        self.page.set('imagePhotometricInterpretation', v)
-
-    @property
-    def imageResolutionUnit(self):
-        return self.page.get('imageResolutionUnit')
-
-    @imageResolutionUnit.setter
-    def imageResolutionUnit(self, v):
-        self.page.set('imageResolutionUnit', v)
-
-    def add_reading_order_ref(self, region_ref, index):
-        """
-        Add the id of a region to the ReadingOrder
-        """
-        if self.page.find('.//page:ReadingOrder', NAMESPACES) is None:
-            ET.SubElement(self.page, TAG_PAGE_READINGORDER)
-        region_ref_indexed = ET.SubElement(self.page.find('.//page:ReadingOrder', NAMESPACES), TAG_PAGE_REGIONREFINDEXED)
-        region_ref_indexed.set("regionRef", region_ref)
-        region_ref_indexed.set("index", "%i" % index)
-
-    # --------------------------------------------------
-    # TextRegion
-    # --------------------------------------------------
-
-    def add_textregion(self, ID, coords):
-        """
-        Add a TextRegion
-        """
-        return OcrdPageTextRegion.create(self.page, ID=ID, coords=coords)
-
-    def get_textregion(self, ID):
-        """
-        Get TextRegion with ID.
-        """
-        return OcrdPageTextRegion(self.page.find('.//*[id="%s"' % ID))
-
-    def list_textregions(self):
-        """
-        List TextRegions as :py:mod:`OcrdPageTextRegion`
-        """
-        return [OcrdPageTextRegion(el) for el in self.page.findall('.//page:TextRegion', NAMESPACES)]
-
-    # --------------------------------------------------
-    # TextLine
-    # --------------------------------------------------
-
-    def add_textline(self, ID=None, coords=None):
-        """
-        Add a TextLine to the page.
-        """
-        return OcrdPageTextLine.create(self.page, ID=ID, coords=coords)
-
-    def get_textline(self, n):
-        """
-        Get the n-th TextLine on the page.
-        """
-        return OcrdPageTextLine(self.page.find('page:TextLine[%i]' % (n + 1), NAMESPACES))
-
-    def list_textlines(self):
-        """
-        List TextLine on page
-        """
-        return [OcrdPageTextLine(el) for el in self.page.findall('page:TextLine', NAMESPACES)]
+    )
+
+def from_file(input_file):
+    """
+    Create a PAGE object tree from a METS file entry.
+
+    Entries whose mimetype starts with ``image`` are passed to
+    ``page_from_image``; ``text/page+xml`` entries are parsed from
+    ``input_file.local_filename``. Any other mimetype raises ``Exception``.
+    """
+    mimetype = input_file.mimetype
+    if mimetype.startswith('image'):
+        return page_from_image(input_file)
+    if mimetype == 'text/page+xml':
+        return parse(input_file.local_filename, silence=True)
+    raise Exception("Unsupported mimetype '%s'" % mimetype)