Skip to content

Commit

Permalink
Merge pull request #69 from kba/xsd-generateds
Browse files Browse the repository at this point in the history
Generate PAGE API from XSD
  • Loading branch information
kba authored Apr 26, 2018
2 parents 79bae4e + 0a108a3 commit 9b914f6
Show file tree
Hide file tree
Showing 17 changed files with 7,509 additions and 372 deletions.
5 changes: 4 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
[MASTER]
extension-pkg-whitelist=lxml
ignored-modules=cv2,tesserocr
ignored-modules=cv2,tesserocr,ocrd.model

[MESSAGES CONTROL]
ignore-patterns='.*generateds.*'
disable =
inconsistent-return-statements,
ungrouped-imports,
missing-docstring,
no-self-use,
too-many-arguments,
Expand Down
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ deps-pip-test:
install: spec
$(PIP) install .

# Regenerate python code from PAGE XSD
generate-page: repo/assets
generateDS \
-f \
--no-namespace-defs \
--root-element='PcGts' \
-o ocrd/model/ocrd_page_generateds.py \
repo/assets/data/schema/2017-07-15.xsd

#
# Repos
#
Expand Down
2 changes: 1 addition & 1 deletion ocrd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ocrd.processor.base import run_processor, run_cli, Processor
from ocrd.model import OcrdPage, OcrdMets, OcrdExif, OcrdFile, OcrdSwagger
from ocrd.model import OcrdMets, OcrdExif, OcrdFile, OcrdSwagger
from ocrd.constants import * # pylint: disable=wildcard-import
from ocrd.resolver import Resolver
from ocrd.validator import WorkspaceValidator, OcrdToolValidator
Expand Down
9 changes: 2 additions & 7 deletions ocrd/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import yaml
from pkg_resources import resource_string

VERSION = '0.0.8-pre1'

TMP_PREFIX = 'pyocrd-'

NAMESPACES = {
Expand All @@ -11,13 +13,6 @@
'xsl': 'http://www.w3.org/1999/XSL/Transform#',
}

PAGE_XML_EMPTY = '''<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd">
<Page>
</Page>
</PcGts>
'''

MIMETYPE_PAGE = 'text/page+xml'

DEFAULT_UPLOAD_FOLDER = '/tmp/uploads-pyocrd'
Expand Down
1 change: 0 additions & 1 deletion ocrd/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .ocrd_xml_base import OcrdXmlDocument
from .ocrd_mets import OcrdMets
from .ocrd_page import OcrdPage
from .ocrd_exif import OcrdExif
from .ocrd_file import OcrdFile
from .ocrd_swagger import OcrdSwagger
8 changes: 6 additions & 2 deletions ocrd/model/ocrd_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT
from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE

from .ocrd_xml_base import ET

Expand All @@ -12,9 +12,13 @@ class OcrdFile(object):
# def create(mimetype, ID, url, local_filename):
# el_fileGrp.SubElement('file')

def __init__(self, el, instance=None, local_filename=None, workspace=None):
def __init__(self, el, mimetype=None, instance=None, local_filename=None, workspace=None):
if el is None:
el = ET.Element(TAG_METS_FILE)
self._el = el
self.mimetype = mimetype
self.local_filename = local_filename

self._instance = instance
self.workspace = workspace

Expand Down
225 changes: 55 additions & 170 deletions ocrd/model/ocrd_page.py
Original file line number Diff line number Diff line change
@@ -1,172 +1,57 @@
from ocrd.constants import (
PAGE_XML_EMPTY,
NAMESPACES,
TAG_PAGE_READINGORDER,
TAG_PAGE_REGIONREFINDEXED,
from io import StringIO
from datetime import datetime

# pylint: disable=unused-import
from ocrd.model.ocrd_page_generateds import (
parse,
parseString,
CoordsType,
OrderedGroupType,
PcGtsType,
PageType,
MetadataType,
ReadingOrderType,
RegionRefIndexedType,
TextEquivType,
TextRegionType,
TextLineType,
WordType,
)
from ocrd.utils import getLogger

from .ocrd_xml_base import OcrdXmlDocument, ET
from .ocrd_page_textregion import OcrdPageTextRegion
from .ocrd_page_textline import OcrdPageTextLine

log = getLogger('ocrd.model.ocrd_page')

class OcrdPage(OcrdXmlDocument):

def __init__(self, *args, **kwargs):
super(OcrdPage, self).__init__(*args, **kwargs)
self._image_el = ET.Element('file')
self._image_file = None

@staticmethod
def from_file(input_file):
"""
Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.
"""
if input_file.mimetype.startswith('image'):
content = PAGE_XML_EMPTY.replace('<Page>', '<Page imageFileName="%s">' % (input_file.url))
return OcrdPage(content=content)
elif input_file.mimetype == 'text/page+xml':
return OcrdPage(filename=input_file.local_filename)

def __str__(self):
return '''
<OcrdPage>
imageFileName = %s
</OcrdPage>
''' % (
self._tree.find('page:Page', NAMESPACES).get('imageFileName')
from ocrd.constants import NAMESPACES, VERSION
from ocrd.model.ocrd_exif import OcrdExif

def to_xml(el):
    """
    Serialize a generated PAGE object to an XML string.

    ``el`` must provide an ``export`` method (as the classes imported from
    ``ocrd_page_generateds`` are expected to). The element is exported under
    the name ``PcGts`` into an in-memory text buffer, with the ``pc`` prefix
    declared for ``NAMESPACES['page']``, and the buffer content is returned
    preceded by an XML declaration line.
    """
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
    pc_namespace_def = 'xmlns:pc="%s"' % NAMESPACES['page']
    buf = StringIO()
    # Second positional argument is passed as 0 -- presumably the starting
    # indentation level of the export; confirm against the generated API.
    el.export(buf, 0, name_='PcGts', namespacedef_=pc_namespace_def)
    return xml_declaration + buf.getvalue()

def page_from_image(input_file):
"""
Build a new PcGtsType describing the image referenced by ``input_file``.

``input_file`` must expose ``local_filename`` and ``url``. The image
dimensions are read via OcrdExif from the local file; the Metadata element
records this library as Creator and the current time as both Created and
LastChange.

Raises Exception if ``input_file.local_filename`` is None.
"""
# The local copy is required because the image dimensions are read from disk.
if input_file.local_filename is None:
raise Exception("input_file must have 'local_filename' property")
exif = OcrdExif.from_filename(input_file.local_filename)
# One timestamp is shared by Created and LastChange for a fresh document.
# NOTE(review): datetime.now() without a tz argument is naive local time -- confirm this is intended.
now = datetime.now()
return PcGtsType(
Metadata=MetadataType(
Creator="OCR-D/core %s" % VERSION,
Created=now,
LastChange=now
),
Page=PageType(
imageWidth=exif.width,
imageHeight=exif.height,
# XXX brittle
# Prefer the file's URL; otherwise fall back to a file:// URL built by
# plain string concatenation with the local filename.
imageFilename=input_file.url if input_file.url is not None else 'file://' + input_file.local_filename
)

@property
def page(self):
"""
The Page element
"""
return self._tree.find('.//page:Page', NAMESPACES)

@property
def pcGtsId(self):
"""
The pcGtsId of the root element
"""
return self._tree.getroot().get('pcGtsId')

@property
def imageFileName(self):
return self.page.get('imageFileName')

@imageFileName.setter
def imageFileName(self, v):
self.page.set('imageFileName', v)

@property
def imageWidth(self):
return self.page.get('imageWidth')

@imageWidth.setter
def imageWidth(self, v):
self.page.set('imageWidth', v)

@property
def imageHeight(self):
return self.page.get('imageHeight')

@imageHeight.setter
def imageHeight(self, v):
self.page.set('imageHeight', v)

@property
def imageXResolution(self):
return self.page.get('imageXResolution')

@imageXResolution.setter
def imageXResolution(self, v):
self.page.set('imageXResolution', v)

@property
def imageYResolution(self):
return self.page.get('imageYResolution')

@imageYResolution.setter
def imageYResolution(self, v):
self.page.set('imageYResolution', v)

@property
def imageCompression(self):
return self.page.get('imageCompression')

@imageCompression.setter
def imageCompression(self, v):
self.page.set('imageCompression', v)

@property
def imagePhotometricInterpretation(self):
return self.page.get('imagePhotometricInterpretation')

@imagePhotometricInterpretation.setter
def imagePhotometricInterpretation(self, v):
self.page.set('imagePhotometricInterpretation', v)

@property
def imageResolutionUnit(self):
return self.page.get('imageResolutionUnit')

@imageResolutionUnit.setter
def imageResolutionUnit(self, v):
self.page.set('imageResolutionUnit', v)

def add_reading_order_ref(self, region_ref, index):
"""
Add the id of a region to the ReadingOrder
"""
if self.page.find('.//page:ReadingOrder', NAMESPACES) is None:
ET.SubElement(self.page, TAG_PAGE_READINGORDER)
region_ref_indexed = ET.SubElement(self.page.find('.//page:ReadingOrder', NAMESPACES), TAG_PAGE_REGIONREFINDEXED)
region_ref_indexed.set("regionRef", region_ref)
region_ref_indexed.set("index", "%i" % index)

# --------------------------------------------------
# TextRegion
# --------------------------------------------------

def add_textregion(self, ID, coords):
"""
Add a TextRegion
"""
return OcrdPageTextRegion.create(self.page, ID=ID, coords=coords)

def get_textregion(self, ID):
    """
    Get TextRegion with ID.

    Searches the descendants of the Page element for the first element whose
    ``id`` attribute equals ``ID`` and wraps the result in OcrdPageTextRegion.
    """
    # The predicate must use '@id' to test the attribute and must be closed
    # with ']'; the previous expression './/*[id="..."' was malformed.
    return OcrdPageTextRegion(self.page.find('.//*[@id="%s"]' % ID))

def list_textregions(self):
"""
List TextRegions as :py:mod:`OcrdPageTextRegion`
"""
return [OcrdPageTextRegion(el) for el in self.page.findall('.//page:TextRegion', NAMESPACES)]

# --------------------------------------------------
# TextLine
# --------------------------------------------------

def add_textline(self, ID=None, coords=None):
"""
Add a TextLine to the page.
"""
return OcrdPageTextLine.create(self.page, ID=ID, coords=coords)

def get_textline(self, n):
"""
Get the n-th TextLine on the page.
"""
return OcrdPageTextLine(self.page.find('page:TextLine[%i]' % (n + 1), NAMESPACES))

def list_textlines(self):
"""
List TextLine on page
"""
return [OcrdPageTextLine(el) for el in self.page.findall('page:TextLine', NAMESPACES)]
)

def from_file(input_file):
    """
    Create a PAGE object for ``input_file``, which is either an image or PAGE-XML.

    ``input_file`` must expose ``mimetype`` and ``local_filename`` (and, for
    images, whatever ``page_from_image`` reads from it).

    * mimetype starting with ``image``: a new PAGE object is built with
      ``page_from_image``.
    * mimetype ``text/page+xml``: the file at ``input_file.local_filename`` is
      parsed with the generated ``parse`` function (with ``silence=True``).

    Raises Exception for any other mimetype, including an unset (None) one.
    """
    mimetype = input_file.mimetype
    # An unset mimetype is reported as unsupported instead of failing with an
    # AttributeError on None.startswith.
    if mimetype is not None:
        if mimetype.startswith('image'):
            return page_from_image(input_file)
        if mimetype == 'text/page+xml':
            return parse(input_file.local_filename, silence=True)
    raise Exception("Unsupported mimetype '%s'" % mimetype)
Loading

0 comments on commit 9b914f6

Please sign in to comment.