Skip to content

Commit

Permalink
ocrd_page: PcGts from image
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Apr 24, 2018
1 parent 66b8f1b commit 03a8c31
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 16 deletions.
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ ignored-modules=cv2,tesserocr,ocrd.model
[MESSAGES CONTROL]
ignore-patterns='.*generateds.*'
disable =
inconsistent-return-statements,
ungrouped-imports,
missing-docstring,
no-self-use,
Expand Down
8 changes: 6 additions & 2 deletions ocrd/model/ocrd_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT
from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE

from .ocrd_xml_base import ET

Expand All @@ -12,9 +12,13 @@ class OcrdFile(object):
# def create(mimetype, ID, url, local_filename):
# el_fileGrp.SubElement('file')

def __init__(self, el, instance=None, local_filename=None, workspace=None):
def __init__(self, el, mimetype=None, instance=None, local_filename=None, workspace=None):
if el is None:
el = ET.Element(TAG_METS_FILE)
self._el = el
self.mimetype = mimetype
self.local_filename = local_filename

self._instance = instance
self.workspace = workspace

Expand Down
18 changes: 12 additions & 6 deletions ocrd/model/ocrd_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,24 @@ def to_xml(el):
el.export(sio, 0, name_='PcGts', namespacedef_='xmlns:pc="%s"' % NAMESPACES['page'])
return '<?xml version="1.0" encoding="UTF-8"?>\n' + sio.getvalue()

def page_from_image(input_file):
    """
    Build the text of a PAGE-XML document describing an image file.

    The image dimensions are read with ``OcrdExif.from_filename`` from
    ``input_file.local_filename``. They are written, together with
    ``input_file.url``, as attributes of the ``<Page>`` start tag that is
    substituted into ``PAGE_XML_EMPTY``.

    Args:
        input_file: object exposing ``local_filename`` and ``url`` attributes.

    Returns:
        The content of ``PAGE_XML_EMPTY`` as a string, with its ``<Page>``
        tag replaced by one carrying ``imageWidth``, ``imageHeight`` and
        ``imageFilename`` attributes.

    Raises:
        ValueError: if ``input_file.local_filename`` is ``None``.
    """
    # Local import: the module's import header is not visible here, and the
    # block must stay self-contained.
    from xml.sax.saxutils import escape

    if input_file.local_filename is None:
        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` callers keep working.
        raise ValueError("input_file must have 'local_filename' property")

    exif = OcrdExif.from_filename(input_file.local_filename)

    # The URL ends up inside a double-quoted XML attribute. Escape '&', '<',
    # '>' and '"' so that e.g. a query string does not yield malformed XML.
    # str() mirrors what '%s' formatting did with non-string values.
    image_filename = escape(str(input_file.url), {'"': '&quot;'})

    page_tag = '<Page imageWidth="%d" imageHeight="%d" imageFilename="%s">' % (
        exif.width,
        exif.height,
        image_filename,
    )
    return PAGE_XML_EMPTY.replace('<Page>', page_tag)

def from_file(input_file):
"""
Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.
"""
# print("PARSING PARSING '%s'" % input_file)
if input_file.mimetype.startswith('image'):
exif = OcrdExif.from_filename(input_file.local_filename)
content = PAGE_XML_EMPTY.replace('<Page>', '<Page imageWidth="%d" imageHeight="%i" imageFilename="%s">' % (
exif.width,
exif.height,
input_file.url
))
content = page_from_image(input_file)
return parseString(content.encode('utf-8'), silence=True)
elif input_file.mimetype == 'text/page+xml':
return parse(input_file.local_filename, silence=True)
Expand Down
7 changes: 3 additions & 4 deletions test/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from ocrd.model import OcrdMets

from test.base import TestCase, main, assets
METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml')

from ocrd.model import OcrdMets

class TestOcrdMets(TestCase):

def setUp(self):
self.mets = OcrdMets(filename=METS_HEROLD)
self.mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/mets.xml'))

def test_unique_identifier(self):
self.assertEqual(self.mets.unique_identifier, 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier')
Expand Down
16 changes: 12 additions & 4 deletions test/model/test_ocrd_page.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@

import sys

from test.base import TestCase, main, assets
import ocrd.model.ocrd_file as ocrd_file
import ocrd.model.ocrd_page as ocrd_page

# pylint: disable=protected-access

class TestOcrdPage(TestCase):

def setUp(self):
with open(assets.url_of('page-with-glyphs.xml').replace('file://', ''), 'rb') as f:
with open(assets.url_of('page-with-glyphs.xml', remove_file=True), 'rb') as f:
self.xml_as_str = f.read()
self.pcgts = ocrd_page.parseString(self.xml_as_str, silence=True)

def test_from_file(self):
f = ocrd_file.OcrdFile(
None,
mimetype='image/tif',
local_filename=assets.url_of('kant_aufklaerung_1784/kant_aufklaerung_1784_0017.tif', remove_file=True)
)
self.assertEqual(f.mimetype, 'image/tif')
p = ocrd_page.from_file(f)
print(ocrd_page.to_xml(p))

def test_pcGtsId(self):
self.assertEqual(self.pcgts.pcGtsId, 'glyph-test')

Expand Down

0 comments on commit 03a8c31

Please sign in to comment.