diff --git a/.pylintrc b/.pylintrc index ce764b9f9..6c8d9e72b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -5,6 +5,7 @@ ignored-modules=cv2,tesserocr,ocrd.model [MESSAGES CONTROL] ignore-patterns='.*generateds.*' disable = + inconsistent-return-statements, ungrouped-imports, missing-docstring, no-self-use, diff --git a/ocrd/model/ocrd_file.py b/ocrd/model/ocrd_file.py index b4a6ed42a..ad28987bb 100644 --- a/ocrd/model/ocrd_file.py +++ b/ocrd/model/ocrd_file.py @@ -1,5 +1,5 @@ import os -from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT +from ocrd.constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE from .ocrd_xml_base import ET @@ -12,9 +12,13 @@ class OcrdFile(object): # def create(mimetype, ID, url, local_filename): # el_fileGrp.SubElement('file') - def __init__(self, el, instance=None, local_filename=None, workspace=None): + def __init__(self, el, mimetype=None, instance=None, local_filename=None, workspace=None): + if el is None: + el = ET.Element(TAG_METS_FILE) self._el = el + self.mimetype = mimetype self.local_filename = local_filename + self._instance = instance self.workspace = workspace diff --git a/ocrd/model/ocrd_page.py b/ocrd/model/ocrd_page.py index ca3354692..1bfafa899 100644 --- a/ocrd/model/ocrd_page.py +++ b/ocrd/model/ocrd_page.py @@ -20,18 +20,24 @@ def to_xml(el): el.export(sio, 0, name_='PcGts', namespacedef_='xmlns:pc="%s"' % NAMESPACES['page']) return '\n' + sio.getvalue() +def page_from_image(input_file): + if input_file.local_filename is None: + raise Exception("input_file must have 'local_filename' property") + exif = OcrdExif.from_filename(input_file.local_filename) + content = PAGE_XML_EMPTY.replace('', '' % ( + exif.width, + exif.height, + input_file.url + )) + return content + def from_file(input_file): """ Create a new PAGE-XML from a METS file representing a PAGE-XML or an image. """ # print("PARSING PARSING '%s'" % input_file) if input_file.mimetype.startswith('image'): - exif = OcrdExif.from_filename(input_file.local_filename) - content = PAGE_XML_EMPTY.replace('', '' % ( - exif.width, - exif.height, - input_file.url - )) + content = page_from_image(input_file) return parseString(content.encode('utf-8'), silence=True) elif input_file.mimetype == 'text/page+xml': return parse(input_file.local_filename, silence=True) diff --git a/test/model/test_ocrd_mets.py b/test/model/test_ocrd_mets.py index 194292a27..0844cdfd8 100644 --- a/test/model/test_ocrd_mets.py +++ b/test/model/test_ocrd_mets.py @@ -1,12 +1,11 @@ -from ocrd.model import OcrdMets - from test.base import TestCase, main, assets -METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml') + +from ocrd.model import OcrdMets class TestOcrdMets(TestCase): def setUp(self): - self.mets = OcrdMets(filename=METS_HEROLD) + self.mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/mets.xml')) def test_unique_identifier(self): self.assertEqual(self.mets.unique_identifier, 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier') diff --git a/test/model/test_ocrd_page.py b/test/model/test_ocrd_page.py index e78775ace..b6584dead 100644 --- a/test/model/test_ocrd_page.py +++ b/test/model/test_ocrd_page.py @@ -1,7 +1,5 @@ - -import sys - from test.base import TestCase, main, assets +import ocrd.model.ocrd_file as ocrd_file import ocrd.model.ocrd_page as ocrd_page # pylint: disable=protected-access @@ -9,10 +7,20 @@ class TestOcrdPage(TestCase): def setUp(self): - with open(assets.url_of('page-with-glyphs.xml').replace('file://', ''), 'rb') as f: + with open(assets.url_of('page-with-glyphs.xml', remove_file=True), 'rb') as f: self.xml_as_str = f.read() self.pcgts = ocrd_page.parseString(self.xml_as_str, silence=True) + def test_from_file(self): + f = ocrd_file.OcrdFile( + None, + mimetype='image/tif', + local_filename=assets.url_of('kant_aufklaerung_1784/kant_aufklaerung_1784_0017.tif', remove_file=True) + ) + self.assertEqual(f.mimetype, 'image/tif') + p = ocrd_page.from_file(f) + print(ocrd_page.to_xml(p)) + def test_pcGtsId(self): self.assertEqual(self.pcgts.pcGtsId, 'glyph-test')