Skip to content

Commit

Permalink
conversion of TextStyle
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Apr 7, 2021
1 parent 2e1269d commit 99e80c5
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 7 deletions.
19 changes: 16 additions & 3 deletions ocrd_page_to_alto/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
set_alto_shape_from_coords,
setxml
)
from .styles import TextStylesManager

NAMESPACES = {**NAMESPACES_}
NAMESPACES['xsi'] = 'http://www.w3.org/2001/XMLSchema-instance'
Expand Down Expand Up @@ -62,6 +63,7 @@ def __init__(self, *, check_words=True, check_border=True, skip_empty_lines=Fals
raise ValueError("The PAGE-XML to transform contains neither Border nor PrintSpace")
self.alto_alto, self.alto_description, self.alto_styles, self.alto_tags, self.alto_page = self.create_alto()
self.alto_printspace = self.convert_border()
self.textstyle_mgr = TextStylesManager()

def __str__(self):
    """Return the ``alto_alto`` element tree as pretty-printed XML text."""
    serialized = ET.tostring(self.alto_alto, pretty_print=True)
    return serialized.decode('utf-8')
Expand All @@ -83,6 +85,10 @@ def convert(self):
self.convert_metadata()
self.convert_text()
self.convert_reading_order()
self.convert_styles()

def convert_styles(self):
    """Serialize every style registered with the text style manager into ``alto_styles``."""
    manager = self.textstyle_mgr
    manager.to_xml(self.alto_styles)

def convert_reading_order(self):
index_order = [x.id for x in self.page_page.get_AllRegions(order='reading-order', depth=1)]
Expand Down Expand Up @@ -151,13 +157,14 @@ def _convert_textlines(self, reg_alto, reg_page):
if is_empty_line and self.skip_empty_lines:
return
line_alto = ET.SubElement(reg_alto, 'TextLine')
if is_empty_line:
word_alto_empty = ET.SubElement(line_alto, 'String')
word_alto_empty.set('CONTENT', '')
set_alto_id_from_page_id(line_alto, line_page)
set_alto_xywh_from_coords(line_alto, line_page)
set_alto_shape_from_coords(line_alto, line_page)
self.set_alto_styleref_from_textstyle(line_alto, line_page)
# XXX ALTO does not allow TextLine without at least one String
if is_empty_line:
word_alto_empty = ET.SubElement(line_alto, 'String')
word_alto_empty.set('CONTENT', '')
for word_page in line_page.get_Word():
word_alto = ET.SubElement(line_alto, 'String')
set_alto_id_from_page_id(word_alto, word_page)
Expand All @@ -175,6 +182,7 @@ def convert_text(self):
set_alto_id_from_page_id(reg_alto, reg_page)
set_alto_xywh_from_coords(reg_alto, reg_page)
set_alto_shape_from_coords(reg_alto, reg_page)
self.set_alto_styleref_from_textstyle(reg_alto, reg_page)
if reg_page_type == 'Text':
self._convert_textlines(reg_alto, reg_page)
elif reg_page_type == 'Table':
Expand All @@ -186,3 +194,8 @@ def convert_text(self):
else:
raise ValueError('Unhandled region type %s' % reg_page_type)


def set_alto_styleref_from_textstyle(self, reg_alto, reg_page):
    """Point ``reg_alto`` at the ALTO style matching ``reg_page``'s TextStyle.

    Nothing is set when ``reg_page`` has no ``get_TextStyle`` accessor or
    when that accessor returns a falsy value.
    """
    if not hasattr(reg_page, 'get_TextStyle'):
        return
    page_style = reg_page.get_TextStyle()
    if not page_style:
        return
    style_id = self.textstyle_mgr.from_textstyle(page_style)
    reg_alto.set('STYLEREFS', style_id)
92 changes: 92 additions & 0 deletions ocrd_page_to_alto/styles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from lxml import etree as ET

class TextStylesManager():
    """Registry of distinct text styles, serializable as ALTO ``TextStyle`` elements.

    Every style is identified by a string ID built from the values of
    ``fields`` (in that order) joined with ``---``.  Unset fields are
    encoded as ``None`` and spaces as ``%20``, so registering the same
    combination of values twice yields the same ID.

    NOTE: because the ID doubles as the storage format, a field value that
    itself contains ``---`` cannot be decoded correctly by ``styles``.
    """

    # Hex RRGGBB values for named text colours, following
    # https://en.wikipedia.org/wiki/Web_colors.  Names not listed here
    # (presumably e.g. PAGE's "other" -- confirm against the schema) are ignored.
    _NAMED_COLOURS = {
        'white': 'ffffff',
        'black': '000000',
        'red': 'ff0000',
        'brown': '800000',
        'blue': '0000ff',
        'cyan': '00ffff',
        'green': '00ff00',
        'grey': '999999',
        'indigo': '4b0082',
        'magenta': 'ff00ff',
        'orange': 'ffa500',
        'pink': 'ffc0cb',
        'turquoise': '40e0d0',
        'violet': 'ee82ee',
        'yellow': 'ffff00',
    }

    def __init__(self):
        # IDs of all styles registered so far (the ID encodes the field values).
        self._styles = set()
        # Field names in the order in which they are encoded into a style ID.
        self.fields = ('font_family', 'font_type', 'font_width', 'font_size', 'font_color', 'font_style')
        # Tag name of the XML element emitted per style by ``to_xml``.
        self.output_element = 'TextStyle'

    def get_style_id(self, **kwargs):
        """Register the style described by ``kwargs`` and return its ID.

        Args:
            **kwargs: values for any subset of ``self.fields``; omitted
                fields are encoded as ``None``.

        Returns:
            The style ID string.

        Raises:
            ValueError: if a keyword is not one of ``self.fields``.
        """
        if any(k not in self.fields for k in kwargs):
            raise ValueError(f"Unknown fields in {kwargs}")
        key = '---'.join(str(kwargs.get(x)).replace(' ', '%20') for x in self.fields)
        # Adding to a set is idempotent, so no membership test is needed.
        self._styles.add(key)
        return key

    @property
    def styles(self):
        """Mapping of style ID to a dict of its decoded field values.

        All values are strings; unset fields decode to the string ``'None'``.
        IDs are visited in sorted order so that results are reproducible.
        """
        ret = {}
        for key in sorted(self._styles):
            vals = key.split('---')
            ret[key] = {
                field: vals[field_idx].replace('%20', ' ')
                for field_idx, field in enumerate(self.fields)
            }
        return ret

    def from_textstyle(self, textstyle):
        """Register the style equivalent to a PAGE ``TextStyle`` and return its ID.

        Args:
            textstyle: object exposing the PAGE TextStyle attributes read
                below (``fontFamily``, ``serif``, ``monospace``, ``fontSize``,
                ``textColourRgb``, ``textColour``, ``italic``, ``underlined``,
                ``bold``, ``smallCaps``, ``strikethrough``, ``subscript``,
                ``superscript``).

        Returns:
            The style ID string, as produced by ``get_style_id``.
        """
        kwargs = {
            'font_family': textstyle.fontFamily,
            'font_type': 'serif' if textstyle.serif else 'sans-serif',
            'font_width': 'fixed' if textstyle.monospace else 'proportional',
        }
        if textstyle.fontSize:
            kwargs['font_size'] = textstyle.fontSize
        # 0 is a valid colour (black), so test against None rather than truthiness.
        if textstyle.textColourRgb is not None:
            # PAGE encodes the colour as red + 256 * green + 65536 * blue.
            red = textstyle.textColourRgb % 256
            green = (textstyle.textColourRgb // 256) % 256
            blue = (textstyle.textColourRgb // 65536) % 256
            # Zero-pad every channel so the result is always six hex digits.
            kwargs['font_color'] = '%02x%02x%02x' % (red, green, blue)
        # A recognized colour name takes precedence over the RGB value.
        named_rgb = self._NAMED_COLOURS.get(textstyle.textColour) if textstyle.textColour else None
        if named_rgb:
            kwargs['font_color'] = named_rgb
        font_style = []
        if textstyle.italic:
            font_style.append('italics')
        if textstyle.underlined:
            font_style.append('underline')
        for att in ('bold', 'smallCaps', 'strikethrough', 'subscript', 'superscript'):
            if getattr(textstyle, att):
                font_style.append(att.lower())
        if font_style:
            kwargs['font_style'] = ' '.join(font_style)
        # TODO kerning
        # TODO underlineStyle
        # TODO bgColour
        # TODO bgColourRgb
        # TODO reverseVideo
        # TODO xHeight
        # TODO letterSpaced
        return self.get_style_id(**kwargs)

    def to_xml(self, alto_styles):
        """Append one ``self.output_element`` child per registered style to ``alto_styles``.

        Each child carries the style ID as ``ID`` and every set field as an
        upper-cased attribute without underscores (``font_family`` becomes
        ``FONTFAMILY``).  Unset fields are skipped.
        """
        for style_id, style in self.styles.items():
            el_style = ET.SubElement(alto_styles, self.output_element)
            el_style.set('ID', style_id)
            for k, v in style.items():
                # Unset fields round-trip through the ID as the string 'None'.
                if v != 'None':
                    el_style.set(k.replace('_', '').upper(), v)

class ParagraphStyleManager(TextStylesManager):
    """Style registry that emits ``ParagraphStyle`` elements instead of ``TextStyle``.

    Reuses the ID encoding and XML serialization of ``TextStylesManager``
    with a different set of fields and a different output element name.
    """

    def __init__(self):
        # The constructor must be spelled ``__init__`` to run at all; otherwise
        # the overrides below are never applied and the parent's fields are used.
        super().__init__()
        self.fields = ('align', 'left', 'right', 'line_space', 'first_line')
        self.output_element = 'ParagraphStyle'
7 changes: 3 additions & 4 deletions ocrd_page_to_alto/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from lxml import etree as ET
from ocrd_utils import xywh_from_points

def setxml(el, name, val):
    """Set attribute ``name`` of element ``el`` to the string form of ``val``."""
    assign = el.set
    assign(name, str(val))

def set_alto_xywh_from_coords(reg_alto, reg_page, classes=None):
if classes is None:
classes = ['HEIGHT', 'WIDTH', 'HPOS', 'VPOS']
Expand All @@ -21,7 +24,3 @@ def set_alto_shape_from_coords(reg_alto, reg_page):

def set_alto_id_from_page_id(reg_alto, reg_page):
    """Copy ``reg_page.id`` onto ``reg_alto`` as its ``ID`` attribute."""
    page_id = reg_page.id
    setxml(reg_alto, 'ID', page_id)

def setxml(el, name, val):
    """Set attribute ``name`` of element ``el`` to the string form of ``val``."""
    assign = el.set
    assign(name, str(val))

27 changes: 27 additions & 0 deletions tests/test_styles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pytest import raises, main, fixture
from lxml import etree as ET
from ocrd_models.ocrd_page import TextStyleType, to_xml

from ocrd_page_to_alto.styles import TextStylesManager

def test_styles_id():
    """A style ID lists every field in order, with ``None`` for unset ones."""
    manager = TextStylesManager()
    style_id = manager.get_style_id(font_family='Foo')
    assert style_id == 'Foo---None---None---None---None---None'
    registered = manager.styles['Foo---None---None---None---None---None']
    assert registered['font_family'] == 'Foo'

def test_styles_to_xml():
    """Spaces are escaped as ``%20`` in the ID but kept verbatim in FONTFAMILY."""
    manager = TextStylesManager()
    manager.get_style_id(font_family='Foo Serif')
    styles_el = ET.Element('Styles')
    manager.to_xml(styles_el)
    serialized = ET.tostring(styles_el).decode('utf-8')
    assert serialized == '<Styles><TextStyle ID="Foo%20Serif---None---None---None---None---None" FONTFAMILY="Foo Serif"/></Styles>'
    registered = manager.styles['Foo%20Serif---None---None---None---None---None']
    assert registered['font_family'] == 'Foo Serif'

def test_styles_from_textstyle():
    """A PAGE TextStyle is converted into a registered style with matching fields."""
    m = TextStylesManager()
    textstyle = TextStyleType(fontFamily='Times New Roman', serif=True, textColourRgb=6559300)
    style_id = m.from_textstyle(textstyle)
    # The returned ID must refer to a style the manager actually registered.
    style = m.styles[style_id]
    assert style['font_family'] == 'Times New Roman'
    assert style['font_type'] == 'serif'
    # monospace, fontSize and all style flags are unset on the input.
    assert style['font_width'] == 'proportional'
    assert style['font_size'] == 'None'
    assert style['font_style'] == 'None'
    # The colour must be a six-digit hex string; int() raises if it is not hex.
    assert len(style['font_color']) == 6
    int(style['font_color'], 16)

# Allow running this test module directly: hand this file to pytest's ``main``.
if __name__ == "__main__":
    main([__file__])

0 comments on commit 99e80c5

Please sign in to comment.