-
Notifications
You must be signed in to change notification settings - Fork 8
/
ocrd-olena-binarize
executable file
·474 lines (431 loc) · 18 KB
/
ocrd-olena-binarize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
#!/usr/bin/env bash
# shellcheck disable=SC2086
set -eu
set -o pipefail
#set -x
### binarization with Olena
#
# Finds and downloads all files in the input fileGrp
# of the workspace. Then for each file, determines its
# MIME type:
# If PAGE-XML, queries the input file for an existing
# AlternativeImage on the Page level, otherwise uses
# imageFilename. Passes the image file to the scribo
# binarization program for the chosen algorithm. The
# resulting file will reside in OCR-D-IMG-BIN and its
# ID will be derived from the input file's ID. Finally,
# adds a reference to the new file by appending some
# AlternativeImage to the PAGE-XML.
# If an image, creates a skeleton PAGE-XML for that
# imageFilename. Passes the image file to scribo
# likewise. Also adds a reference to the new file
# by inserting an AlternativeImage to the PAGE-XML.
# Regardless, adds new image and new PAGE-XML to METS.
#
# Note: this only works on the page level. Going down
# the element hierarchy would necessitate emulating
# coordinate transforms for polygon/bbox cropping and
# multi-level rotation/reflection in bash. If you need
# binarization on the region or line level with Olena,
# consider using ocrd-preprocess-image with scribo-cli
# as command parameter instead.
# --- Environment checks and global configuration ---------------------------
# The OCR-D command line tool must be available. `command -v` is a shell
# builtin lookup, so this check does not depend on an external `which` binary.
if ! command -v ocrd >/dev/null 2>&1; then
    echo >&2 "ocrd not in \$PATH. Panicking"
    exit 1
fi
# Require bash 4.4 or newer.
if ((BASH_VERSINFO[0] < 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 4))); then
    echo >&2 "bash $BASH_VERSION is too old. Please install bash 4.4 or newer."
    exit 1
fi
# Directory this script lives in; main passes "$SHAREDIR/ocrd-tool.json" to ocrd__wrap.
SHAREDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
# Interpreter used for the small floating-point calculations below.
PYTHON=python3
FALLBACK_IMAGE_FILEGRP="OCR-D-IMG-BIN"
PRESERVE_NAMESPACE=1 # 1 preserves the input file's PAGE namespace prefix and URL (version)
MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE)
# Associative array of XML namespace URLs (keys used in this file: mets, xlink, xsl, page).
declare -A NAMESPACES
# NOTE(review): eval of tool output; presumably `ocrd bashlib constants NAMESPACES`
# prints `[key]=value` pairs suitable for an array literal -- confirm against OCR-D core.
eval "NAMESPACES=( $(ocrd bashlib constants NAMESPACES) )"
function xywh_from_points {
    # Compute the axis-aligned bounding box of a list of "x,y" points.
    # Arguments: the points, either as separate words or as whitespace-separated
    #            lists inside the arguments
    # Outputs:   "width height left top" on stdout
    # FIXME: add a bashlib wrapper for utils (coordinate conversion) to use here
    # All working variables are local so that calling this function outside a
    # subshell cannot clobber same-named globals.
    local point pointx pointy
    local minx=$((2**32)) miny=$((2**32))
    local maxx=0 maxy=0
    # $* is deliberately unquoted: a single argument like "1,2 3,4" must still
    # be split into individual points.
    for point in $*; do
        pointx=${point%,*}
        pointy=${point#*,}
        if ((pointx < minx)); then minx=$pointx; fi
        if ((pointx > maxx)); then maxx=$pointx; fi
        if ((pointy < miny)); then miny=$pointy; fi
        if ((pointy > maxy)); then maxy=$pointy; fi
    done
    echo $((maxx - minx)) $((maxy - miny)) $minx $miny
}
function iso8601_date () {
    # Print the current local date and time as YYYY-MM-DDTHH:MM:SS
    # (no time zone suffix).
    date '+%Y-%m-%dT%H:%M:%S'
}
function image_id_from_fpath {
    # Look up the METS file ID of an image by its FLocat href.
    # Globals:   NAMESPACES (read), ocrd__argv[mets_file] (read)
    # Arguments: $1 - image path/URL as referenced in PAGE
    #            $2 - ID of the input file (for the fallback and the log message)
    #            $3 - pageId of the input file (for the log message)
    # Outputs:   the first matching mets:file ID, or "<in_id>-IMG" if none is found
    local image_in_fpath="$1" in_id="$2" in_pageId="$3"
    local image_in_id= matches
    local -a options
    # find image ID representing the image file in METS
    # FIXME: extend `ocrd workspace find` with a filter option for FLocat to use here
    # NOTE(review): the path is spliced into the XPath string literal, so a
    # path containing an apostrophe would break the query.
    options=( sel
              -N "mets=${NAMESPACES[mets]}"
              -N "xlink=${NAMESPACES[xlink]}"
              -t -v "//mets:file[starts-with(@MIMETYPE,'image/') and ./mets:FLocat/@xlink:href='${image_in_fpath}']/@ID"
              "${ocrd__argv[mets_file]}" )
    # Capture the whole result and take its first line here, instead of piping
    # into `head -1`: the outcome then neither depends on `pipefail` being set
    # nor can an early-exiting `head` turn a successful lookup into a failure.
    if matches=$(xmlstarlet "${options[@]}"); then
        image_in_id=${matches%%$'\n'*}
    fi
    if [[ -z "$image_in_id" ]]; then
        ocrd log warning "image URL '${image_in_fpath}' not referenced in METS for input file ID=${in_id} (pageId=${in_pageId})"
        image_in_id="${in_id}-IMG" # fallback
    fi
    echo "$image_in_id"
}
function auto_winsize {
    # Derive an extra "--win-size N" option for scribo-cli from the image resolution.
    # Globals:   params[impl], params[win-size], params[dpi] (read), PYTHON (read)
    # Arguments: $1 - path of the image whose resolution metadata is inspected
    #            $2 - pageId (for the log message)
    # Outputs:   an empty line if the implementation is otsu or an explicit
    #            non-zero win-size was given; otherwise "--win-size <odd integer>"
    local image_in_fpath="$1" in_pageId="$2"
    local dpi= units= size
    if [[ "${params[impl]}" == 'otsu' ]]; then
        echo
        return
    fi
    if ((${params[win-size]})); then
        # explicit window size configured: nothing to override
        echo
        return
    fi
    # Get window size from image DPI
    if ((${params[dpi]})); then
        dpi=${params[dpi]}
    else
        # now get DPI meta-data (a failing identify leaves both variables empty)
        read -r dpi units < <(identify -format "%x %U\n" "${image_in_fpath}[0]") || true
        case "$units" in
            PixelsPerCentimeter)
                dpi=$($PYTHON -c "print(2.54*$dpi)");;
            PixelsPerInch)
                ;;
            *)
                # fallback: units are "Undefined", or identify gave no usable
                # output; an empty dpi would otherwise yield "--win-size"
                # without a value
                dpi=300;;
        esac
    fi
    # The window spans one inch for the sauvola variants, a sixth of an inch otherwise
    case ${params[impl]} in
        sauvola*)
            size=$dpi
            ;;
        *)
            size=$dpi/6
            ;;
    esac
    # Convert to odd integer
    size=$($PYTHON -c "print(int($size) + int(($size+1)%2))")
    ocrd log debug "Using $dpi DPI-derived window size $size for page ${in_pageId}"
    echo --win-size $size
    return
}
function modify_page {
    # Edit a PAGE-XML file in place: set pcGtsId and LastChange, add a
    # MetadataItem describing this processing step (with one Label per runtime
    # parameter), and add an AlternativeImage referencing the binarized image.
    # Globals:   params (read)
    # Arguments: $1 - PAGE namespace URL of the document
    #            $2 - namespace prefix for new elements (may be empty)
    #            $3 - path of the new image file
    #            $4 - path of the PAGE-XML file to modify
    #            $5 - value for AlternativeImage/@comments
    #            $6 - new pcGtsId
    local namespace="$1" ns_prefix="$2" image_out_fpath="$3" out_fpath="$4" comments="$5" out_id="$6"
    local key
    local -a options=()
    # shellcheck disable=SC2016
    options+=( --no-doc-namespace ed --inplace
               -N "pc=${namespace}"
               # set pcGtsId:
               -u '/pc:PcGts/@pcGtsId'
               -v "${out_id}"
               # update LastChange date stamp:
               -u '/pc:PcGts/pc:Metadata/pc:LastChange'
               -v "$(iso8601_date)"
               # insert MetadataItem with runtime parameters:
               -s '/pc:PcGts/pc:Metadata'
               -t elem -n "${ns_prefix}MetadataItem"
               # bind previous element to "new-item":
               --var new-item '$prev'
               -s '$new-item' -t attr -n type
               -v "processingStep"
               -s '$new-item' -t attr -n name
               -v "preprocessing/optimization/binarization"
               -s '$new-item' -t attr -n value
               -v "ocrd-olena-binarize"
               # add "Labels":
               -s '$new-item' -t elem -n "${ns_prefix}Labels"
               # bind previous element to "new-labels":
               --var new-labels '$prev' )
    # Keys and values are quoted so that an empty value or one containing
    # whitespace stays a single argument and cannot shift the option list.
    for key in "${!params[@]}"; do
        # shellcheck disable=SC2016
        options+=( # add another "Label":
                   -s '$new-labels' -t elem -n "${ns_prefix}Label"
                   # bind previous element to "new-label":
                   --var new-label '$prev'
                   -s '$new-label' -t attr -n value
                   -v "${params[$key]}"
                   -s '$new-label' -t attr -n type
                   -v "$key" )
    done
    # insert/append AlternativeImage:
    if xmlstarlet --no-doc-namespace sel \
                  -N "pc=${namespace}" --quiet \
                  -t -v '/pc:PcGts/pc:Page/pc:AlternativeImage' \
                  "$out_fpath"; then
        # AlternativeImage exists: append after
        options+=( -a '/pc:PcGts/pc:Page/pc:AlternativeImage[last()]' )
    elif xmlstarlet --no-doc-namespace sel \
                    -N "pc=${namespace}" --quiet \
                    -t -v '/pc:PcGts/pc:Page/*' \
                    "$out_fpath"; then
        # something exists: insert before
        options+=( -i '/pc:PcGts/pc:Page/*[1]' )
    else
        # nothing here yet: append subnode
        options+=( -s '/pc:PcGts/pc:Page' )
    fi
    # shellcheck disable=SC2016
    options+=( -t elem -n "${ns_prefix}AlternativeImage"
               --var new-image '$prev'
               -s '$new-image' -t attr -n filename
               -v "$image_out_fpath"
               -s '$new-image' -t attr -n comments
               -v "$comments"
               "$out_fpath" )
    xmlstarlet "${options[@]}"
}
function process_pagefile {
    # Binarize the image referenced by one PAGE-XML input file.
    # Copies (or namespace-converts) the input PAGE to $out_fpath, picks the
    # last non-binarized AlternativeImage of the Page (or else imageFilename),
    # crops to the Border polygon's bounding box unless the image is already
    # marked as cropped, runs scribo-cli, writes a record for the new image to
    # file descriptor 3 and references the new image in the output PAGE.
    # Globals:   PRESERVE_NAMESPACE, NAMESPACES, params, scribo_options (read)
    # Arguments: $1 - input PAGE path       $2 - input file ID
    #            $3 - pageId                $4 - output PAGE path
    #            $5 - output file ID        $6 - output fileGrp (directory)
    # Outputs:   one line "<fileGrp> image/png <pageId> <imageId> <imagePath>" on fd 3
    local in_fpath="$1" in_id="$2" in_pageId="$3" out_fpath="$4" out_id="$5" out_file_grp="$6"
    local image_in_fpath image_in_id image_out_fpath image_out_id comments
    local namespace ns_prefix border original_image_in_fpath scribo_extra stylesheet
    local width height left top
    local tmpfile=
    local -a options

    if ((PRESERVE_NAMESPACE)); then
        # preserve namespace and prefix
        cat <"$in_fpath" >"$out_fpath"
    else
        # stylesheet transforms to standard namespace; it is written to a
        # temporary file so that no stray file is left in the workspace:
        stylesheet=$(mktemp --tmpdir ocrd-olena-binarize-xsl.XXXXXX)
        cat <<EOF >"$stylesheet"
<?xml version="1.0" encoding="ISO-8859-1"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="${NAMESPACES[xsl]}"
xmlns="${NAMESPACES[page]}">
<xsl:output method="xml" version="1.0"
encoding="UTF-8" indent="yes"/>
<xsl:template match="@*|text()|comment()|processing-instruction()">
<xsl:copy/>
</xsl:template>
<xsl:template match="*">
<xsl:element name="{local-name()}">
<xsl:apply-templates select="@*|*"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>
EOF
        xmlstarlet tr "$stylesheet" <"$in_fpath" >"$out_fpath"
        rm -f "$stylesheet"
    fi

    # to become independent of whether and what
    # namespace prefix is used for PAGE-XML,
    # we first have to know the namespace:
    namespace=$(xmlstarlet sel -t -m '/*[1]' -v 'namespace-uri()' "$out_fpath")
    # now (using --no-doc-namespace) we can
    # safely query with -N pc=${namespace}
    # and safely add with prefix ${ns_prefix}:
    ns_prefix=$(xmlstarlet sel -t -m '/*[1]' -v 'substring-before(name(),"PcGts")' "$out_fpath"; true)

    # find image file representing the page in PAGE
    options=( --no-doc-namespace sel
              -N "pc=${namespace}" -t
              -v '/pc:PcGts/pc:Page/pc:AlternativeImage[not(contains(@comments,"binarized"))][last()]/@filename'
              "$out_fpath" )
    if image_in_fpath=$(xmlstarlet "${options[@]}"); then
        options=( --no-doc-namespace sel
                  -N "pc=${namespace}" -t
                  -v "/pc:PcGts/pc:Page/pc:AlternativeImage[@filename=\"${image_in_fpath}\"]/@comments"
                  "$out_fpath" )
        # an AlternativeImage without @comments is fine (query fails, comments stay empty)
        comments=$(xmlstarlet "${options[@]}") || comments=
        ocrd log info "found AlternativeImage filename '${image_in_fpath}' ($comments) for input file ID=${in_id} (pageId=${in_pageId})"
        comments="$comments"${comments:+,}binarized
    else
        options=( --no-doc-namespace sel
                  -N "pc=${namespace}" -t
                  -v '/pc:PcGts/pc:Page/@imageFilename'
                  "$out_fpath" )
        image_in_fpath=$(xmlstarlet "${options[@]}")
        ocrd log info "found imageFilename '${image_in_fpath}' for input file ID=${in_id} (pageId=${in_pageId})"
        comments=binarized
    fi
    # Look the image up in METS exactly once, with the reference as written in
    # PAGE, and only then reduce it to a local path. (A second lookup with the
    # already stripped path would override the first result.)
    image_in_id=$(image_id_from_fpath "$image_in_fpath" "$in_id" "$in_pageId")
    image_in_fpath="${image_in_fpath#file://}"

    if [[ "$comments" =~ cropped ]]; then
        ocrd log debug "Using page border in input file ID=${in_id} (pageId=${in_pageId})"
    else
        options=( --no-doc-namespace sel
                  -N "pc=${namespace}" -t
                  -v '/pc:PcGts/pc:Page/pc:Border/pc:Coords/@points'
                  "$out_fpath" )
        # FIXME: add a bashlib wrapper for workspace.image_from_page to use here
        if border=$(xmlstarlet "${options[@]}"); then
            ocrd log debug "Cropping to page border '$border' in input file ID=${in_id} (pageId=${in_pageId})"
            tmpfile=$(mktemp --tmpdir ocrd-olena-binarize-cropped.XXXXXX)
            # $border is deliberately unquoted: one argument per point
            xywh_from_points $border | {
                read -r width height left top
                convert "${image_in_fpath}[0]" -crop "${width}x${height}+${left}+${top}" "$tmpfile"
            }
            image_in_fpath="$tmpfile"
            comments="${comments%binarized}cropped,binarized"
        fi
    fi

    # set output names
    image_out_id="${image_in_id}-BIN_${params[impl]}"
    image_out_fpath="${out_file_grp}/${image_out_id}.png"

    # override win-size=0 from DPI
    # (due to OCR-D/core#343, we cannot use $image_in_fpath for this
    #  (i.e. the derived image), but have to look at the original image)
    options=( --no-doc-namespace sel
              -N "pc=${namespace}" -t
              -v '/pc:PcGts/pc:Page/@imageFilename'
              "$out_fpath" )
    original_image_in_fpath=$(xmlstarlet "${options[@]}")
    original_image_in_fpath="${original_image_in_fpath#file://}"
    # best effort: without a derived window size, the options set up in main apply
    scribo_extra=$(auto_winsize "${original_image_in_fpath}" "${in_pageId}") || scribo_extra=

    # ${scribo_extra} is deliberately unquoted: it is empty or "--win-size N"
    scribo-cli "${params[impl]}" "${image_in_fpath}" "${image_out_fpath}" "${scribo_options[@]}" ${scribo_extra}

    # Remove temporary image file, if any
    if [ -n "$tmpfile" ]; then
        rm "$tmpfile"
    fi

    # Add image file to METS: instead of running `ocrd workspace add` for every
    # file, write one record to fd 3
    # (fields: fileGrp, MIME type, pageId, file ID, local filename)
    echo "${out_file_grp}" image/png "${in_pageId}" "${image_out_id}" "${image_out_fpath}" >&3

    # Reference image file in PAGE
    modify_page "$namespace" "$ns_prefix" "$image_out_fpath" "$out_fpath" "$comments" "$out_id"
    return 0
}
function process_imagefile {
    # Binarize one image input file and create a skeleton PAGE-XML for it.
    # Runs scribo-cli on the image, writes a record for the new image to file
    # descriptor 3, generates a minimal PAGE-XML at $out_fpath that names the
    # input image, and references the binarized image in it via modify_page.
    # Globals:   NAMESPACES, params, scribo_options (read)
    # Arguments: $1 - input image path/URL  $2 - input file ID
    #            $3 - pageId                $4 - output PAGE path
    #            $5 - output file ID        $6 - output fileGrp (directory)
    # Outputs:   one line "<fileGrp> image/png <pageId> <imageId> <imagePath>" on fd 3
    local in_fpath="$1" in_id="$2" in_pageId="$3" out_fpath="$4" out_id="$5" out_file_grp="$6"
    local image_in_fpath image_out_fpath image_out_id scribo_extra
    local imageWidth imageHeight
    local -a imageGeometry

    image_in_fpath="${in_fpath#file://}"
    image_out_id="${out_id}-BIN_${params[impl]}"
    image_out_fpath="${out_file_grp}/${image_out_id}.png"

    # override win-size=0 from DPI
    # best effort: without a derived window size, the options set up in main apply
    scribo_extra=$(auto_winsize "${image_in_fpath}" "${in_pageId}") || scribo_extra=

    # ${scribo_extra} is deliberately unquoted: it is empty or "--win-size N"
    scribo-cli "${params[impl]}" "${image_in_fpath}" "${image_out_fpath}" "${scribo_options[@]}" ${scribo_extra}

    # Add image file to METS: instead of running `ocrd workspace add` for every
    # file, write one record to fd 3
    # (fields: fileGrp, MIME type, pageId, file ID, local filename)
    echo "${out_file_grp}" image/png "${in_pageId}" "${image_out_id}" "${image_out_fpath}" >&3

    # Reference image file in PAGE
    # FIXME: add a bashlib wrapper for OcrdExif to use here
    # Query the local path (without any file:// prefix), just like scribo-cli above.
    imageGeometry=($(identify -format "%[fx:w] %[fx:h]" "${image_in_fpath}[0]"))
    imageWidth=${imageGeometry[0]}
    imageHeight=${imageGeometry[1]}
    # FIXME: add a bashlib wrapper for page_from_file to use here
    cat <<EOF > "$out_fpath"
<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns:xsl="${NAMESPACES[xsl]}"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="${NAMESPACES[page]}"
xsi:schemaLocation="${NAMESPACES[page]} ${NAMESPACES[page]}/pagecontent.xsd"
pcGtsId="${out_id}">
<Metadata>
<Creator>OCR-D/core $(versionstring=($(ocrd --version)); echo ${versionstring[-1]})</Creator>
<Created>$(iso8601_date)</Created>
<LastChange/>
</Metadata>
<Page imageFilename="$in_fpath" imageWidth="$imageWidth" imageHeight="$imageHeight" type="content"/>
</PcGts>
EOF
    modify_page "${NAMESPACES[page]}" "" "$image_out_fpath" "$out_fpath" "binarized" "$out_id"
    return 0
}
# Path for the named pipe through which main feeds `ocrd workspace bulk-add`.
# NOTE(review): `mktemp -u` only generates a name without creating anything, so
# there is a window until `mkfifo` in main in which the name could be taken.
FIFO=$(mktemp -u)
function backout {
    # Error handler (installed by main via `trap backout ERR`): stop background
    # jobs, close the pipe's write end, remove the pipe and exit with status 1.
    # Globals: FIFO (read)
    local pids
    pids=$(jobs -p)
    # Calling kill without any PID is an error; under `set -e` that would end
    # the handler before the cleanup below. Jobs that already exited are ignored.
    if [[ -n "$pids" ]]; then
        # $pids is deliberately unquoted: one argument per PID
        kill $pids 2>/dev/null || true
    fi
    wait &>/dev/null || true
    exec 3>&-
    rm -f "$FIFO"
    exit 1
}
function main {
    # Entry point: parse the OCR-D command line, translate parameters into
    # scribo-cli options, then process every input file. METS records for all
    # new files are streamed through a named pipe into one background
    # `ocrd workspace bulk-add` job.
    # Globals:   SHAREDIR, PYTHON, MIMETYPE_PAGE, FIFO (read);
    #            scribo_options, mets_basename, out_file_grp (written);
    #            params, ocrd__argv, ocrd__files (read after ocrd__wrap)
    # Arguments: the command line arguments of the script
    # Returns:   the exit status of the bulk-add job
    local n bulk_add_pid

    # Load ocrd bashlib functions
    # shellcheck source=../core/ocrd/bashlib/lib.bash
    source "$(ocrd bashlib filename)"
    ocrd__wrap "$SHAREDIR/ocrd-tool.json" "ocrd-olena-binarize" "$@"
    ocrd__minversion 2.58.1

    scribo_options=(--enable-negate-output)
    # `;;&` keeps testing the remaining patterns, so every implementation
    # except otsu also reaches the final `*)` arm.
    case ${params[impl]} in
        otsu)
            : # has no options whatsoever
            ;;
        niblack)
            scribo_options+=(--disable-negate-input)
            # has default -0.2 not 0.34
            scribo_options+=(--k $($PYTHON -c "print(${params[k]}/-1.7)"))
            ;;& # get more
        sauvola|kim|wolf)
            scribo_options+=(--k ${params[k]})
            ;;& # get more
        singh)
            # has default 0.06 not 0.34
            scribo_options+=(--k $($PYTHON -c "print(${params[k]}*0.1765)"))
            ;;& # get more
        sauvola-ms*)
            scribo_options+=(--k2 $($PYTHON -c "print(${params[k]}/0.34*0.2)")
                             --k3 $($PYTHON -c "print(${params[k]}/0.34*0.3)")
                             --k4 $($PYTHON -c "print(${params[k]}/0.34*0.5)"))
            ;;& # get more
        *)
            scribo_options+=(--win-size ${params[win-size]})
            ;;
    esac

    cd "${ocrd__argv[working_dir]}"
    mets_basename=$(basename "${ocrd__argv[mets_file]}")
    out_file_grp=${ocrd__argv[output_file_grp]}

    mkfifo "$FIFO"
    # NOTE(review): without `set -E`, bash does not inherit an ERR trap into
    # called functions -- confirm whether failures inside process_pagefile /
    # process_imagefile are meant to reach backout as well.
    trap backout ERR
    # each line written to the pipe is one record:
    # fileGrp, MIME type, pageId, file ID, local filename
    bulk_options=( -r '(?P<grp>[^ ]+) (?P<mime>[^ ]+) (?P<page>[^ ]+) (?P<file>[^ ]+) (?P<local_filename>.*)')
    bulk_options+=( -G '{{ grp }}' -m '{{ mime }}' -g '{{ page }}' -i '{{ file }}' -l '{{ local_filename }}')
    if [[ "${ocrd__argv[overwrite]}" == true ]]; then
        bulk_options+=( --force )
    fi
    workspace_options=( -m "${mets_basename}" )
    if [[ -n "${ocrd__argv[mets_server_url]}" ]]; then
        workspace_options+=( --mets-server-url "${ocrd__argv[mets_server_url]}" )
    fi
    ocrd workspace "${workspace_options[@]}" bulk-add "${bulk_options[@]}" - <"$FIFO" &
    bulk_add_pid=$!
    # keep the write end open on fd 3 for the whole loop
    exec 3>"$FIFO"

    for ((n=0; n<${#ocrd__files[*]}; n++)); do
        local in_fpath="$(ocrd__input_file $n local_filename)"
        local in_id="$(ocrd__input_file $n ID)"
        local in_mimetype="$(ocrd__input_file $n mimetype)"
        local in_pageId="$(ocrd__input_file $n pageId)"
        local out_id="$(ocrd__input_file $n outputFileId)"
        local out_fpath="$out_file_grp/${out_id}.xml"

        if ! test -f "${in_fpath#file://}"; then
            ocrd log error "input file ID=${in_id} (pageId=${in_pageId} MIME=${in_mimetype}) is not on disk"
            continue
        fi
        mkdir -p "$out_file_grp"

        if [[ "$in_mimetype" == "$MIMETYPE_PAGE" ]]; then
            ocrd log info "processing PAGE-XML input file $in_id ($in_pageId)"
            process_pagefile "$in_fpath" "$in_id" "$in_pageId" "$out_fpath" "$out_id" "$out_file_grp"
        elif [[ "$in_mimetype" == image/* ]]; then
            ocrd log info "processing $in_mimetype input file $in_id ($in_pageId)"
            process_imagefile "$in_fpath" "$in_id" "$in_pageId" "$out_fpath" "$out_id" "$out_file_grp"
        else
            ocrd log error "input file ID=${in_id} (pageId=${in_pageId} MIME=${in_mimetype}) is neither an image nor a PAGE-XML file"
            continue
        fi

        # Add PAGE file to METS (as a record for the bulk-add job, instead of
        # running `ocrd workspace add` for every file)
        echo "${out_file_grp}" "${MIMETYPE_PAGE}" "${in_pageId}" "${out_id}" "${out_fpath}" >&3
    done

    # closing the write end signals end of input to the bulk-add job
    exec 3>&-
    rm -f "$FIFO"
    # Wait for that specific job so its exit status is propagated: a bare
    # `wait` returns 0 even if bulk-add failed and METS was never updated.
    wait "$bulk_add_pid"
}
# Script entry point: run main with all command line arguments.
main "$@"