-
Notifications
You must be signed in to change notification settings - Fork 4
/
ocrd-import
executable file
·280 lines (262 loc) · 8.5 KB
/
ocrd-import
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/usr/bin/env bash
# EXIT handler: delete the METS server socket file from the current directory.
# Errors are not fatal here (errexit is switched off, rm's complaints are discarded).
cleanup() {
  set +e
  rm -f mets.sock 2> /dev/null
}
# Run cleanup (removal of mets.sock) on every exit path of the script.
trap cleanup EXIT
# FIXME: bash says under BUGS "There may be only one active coprocess at a time." which causes spurious warnings here.
# (We therefore temporarily silence stderr to silence the execute_coproc warnings.)
# Keep a duplicate of the real stderr on fd 4, so the loggers below can still
# reach it although stderr of the surrounding group is sent to /dev/null.
exec 4>&2
{
# One coprocess per log level. Each runs `ocrd log` with both its stdout and
# stderr redirected to fd 4. The write end of each coprocess is later available
# as ${NAME[1]} (e.g. ${critical[1]}).
# NOTE(review): the trailing `-` presumably makes `ocrd log` read the messages
# from stdin, and `-n ocrd-import` presumably sets the logger name - confirm
# against the `ocrd log` documentation.
coproc critical { ocrd log -n ocrd-import critical - >& 4 2>& 4; }
coproc error { ocrd log -n ocrd-import error - >& 4 2>& 4; }
coproc warning { ocrd log -n ocrd-import warning - >& 4 2>& 4; }
coproc info { ocrd log -n ocrd-import info - >& 4 2>& 4; }
coproc debug { ocrd log -n ocrd-import debug - >& 4 2>& 4; }
} 2>/dev/null
# Logging helpers, one per severity level.
# Each writes its first argument as a single line to the file descriptor held
# in element 1 of the array variable of the same name (the write end of the
# coprocess started under that name).
# printf is used instead of echo so that messages like "-n" or "-e foo\nbar"
# are passed through literally instead of being interpreted as echo options.
# Arguments: $1 - message text
critical() { printf '%s\n' "$1" >&"${critical[1]}"; }
error() { printf '%s\n' "$1" >&"${error[1]}"; }
warning() { printf '%s\n' "$1" >&"${warning[1]}"; }
info() { printf '%s\n' "$1" >&"${info[1]}"; }
debug() { printf '%s\n' "$1" >&"${debug[1]}"; }
# Refuse to run on bash older than 4.4.
if ((BASH_VERSINFO[0] < 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 4))); then
  critical "bash $BASH_VERSION is too old. Please install 4.4 or newer" && exit 2
fi

# Option defaults (overridden by the command-line parser below).
ignore=0       # 1: keep going after unknown file types (-i)
skip=()        # file name suffixes to skip (-s, repeatable)
regex=()       # only paths matching one of these expressions are included (-R, repeatable)
convert=1      # 0: do not attempt image conversion (-C)
dpi=300        # pixel density for rendering PDFs (-r)
numpageid=1    # 0: derive page IDs from the base name instead of a number (-P)
onlybasename=0 # 1: build IDs from the base name only, without directories (-B)
# Parse the command line. Options are consumed from the front; parsing stops at
# the first word that is not a known option, which stays in "$1" and is later
# taken as DIRECTORY. An empty first word is treated like -h.
while (($#)); do
  case "${1:--h}" in
    -h | --help)
      cat <<EOF
Usage: $(basename "$0") [OPTIONS] [DIRECTORY]
with options:
-i|--ignore keep going after unknown file types
-s|--skip SUFFIX ignore file names ending in given SUFFIX (repeatable)
-R|--regex EXPR only include paths matching given EXPR (repeatable)
-C|--no-convert do not attempt to convert image file types
-r|--render DPI when converting PDFs, render at DPI pixel density
-P|--nonnum-ids do not use numeric pageIds but basename patterns
-B|--basename only use basename for IDs
Create OCR-D workspace meta-data (mets.xml) in DIRECTORY (or $PWD), importing...
* all image files (with known file extension or convertible via ImageMagick) under fileGrp OCR-D-IMG
* all .xml files (if they validate as PAGE-XML) under fileGrp OCR-D-SEG-PAGE
...but failing otherwise.
EOF
      exit
      ;;
    -i | --ignore)
      ignore=1
      shift
      ;;
    # keep just for backwards compatibility:
    # (the option and its value are consumed and discarded)
    -j | --jobs)
      shift
      shift
      ;;
    -s | --skip)
      shift
      skip+=("$1")
      shift
      ;;
    -R | --regex)
      shift
      regex+=("$1")
      shift
      ;;
    -C | --no-convert)
      convert=0
      shift
      ;;
    -r | --render)
      shift
      dpi="$1"
      # The pattern must be anchored: the value is used in shell arithmetic
      # later on, so anything but a pure digit string has to be rejected here.
      [[ "$dpi" =~ ^[0-9]+$ ]] || {
        critical "--render needs a numeric value"
        exit 2
      }
      # force base 10, so a leading zero is not taken as octal in arithmetic
      dpi=$((10#$dpi))
      shift
      ;;
    -P | --nonnum-ids)
      numpageid=0
      shift
      ;;
    -B | --basename)
      onlybasename=1
      shift
      ;;
    *)
      break
      ;;
  esac
done
# Only the first remaining argument is used. "${*:2}" joins the surplus
# arguments into ONE word, so all of them end up in the message.
if (($# > 1)); then
  warning "non-first argument(s) will be ignored: '${*:2}'"
fi
# From here on abort on any unhandled failing command (-e) and let shell
# functions, command substitutions and subshells inherit the ERR trap (-E).
set -eE
# Lookup table from file name extension to MIME type, as printed by ocrd's bashlib.
# NOTE(review): the output is eval'ed, i.e. it is trusted to be the contents of
# an associative array assignment - presumably "[.ext]=type" pairs; confirm
# against `ocrd bashlib constants`.
declare -A MIMETYPES
eval MIMETYPES=( $(ocrd bashlib constants EXT_TO_MIME) )
MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE)
# The directory to import defaults to the current one.
DIRECTORY="${1:-.}"
if ! [[ -d "$DIRECTORY" ]]; then
  critical "not a directory: '$DIRECTORY'"
  exit 1
fi
# avoid damaging/replacing existing workspaces:
# (either a mets.xml directly in DIRECTORY or one in its data/ subdirectory)
if [[ -f "$DIRECTORY"/mets.xml || -f "$DIRECTORY"/data/mets.xml ]]; then
  # report the effective directory; "$1" would be empty when it was defaulted
  critical "Directory '$DIRECTORY' already is a workspace"
  exit 1
fi
# Failure/interrupt handler: give up on the import and remove what it created
# in the current directory. Always terminates the script with status 1.
backout() {
  # the clean-up steps below are best effort, none of them may abort the handler
  set +e
  critical "Cancelled '$DIRECTORY'"
  ocrd workspace -U mets.sock server stop
  # the server may not have been started yet when the failure happened
  if [[ -v PID_SERVER ]]; then
    kill "$PID_SERVER" &> /dev/null
  fi
  rm -f mets.xml
  # only removes the file group directories when they are empty
  rmdir --ignore-fail-on-non-empty OCR-D-IMG OCR-D-SEG-PAGE 2> /dev/null
  popd > /dev/null
  exit 1
}
info "analysing '$DIRECTORY'"
# Enter the target directory first and arm the backout trap only afterwards:
# backout removes mets.xml relative to the current directory, so it must never
# run while the script is still in the caller's directory (which may itself be
# a workspace).
if ! pushd "$DIRECTORY" > /dev/null; then
  critical "cannot enter directory '$DIRECTORY'"
  exit 1
fi
# roll back on any failing command and on interrupt
trap backout ERR INT
ocrd workspace init > /dev/null
# Start the METS server as a background job; all later `ocrd workspace` calls
# address it via the same -U mets.sock argument.
ocrd workspace -U mets.sock server start > /dev/null &
PID_SERVER=$!
# NOTE(review): PID_TOP is not referenced anywhere else in the visible script.
PID_TOP=$$
sleep 1 # wait for server to become available
# file counter and zero padding for the numeric page IDs
num=0 zeros=0000
# Register every regular file below the current directory (symlinks followed,
# mets.xml and *.log excluded, in sorted order) in the METS.
# The list is read with mapfile, one path per line, so file names are neither
# word-split nor glob-expanded.
mapfile -t candidates < <(find -L . -type f -not -name mets.xml -not -name "*.log" | sort)
for file in "${candidates[@]}"; do
  # running number of the file; files filtered out below are counted, too
  num=$((num + 1))
  # numeric page ID, zero-padded to at least 4 digits
  printf -v page 'p%04d' "$num"
  group=OCR-D-IMG
  file="${file#./}"
  # --skip: the suffix is deliberately left unquoted (it is matched as a pattern)
  for suffix in "${skip[@]}"; do
    if [[ "$file" != "${file%$suffix}" ]]; then
      info "skipping file '$file'"
      # leave the suffix loop AND proceed with the next file
      continue 2
    fi
  done
  # --regex: keep the file only if at least one expression matches its path
  if ((${#regex[@]})); then
    match=0
    for expr in "${regex[@]}"; do
      # expressions may have been written with the directory as prefix
      expr="${expr#$DIRECTORY}"
      if [[ "$file" =~ $expr ]]; then
        match=1
        break
      fi
    done
    if ((match)); then
      info "matching file '$file'"
    else
      continue
    fi
  fi
  # NOTE(review): this tests for an empty file NAME; if files with empty
  # CONTENT were meant, `test -s` would be the matching check - confirm.
  if test -z "$file"; then
    warning "ignoring empty file $file"
    continue
  fi
  # guess MIME type
  # (from the lower-cased file name extension, including the dot)
  name="$(basename "$file")"
  suffix=."${name##*.}"
  mimetype=${MIMETYPES[${suffix,,[A-Z]}]}
  # create ID from path
  # (directory components joined by underscores, unless --basename was given)
  base="${name%"$suffix"}"
  name="$(dirname "${file#./}")"
  if test "$name" != . && ! ((onlybasename)); then
    base="${name//\//_}_$base"
  fi
  # XSD ID must start with letter and not contain colons or spaces
  # also, avoid . in IDs, because downstream it will confuse filename suffix detection
  base="${base//[ :.,]/_}"
  if ! [[ ${base:0:1} =~ [a-zA-Z] ]]; then
    base=f${base}
  fi
  if ! ((numpageid)); then
    page=$base
  fi
  #debug "found file '$file' (base=$base page=$page mimetype=$mimetype)"
  case "$mimetype" in
    "${MIMETYPE_PAGE}")
      # FIXME should really validate this is PAGE-XML (cf. core#353)
      # For now: look for the namespace URL and the root element name.
      if grep -Fq -e http://schema.primaresearch.org/PAGE/gts/pagecontent/ -- "$file" \
        && grep -Fqw -e 'PcGts' -- "$file"; then
        group=OCR-D-SEG-PAGE
      elif grep -Fq -e http://www.loc.gov/standards/alto/ -- "$file" \
        && grep -Fqw -e alto -- "$file"; then
        mimetype=application/alto+xml
        group=OCR-D-SEG-ALTO
      elif ((ignore)); then
        warning "unknown type of file '$file'"
        continue
      else
        critical "unknown type of file '$file'"
        # fail on purpose: triggers the ERR trap
        false
      fi
      ;;
    image/tiff | image/jpeg | image/png)
      # directly supported
      ;;
    *)
      # anything else: try to convert it to TIFF first
      case "$suffix" in
        .pdf | .PDF)
          # read PDFs at twice the target density, then resample down to it
          inopts=(-units PixelsPerInch -density $((2*$dpi)))
          outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample "$dpi" -density "$dpi")
          ;;
        *)
          inopts=()
          outopts=()
          ;;
      esac
      if ((convert)) \
        && mkdir -p OCR-D-IMG \
        && warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" \
        && convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then
        mimetype=image/tiff
        mapfile -t files < <(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)
        info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)"
        if ((${#files[@]} > 1)); then
          # multi-image result: one page per image, page ID suffixed by the
          # 4-digit image number taken from the end of the file name
          # NOTE(review): unlike the single-image branch below, the path given
          # to `add` here has the OCR-D-IMG/ prefix stripped - confirm that
          # ocrd resolves it as intended.
          for file in "${files[@]}"; do
            file="${file#OCR-D-IMG/}"
            base="${file%.tif}"
            info "adding -g ${page}_${base:(-4)} -G $group -m $mimetype -i $base '$file'"
            ocrd workspace -U mets.sock add -G "$group" -m "$mimetype" -g "${page}_${base:(-4)}" -i "$base" "$file"
          done
          # there's no danger of clashes with other files here
          continue
        else
          file="${files[0]}"
          file="${file#./}"
        fi
      elif ((ignore)); then
        warning "unknown type of file '$file'"
        continue
      else
        critical "unknown type of file '$file'"
        # fail on purpose: triggers the ERR trap
        false
      fi
      ;;
  esac
  # file IDs must contain fileGrp, otherwise processors will have to prevent
  # ID clashes by using numeric IDs
  if [[ "$base" != "$group"* ]]; then
    base=${group}_"$base"
  fi
  # finally, add the file to the METS
  info "adding -g $page -G $group -m $mimetype -i $base '$file'"
  ocrd workspace -U mets.sock add -G "$group" -m "$mimetype" -g "$page" -i "$base" "$file"
done
# All files are registered: from now on ERR is ignored, so a late failure
# no longer runs backout.
trap '' ERR
# persist METS
# (done by stopping the server; then wait for the background job to terminate)
ocrd workspace -U mets.sock server stop
wait "$PID_SERVER"
# Both file group directories should exist on disk even if nothing was put
# into them (useful for ocrd-make).
mkdir -p OCR-D-IMG OCR-D-SEG-PAGE
# return to the directory the script was started from
popd > /dev/null
info "Success on '$DIRECTORY'"