forked from ocropus-archive/DUP-ocropy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocropus-errs
executable file
·60 lines (52 loc) · 2.39 KB
/
ocropus-errs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse,sys,os,os.path,multiprocessing
import ocrolib
from ocrolib import edist
parser = argparse.ArgumentParser(description = """
Compute the edit distances between ground truth and recognizer output.
Run with the ground truth files as arguments, and it will find the
corresponnding recognizer output files using the given extension (-x).
Missing output files are handled as empty strings, unless the -s
option is given.
""")
parser.add_argument("files",default=[],nargs='*',help="input lines")
parser.add_argument("-x","--extension",default=".txt",help="extension for recognizer output, default: %(default)s")
# parser.add_argument("-g","--gtextension",default=".gt.txt",help="extension for ground truth, default: %(default)s")
parser.add_argument("-k","--kind",default="exact",help="kind of comparison (exact, nospace, letdig, letters, digits, lnc), default: %(default)s")
parser.add_argument("-e","--erroronly",action="store_true",help="only output an error rate")
parser.add_argument("-s","--skipmissing",action="store_true",help="don't use missing or empty output files in the calculation")
parser.add_argument("-Q","--parallel",type=int,default=multiprocessing.cpu_count())
args = parser.parse_args()
args.files = ocrolib.glob_all(args.files)
if not ".gt." in args.files[0]:
sys.stderr.write("warning: compare on .gt.txt files, not .txt files\n")
def process1(fname):
# fgt = ocrolib.allsplitext(fname)[0]+args.gtextension
gt = ocrolib.project_text(ocrolib.read_text(fname),kind=args.kind)
ftxt = ocrolib.allsplitext(fname)[0]+args.extension
missing = 0
if os.path.exists(ftxt):
txt = ocrolib.project_text(ocrolib.read_text(ftxt),kind=args.kind)
else:
missing = len(gt)
txt = ""
err = edist.levenshtein(txt,gt)
return fname,err,len(gt),missing
outputs = ocrolib.parallel_map(process1,args.files,parallel=args.parallel,chunksize=10)
errs = 0
total = 0
missing = 0
for fname,e,t,m in sorted(outputs):
if not args.erroronly:
print "%6d\t%6d\t%s"%(e,t,fname)
errs += e
total += t
missing += m
sys.stderr.write("errors %8d\n"%errs)
sys.stderr.write("missing %8d\n"%missing)
sys.stderr.write("total %8d\n"%total)
sys.stderr.write("err %8.3f %%\n"%(errs*100.0/total))
sys.stderr.write("errnomiss %8.3f %%\n"%((errs-missing)*100.0/total))
if args.erroronly:
print errs*1.0/total