-
Notifications
You must be signed in to change notification settings - Fork 0
/
train-recaser.perl
executable file
·242 lines (211 loc) · 8.37 KB
/
train-recaser.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/perl -w
#
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
# Previously modified by Ondřej Bojar and others
#
# ©2011–2014 Autodesk Development Sàrl
#
# ChangeLog
# v2.7.3 Last modified by Ventsislav Zhechev on 15 Apr 2014
# Updated to use EN_GB for British English.
# Improved some error messages.
#
# v2.7.2 Last modified by Ventsislav Zhechev on 18 Apr 2013
# Added a -max-lexical-reordering 0 parameter to the call to the train-model.perl script
#
# v2.7.1 Last modified by Ventsislav Zhechev on 22 Jan 2013
# Added a special handling for PT_BR as a target language when creating the language model.
#
# v2.7 Last modified by Ventsislav Zhechev on 25 May 2012
# Added an option that controls whether the phrase table gets binarised.
#
# v2.6 Last modified by Ventsislav Zhechev
# Added a special handling for EN_UK as a target language when creating the language model.
#
# v2.5.2 Last modified by Ventsislav Zhechev
# Added an extra debug message in the data preparation code.
# Adjusted thread assignment to reduce memory wastage.
#
# v2.5.1 Last modified by Ventsislav Zhechev
# Added provision for passing the device ID to the train-model.perl script.
#
# v2.5 Last modified by Ventsislav Zhechev
# The main training script was changed to eliminate empty lines from the data, so that we do not have to do any filtering here.
#
# v2.4 Last modified by Ventsislav Zhechev
# The building of the recaser language model can be done in parallel to the data preparation. This will give a slight reduction in overall runtime.
#
# v2.3.1 Last modified by Ventsislav Zhechev
# Unfortunately, empty lines in the data cause problems for further processing and we need to filter them out. Because of this we cannot simply copy the source corpus.
#
# v2.3 Last modified by Ventsislav Zhechev
# It is not necessary to filter the data based on segment length for the recaser training. Because of this, we can simply copy the source corpus as the cased version of the data, rather than decompressing and compressing it again.
#
# v2.2.4 Last modified by Ventsislav Zhechev
# Now we are only expecting bzip2-compressed source, as this is what our training script produces.
#
# v2.2.3 Last modified by Ventsislav Zhechev
# For the data preparation step, the three different output streams now work in three separate threads, apart from the input reading thread. In this way, we are decompressing the input in one thread and compressing the output in three other threads, thus achieving a performance boost on multi-CPU machines.
#
# v2.2.2 Last modified by Ventsislav Zhechev
# Lowercasing wasn’t working properly for UTF-8 data, when binmode was set to the file handles. Fixed by decoding/encoding the data only for lowercasing.
# Sentence tokenisation is now performed only on plain spaces and tabs, igonring e.g. ideographic space.
#
# v2.2.1 Last modified by Ventsislav Zhechev
# Added special rules for lowercasing Turkish.
#
# v2.2 Last modified by Ventsislav Zhechev
# Removed some unnecessary regex checks.
# Switched to a stricter UTF-8 processing.
#
# v2.0 Last modified by Ventsislav Zhechev
# Made it possible to read-in a gzipped corpus and modified the script to write out bzip2-ed data.
#
####################
use strict;
use threads;
use Thread::Queue;
use Getopt::Long "GetOptions";
use IO::Uncompress::Bunzip2 qw/$Bunzip2Error/;
use IO::Compress::Bzip2 qw/$Bzip2Error/;
use Encode qw/encode decode/;
# apply switches
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
my $NGRAM_COUNT = "ngram-count";
my $TRAIN_SCRIPT = "train-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
my $LANGUAGE = "";
my $DEVICE = "disk0";
my $BINARISE = 1;
die("Usage: $0 --dir recaser --corpus cased")
unless &GetOptions( 'first-step=i' => \$FIRST_STEP,
'last-step=i' => \$LAST_STEP,
'corpus=s' => \$CORPUS,
'config=s' => \$CONFIG,
'dir=s' => \$DIR,
'ngram-count=s' => \$NGRAM_COUNT,
'train-script=s' => \$TRAIN_SCRIPT,
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
'max-len=i' => \$MAX_LEN,
'language=s' => \$LANGUAGE,
'device=s' => \$DEVICE,
'binarise=i' => \$BINARISE,
);
# check and set default to unset parameters
die("please specify working dir --dir") unless defined($DIR);
die("please specify --corpus") if !defined($CORPUS) && $FIRST_STEP <= 2 && $LAST_STEP >= 1;
mkdir($DIR) unless -e $DIR;
#&truecase() if 0 && $FIRST_STEP == 1;
if ($FIRST_STEP <= 2 && $LAST_STEP >= 3) {
my $pid = fork();
if ($pid) {
&train_lm() if $FIRST_STEP <= 2;
waitpid $pid, 0;
} else {
&prepare_data() if $FIRST_STEP <= 3 && $LAST_STEP >= 3;
exit(0);
}
}
&train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 3;
&cleanup() if $LAST_STEP == 11;
### subs ###
#sub truecase {
# # to do
#}
sub train_lm {
if (-e "$DIR/cased.srilm.gz") {
print STDERR "Reusing existing language model!\n";
return;
}
print STDERR "(2) Train language model on cased data @ ".`date`;
my $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -".($LANGUAGE eq "en_gb" ? "wb" : "kn")."discount";
print STDERR $cmd."\n";
system($cmd) == 0
or die "Recaser language model training failed!\n";
}
sub prepare_data {
if (-e "$DIR/aligned.cased.bz2" && -e "$DIR/aligned.lowercased.bz2" && -e "$DIR/aligned.a.bz2") {
print STDERR "Reusing existing alignment data!\n";
return;
}
print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
system 'cp', '-p', $CORPUS, "$DIR/aligned.cased.bz2";
# my $casedQueue = Thread::Queue->new();
# my $printCased = sub {
# my $CASED = new IO::Compress::Bzip2 "$DIR/aligned.cased.bz2", BlockSize100K => 9
# or die "Could not write $DIR/aligned.cased.bz2 ($Bzip2Error)!\n";
#
# while (defined(my $line = $casedQueue->dequeue())) {
# print $CASED encode "utf-8", $line;
# }
#
# $CASED->close();
# };
my $lowercasedQueue = Thread::Queue->new();
my $printLowercased = sub {
my $LOWERCASED = new IO::Compress::Bzip2 "$DIR/aligned.lowercased.bz2", BlockSize100K => 9
or die "Could not write $DIR/aligned.lowercased.bz2 ($Bzip2Error)!\n";
while (defined(my $line = $lowercasedQueue->dequeue())) {
$line =~ tr/İI/iı/ if $LANGUAGE =~ /tr/;
print $LOWERCASED encode "utf-8", lc $line;
}
$LOWERCASED->close();
print STDERR "Finished outputting lowercased data.";
};
my $alignmentQueue = Thread::Queue->new();
my $printAlignment = sub {
my $ALIGNMENT = new IO::Compress::Bzip2 "$DIR/aligned.a.bz2", BlockSize100K => 9
or die "Could not write $DIR/aligned.a.bz2 ($Bzip2Error)!\n";
while (defined(my $line = $alignmentQueue->dequeue())) {
# unless ($line =~ /^\s*$/) {
my $i=0;
foreach (split /[ \t]/, $line) {
print $ALIGNMENT "$i-$i ";
++$i;
}
# }
print $ALIGNMENT "\n";
}
$ALIGNMENT->close();
print STDERR "Finished outputting alignment data.";
};
my $mainThread = sub {
my $CORP = new IO::Uncompress::Bunzip2($CORPUS)
or die "Coud not read corpus $CORPUS ($Bunzip2Error)!\n";
select STDERR;
$| = 1;
while (my $line = <$CORP>) {
$line = decode "utf-8", $line;
print STDERR "[generate: $.]" unless $. % 500000;
print STDERR "." unless $. % 10000;
# next if $line =~ /^\s*$/;
# $casedQueue->enqueue($line);
$lowercasedQueue->enqueue($line);
$alignmentQueue->enqueue($line);
}
close $CORP;
print STDERR "Finished queuing data.";
# $casedQueue->enqueue(undef);
$lowercasedQueue->enqueue(undef);
$alignmentQueue->enqueue(undef);
};
$_->join() foreach (scalar threads->create($mainThread),
# scalar threads->create($printCased),
scalar threads->create($printLowercased), scalar threads->create($printAlignment));
}
sub train_recase_model {
my $first = $FIRST_STEP < 4 ? 4 : $FIRST_STEP;
print STDERR "\n(4) Training recasing model @ ".`date`;
my $cmd = "$TRAIN_SCRIPT -root-dir $DIR -model-dir $DIR -temp-dir /tmp -first-step $first -alignment a -corpus-dir \"$DIR\" -corpus aligned -f lowercased -e cased -max-phrase-length $MAX_LEN -lm 0:3:$DIR/cased.srilm.gz:0 -device $DEVICE -binarise $BINARISE -max-lexical-reordering 0";
$cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR;
$cmd .= " -config $CONFIG" if $CONFIG;
print STDERR "$cmd\n";
system($cmd) == 0
or die "Recase model training failed!\n";
}
sub cleanup {
print STDERR "\n(11) Cleaning up @ ".`date`;
system "rm -f $DIR/aligned*; rm -f $DIR/lex*";
}