-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtmx2plain.marketing.pl
executable file
·177 lines (158 loc) · 5.04 KB
/
tmx2plain.marketing.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/perl -ws
#####################
#
# A utility to convert TMX data to plain aligned text
#
# © 2009–2010 Венцислав Жечев
# © 2011–2013 Autodesk Development Sàrl
#
# created 14 Oct 2009
# modified 11 Jan 2013 by Ventsislav Zhechev
#
# Changelog
# version 1.5.1
# Updated to handle some RTF-based peculiarities of Marketing TMs.
# The output is now formatted like TM_..._ALL.bz2 files.
#
# version 1.5
# Changed to handle zipped Trados export with UTF16-BE encoding.
# Removed TDA-specific code.
# Updated to use latest release of XML::TMX::Reader (.23+)
#
# version 1.1
# Changed the command-line option handling.
# Added an option to extract only data from a specific organisation.
#
# version 1.0
# Modified to suit data downloaded from TDA.
# Improved regex practices
#
# version 0.9
# Any leftover whitespace at the beginning or end of segments is now also being stripped.
# Translation pairs where there is a sentence-length ratio greater than the standard GIZA++ fertility limit are now being skipped.
# (Caveat: read-in of bzip2 files is not currently supported due to the way the XML::TMX::Reader works.)
#
# version 0.8
# Added normalisation of whitespace — all and any amount of whitespace is converted to single space characters.
# Translation pairs where there is an empty segment for one of the languages are now skipped.
#
# version 0.7
# The script now removes new lines from within translation units.
#
# version 0.6
# Modified the filename conversion so that it would work with any .tmx or .bz2 file
#
# version 0.5
# Basic version dealing specifically with DGT TMs.
#
#####################
use strict;
use utf8;
use Encode qw/encode decode/;
$| = 1;
#use IO::Uncompress::Unzip;
use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error);
#use IO::Uncompress::Bunzip2 qw(bunzip2 $Bunzip2Error);
use XML::TMX::Reader;
our ($fileName);
die encode "utf-8", "Usage: $0 -fileName=…\n" unless defined $fileName;
#my $fileName = $ARGV[0];
print STDERR encode "utf-8", "Using TMX file ‘$fileName’\n";
system "bunzip2 \"$fileName\"";
$fileName =~ s/.bz2$//;
my $origFileName = $fileName;
my $reader = XML::TMX::Reader->new($fileName);
$reader->ignore_markup(0);
my @languages = sort {$a eq "EN-US" ? -1 : ($b eq "EN-US" ? 1 : $a cmp $b)} $reader->languages();
local $" = "; ";
print STDERR encode "utf-8", "Languages: @languages\n";
$fileName =~ s/^(.*)(?:_.._..)?\.(?:tmx|zip)(?:\.bz2)?/$1/;
local $" = "_";
$fileName .= "[email protected]";
print STDERR encode "utf-8", "Outputting to file: $fileName\n";
#open (OUT, ">:utf8", $fileName);
my $out = new IO::Compress::Bzip2 $fileName
or die encode "utf-8", "Cannot write file “$fileName”: $Bzip2Error\n";
my $tus =
#my $errWrongType =
my $errDiamonds =
#my $errNoJapanese =
my $errEmpty =
0;
$reader->for_tu(sub {
local $" = ", ";
local $| = 1;
my $tu = shift;
++$tus;
print STDERR "." if ($tus % 1000 == 0);
# print STDOUT "tu is:\n", (values %$tu), "\n===\n";
# if ($tu->{-prop}->{"tda-type"} ne "Software Strings and Documentation") {
# print STDERR "«$tus»";
# ++$errWrongType;
# return;
# }
my $pair = "";
my $tokens = 0;
foreach my $key (sort {$a eq "EN-US" ? -1 : ($b eq "EN-US" ? 1 : $a cmp $b)} keys %$tu) {
unless ($key =~ m"^-"m) {
my $line = $tu->{$key}->{-seg};
if ($line =~ /♦♦♦/) {
print STDERR encode "utf-8", "«$tus»";
++$errDiamonds;
return;
}
# unless ($key =~ /en/ || $line =~ /[\p{Script:Hiragana}\p{Script:Katakana}\p{Script:Han}]+/) {
# print STDERR "«$tus»";
# ++$errNoJapanese;
# return;
# }
$line =~ s!<br(?:/)?>! !g;
$line =~ s/\n+//g;
$line =~ s/\r+//g;
$line =~ s!<ut>.*?</ut>!!g;
$line =~ s/<!\[CDATA\[|\]\]>//g;
$line =~ s/(?<![A-Z]:)(?:\\[rn])+/ /g;
$line =~ s!<([\w:-]+).*?>(?:.*?</\1>)?!{1}!g;
$line =~ s!<[\w:-]+[^>]*?/>!{1}!g;
$line =~ s!</[\w:-]+>!{1}!g;
$line =~ s!<\?[\w:-]+[^>]*?\?>!{1}!g;
my $ph = 0;
$line =~ s/(?<=\{)\d+(?=\})/++$ph/ge;
$line =~ s/\s+/ /g;
$line =~ s/(?:^\s+)|(?:\s+$)//g;
$line =~ s/\\0$//g;
$line =~ s/\\'/\'/g;
$line =~ s/^{\\field.*?}?}}\s*(?!{)//;
if ($line =~ /^\s*$/) {
print STDERR encode "utf-8", "«$tus»";
# $pair = "";
++$errEmpty;
return;
}
# unless ($tokens) {
# $tokens = () = split ' ', $line, -1;
# } else {
# my $newTokens = () = split ' ', $line, -1;
# if ($newTokens > $tokens ? (1. * $newTokens)/$tokens > 9. : (1. * $tokens)/$newTokens > 9.) {
# print STDERR "Bad pair $tus (length disparity)\t";
# $pair = "";
# ++$skippedTus;
# last;
# }
# }
$pair .= "$line";
}
}
print $out "$pair◊÷\n";
});
system "bzip2 \"$origFileName\"";
$" = "/";
print STDERR encode "utf-8", "\nProcessed $tus translation units for language pair @languages.\nOutput ".
#($tus - $errWrongType - $errDiamonds - $errNoJapanese - $errEmpty)
($tus - $errDiamonds - $errEmpty)
." translation units.\n"
#."Skipped due to wrong data type: $errWrongType\n"
."Skipped due to ♦♦♦: $errDiamonds\n"
#."Skipped due to target not containing Japanese: $errNoJapanese\n"
."Skipped due to an empty segment: $errEmpty\n";
1;