-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculateTMStatistics.pl
executable file
·76 lines (60 loc) · 1.71 KB
/
calculateTMStatistics.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/perl -ws
#####################
#
# © 2012 Autodesk Development Sàrl
#
# Created on 26 Mar 2012 by Ventsislav Zhechev
# Last modified on 29 Mar 2012 by Ventsislav Zhechev
#
# Changelog
# v0.2.1
# The data is now output in tab-separated format.
#
# v0.2
# Modified to process a whole folder of TMs at once.
# Now we are applying normalisation to the TM names to avoid significant duplication.
#
# v0.1
# First version.
#
#####################
use strict;
use utf8;
use IO::Uncompress::Bunzip2 qw/$Bunzip2Error/;
use Encode qw/encode decode/;
# Turn on autoflush for the currently selected output handle.
$| = 1;

# $tmFolder is a package variable so that perl's -s switch (see the shebang
# line) can set it from a “-tmFolder=…” command-line argument.
our ($tmFolder);
die encode("utf-8", "Usage: $0 -tmFolder=…\n")
    unless defined $tmFolder;

opendir(my $tmDir, $tmFolder)
    or die encode("utf-8", "Cannot open folder “$tmFolder”: $!\n");

# Process every matching TM file in the folder; each one yields its own
# stats.<lang>.txt file written next to it.
while (defined(my $tm = readdir $tmDir)) {
    # Only entries whose name contains TM_<something>_ALL.bz2 are handled; the
    # captured part is kept in $lang and reused in the statistics file name.
    next unless $tm =~ /TM_(.*)_ALL\.bz2/;
    my $lang = $1;

    my $tmPath    = "$tmFolder/$tm";
    my $statsPath = "$tmFolder/stats.$lang.txt";

    my $tmFile = IO::Uncompress::Bunzip2->new($tmPath)
        or die encode("utf-8", "Could not read file “$tmPath”: $Bunzip2Error\n");

    # Number of lines seen per normalised third-field value.
    my %fullStats;

    # getline is a method call, so perl adds no implicit defined() test; be
    # explicit so that a false-but-defined line (e.g. a final "0") cannot end
    # the loop early.
    while (defined(my $line = $tmFile->getline)) {
        chomp $line;

        # The callback fires on malformed input; such lines are reported and skipped.
        my $decodeError = 0;
        my $tmp = decode "utf-8", $line, sub {$decodeError = 1};
        if ($decodeError) {
            warn "Could not decode line: $line\n";
            next;
        }

        # NOTE(review): the split pattern appears empty here, which splits the
        # line into single characters. The field delimiter may be a non-printing
        # character that was lost when this file was copied -- confirm against
        # the TM file format and restore it if so.
        my ($src, $trg, $prd) = split //, $tmp;
        next unless defined $src && defined $trg && defined $prd;

        # Normalise the name: trim surrounding whitespace and join the
        # remaining whitespace runs with underscores.
        $prd =~ s/^\s+|\s+$//g;
        $prd =~ s/\s+/_/g;
#       $prd = lc $prd;
        ++$fullStats{$prd};
    }
    $tmFile->close;

    # Use a fresh lexical handle and three-argument open: the path is then never
    # parsed for mode characters, and the decompressor object is not reused as
    # an output filehandle.
    open(my $statsFile, '>', $statsPath)
        or die encode("utf-8", "Cannot write file “$statsPath”: $!\n");

    # Tab-separated output, highest count first; ties are ordered by name.
    print {$statsFile} encode("utf-8", "TM\tSegments\n");
    foreach my $name (sort {$fullStats{$b} <=> $fullStats{$a} || $a cmp $b} keys %fullStats) {
        print {$statsFile} encode("utf-8", "$name\t$fullStats{$name}\n");
    }

    # Buffered write errors only surface when the handle is closed.
    close($statsFile)
        or die encode("utf-8", "Cannot write file “$statsPath”: $!\n");
}

closedir $tmDir;


1;