forked from mozilla/translations
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-opusfilter.py
60 lines (56 loc) · 1.41 KB
/
run-opusfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from opusfilter.opusfilter import OpusFilter
import yaml
import sys
def dedup_path(filtered_file):
    """Return the name of the intermediate de-duplicated file.

    ``corpus.en.gz`` becomes ``corpus.en.dedup.gz``. Only a trailing ``.gz``
    is rewritten, so a ``.gz`` elsewhere in the path (for example in a
    directory name) is left alone. A name without the ``.gz`` suffix gets
    ``.dedup`` appended, so the intermediate file can never be the same path
    as the final filtered output.
    """
    suffix = ".gz"
    if filtered_file.endswith(suffix):
        return filtered_file[:-len(suffix)] + ".dedup" + suffix
    return filtered_file + ".dedup"


def default_filter_params(src_lang, tgt_lang):
    """Return the built-in filter list used when no configuration is given."""
    return [
        {'AlphabetRatioFilter': {}},
        {'LanguageIDFilter': {
            'id_method': 'cld2',
            'languages': [src_lang, tgt_lang],
        }},
        {'LengthRatioFilter': {
            'name': 'word',
            'unit': 'word',
        }},
        {'NonZeroNumeralsFilter': {}},
        {'TerminalPunctuationFilter': {}},
        {'RepetitionFilter': {}},
        {'LengthFilter': {
            'min_length': 3,
            'max_length': 100,
        }},
    ]


def load_filter_params(config_file, src_lang, tgt_lang):
    """Return the filter list for the ``filter`` step.

    The literal value ``"default"`` selects the built-in filters; any other
    value is treated as the path of a YAML file whose parsed content is
    returned unchanged.
    """
    if config_file == "default":
        return default_filter_params(src_lang, tgt_lang)
    with open(config_file, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def build_config(input_files, filtered_files, filter_params, threads):
    """Build the two-step configuration passed to ``OpusFilter``.

    Step one removes duplicates from ``input_files`` into intermediate
    ``.dedup`` files; step two filters those intermediate files into
    ``filtered_files`` using ``filter_params``. ``threads`` may be an int or
    a numeric string and becomes ``default_n_jobs``.
    """
    dedup_files = [dedup_path(filtered_file) for filtered_file in filtered_files]
    return {
        'common': {
            'default_n_jobs': int(threads),
        },
        'steps': [
            {
                'type': 'remove_duplicates',
                'parameters': {
                    'inputs': input_files,
                    'outputs': dedup_files,
                },
            },
            {
                'type': 'filter',
                'parameters': {
                    'inputs': dedup_files,
                    'outputs': filtered_files,
                    'filters': filter_params,
                },
            },
        ],
    }


def main(argv):
    """Run de-duplication and filtering from command-line arguments.

    Expected arguments, in order:
      1. whitespace-separated list of input files
      2. whitespace-separated list of filtered output files
      3. source language code
      4. target language code
      5. path of a YAML filter configuration, or ``default``
      6. number of jobs
    """
    # Fail with a readable message instead of an IndexError traceback.
    if len(argv) < 7:
        sys.exit(
            "usage: run-opusfilter.py INPUT_FILES FILTERED_FILES "
            "SRC_LANG TGT_LANG CONFIG_FILE THREADS"
        )

    input_files = argv[1].split()
    filtered_files = argv[2].split()
    src_lang = argv[3]
    tgt_lang = argv[4]
    config_file = argv[5]
    threads = argv[6]

    filter_params = load_filter_params(config_file, src_lang, tgt_lang)
    config = build_config(input_files, filtered_files, filter_params, threads)

    of = OpusFilter(config)
    of.execute_steps()


if __name__ == "__main__":
    main(sys.argv)