forked from TextDatasetCleaner/TextDatasetCleaner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yml
40 lines (39 loc) · 839 Bytes
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Text-cleaning pipeline configuration.
#
# Three stage lists (PRE_PROCESSING, PROCESSING, POST_PROCESSING) plus a
# cache directory. Each list entry is either a bare processor name or a
# single-key mapping whose value holds that processor's options.
# NOTE(review): option nesting was reconstructed from the order of the
# lines — confirm against the schema the consuming tool expects.
PRE_PROCESSING:
  - unique

# Entries are kept in their original order.
# NOTE(review): presumably applied top to bottom — confirm with the consumer.
PROCESSING:
  - line_strip
  - clean_html:
      or_condition: false
  - detect_language:
      language_code: ru
      # Quoted on purpose: a bare ~ is parsed as null, not as a tilde string.
      delimiter: '~'
      delimited_position: -1
      model_path: '/tmp/fasttext-lid.176.bin'
  - remove_accents
  - filter_stop_words:
      language_code: ru
      mode: replace
  - normalize_hyphenated_words
  - normalize_quotation_marks
  - normalize_repeating_chars
  - normalize_unicode
  - filter_url
  - filter_currency_symbols
  - filter_email
  - filter_emoji
  - filter_hashtags
  - filter_numbers
  - filter_phone_number
  - filter_user_handle
  - remove_profanity:
      threshold: 0.9
  - clean_symbols
  - normalize_whitespace
  # Quoted because the values start with '<' and contain '|'.
  - add_prefix:
      text: '<|startoftext|>'
  - add_postfix:
      text: '<|endoftext|>'

POST_PROCESSING:
  - unique
  - shuffle

# NOTE(review): lives under /tmp, so contents may not survive a reboot —
# confirm that is acceptable for the cache.
CACHE_DIR: '/tmp/tdc'