-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessors.py
125 lines (108 loc) · 3.25 KB
/
processors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from threading import Thread
import constants
from pathlib import Path
import random
from typing import Union, Any, List
from interfaces import IProcess, IProcessor
from processes import (
RandomCharRemover,
RandomCharsInjector,
RandomCharsSwapper,
RandomNeighborReplacer,
RandomWordsCollapsor,
PunctuationRemover,
SentencePermutation,
)
class FilesProcessor(IProcessor):
    """Run a fixed pipeline of processes over a collection of files.

    Every file is passed through each process in ``processes`` in order,
    the output of one process being the input of the next.

    Args:
        processes: The processes to apply, in order, to every file.
        n_dist: Maximum number of chunks (and therefore worker threads)
            that ``dist_run`` splits its input into.
    """

    def __init__(
            self, processes: List[IProcess],
            n_dist: int = 32
            ) -> None:
        super().__init__()
        self.processes = processes
        self.n_dist = n_dist

    def file_run(self, file: Union[str, Path]) -> Any:
        """Apply every process, in order, to a single file.

        Args:
            file: The initial input handed to the first process.

        Returns:
            Whatever the last process returns, or ``file`` itself when
            there are no processes.
        """
        result = file
        for process in self.processes:
            result = process.execute(result)
        return result

    def run(
            self,
            files: List[Union[str, Path]]
            ) -> Any:
        """Process ``files`` sequentially in the calling thread.

        Returns:
            A list with one ``file_run`` result per input, in input order.
        """
        return [self.file_run(file) for file in files]

    def _divde(self, data: List[Any]) -> List[List[Any]]:
        """Split ``data`` into at most ``n_dist`` contiguous chunks.

        Every item lands in exactly one chunk, chunk sizes differ by at
        most one, and concatenating the chunks reproduces ``data``.
        Empty input yields no chunks.
        """
        total = len(data)
        if total == 0:
            return []
        # Never create more chunks than items, and always at least one
        # (also guards against a non-positive n_dist).
        n_chunks = max(1, min(self.n_dist, total))
        base, extra = divmod(total, n_chunks)
        divs = []
        start = 0
        for index in range(n_chunks):
            # The first ``extra`` chunks absorb the remainder, one item each.
            end = start + base + (1 if index < extra else 0)
            divs.append(data[start:end])
            start = end
        return divs

    def dist_run(
            self,
            files: List[Union[str, Path]]
            ) -> Any:
        """Process ``files`` using one thread per chunk.

        The input is split with ``_divde``; each chunk is handled by
        ``run`` in its own thread.

        Returns:
            A flat list of results in the same order as ``files``.

        Raises:
            Exception: The first exception captured from a worker thread,
                re-raised once all threads have finished.
        """
        divs = self._divde(files)
        # One slot per chunk: each worker writes only its own slot, so the
        # final order does not depend on which thread finishes first.
        slots: List[Any] = [None] * len(divs)
        errors: List[BaseException] = []

        def worker(index: int, chunk: List[Any]) -> None:
            try:
                slots[index] = self.run(chunk)
            except Exception as exc:
                # Remember the failure so the caller sees it instead of
                # silently receiving fewer results.
                errors.append(exc)

        threads = [
            Thread(target=worker, args=(index, div))
            for index, div in enumerate(divs)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        if errors:
            raise errors[0]
        results = []
        for chunk_result in slots:
            results.extend(chunk_result)
        return results
class TextDistorter(IProcessor):
    """Distort a line by repeatedly applying randomly chosen processes.

    Args:
        ratio: Multiplied by the length of the input line to decide how
            many distortion rounds are applied.
        processes: Candidate processes; one is picked at random per round.
    """

    def __init__(
            self, ratio: float, processes: List[IProcess]
            ) -> None:
        super().__init__()
        self.ratio = ratio
        self.processes = processes

    def run(self, line: str) -> str:
        """Return ``line`` after ``int(ratio * len(line))`` random distortions.

        The number of rounds is fixed from the original line's length
        before any distortion is applied.
        """
        rounds = int(self.ratio * len(line))
        distorted = line
        for _ in range(rounds):
            chosen = random.choice(self.processes)
            distorted = chosen.execute(distorted)
        return distorted

    def dist_run(self):
        """Placeholder; not implemented yet."""
        # TODO
        return None
class TextProcessor(IProcessor):
    """Apply a fixed sequence of processes to a sentence.

    Args:
        processes: The processes to apply, in order.
    """

    def __init__(self, processes: List[IProcess]) -> None:
        super().__init__()
        self.processes = processes

    def run(self, sentence: str):
        """Feed ``sentence`` through every process in order and return the result."""
        current = sentence
        for step in self.processes:
            current = step.execute(current)
        return current

    def dist_run(self, sentence: str) -> str:
        """Same as ``run``; a single sentence is processed without splitting."""
        processed = self.run(sentence)
        return processed
def get_text_distorter(ratio, sentences: List[str]):
    """Build a ``TextDistorter`` wired with the default set of distortions.

    Args:
        ratio: Passed through to ``TextDistorter`` as its ``ratio``.
        sentences: Passed to ``SentencePermutation``.

    Returns:
        A ``TextDistorter`` whose candidate processes are sentence
        permutation, character injection, character swapping, character
        removal, word collapsing and neighbor-key replacement.
    """
    distortions = [
        SentencePermutation(sentences),
        RandomCharsInjector(constants.KURDISH_CHARS),
        RandomCharsSwapper(),
        RandomCharRemover(),
        RandomWordsCollapsor(),
        RandomNeighborReplacer(
            constants.KEYBOARD_KEYS, constants.KEYBOARD_BLANK
        ),
    ]
    return TextDistorter(ratio=ratio, processes=distortions)