-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_features.py
84 lines (63 loc) · 2.85 KB
/
text_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import unidecode
from sklearn.base import BaseEstimator, TransformerMixin
import string
import re
from underthesea.word_tokenize.regex_tokenize import tokenize
# Substrings that CountEmoticons tallies as negative markers in a text.
negative_emoticons = {
    # Character-sequence emoticons
    ':(', ':-(', ':c', ':-c', ':<', ':-<', ':[', ':-[', '>:[', ':{', ':っC',
    # Symbols and emoji
    '☹', '❌', '👎', '👹', '💀', '🔥', '🤔', '🚫', '🙁', '🙏',
    '😏', '😐', '😑', '😒', '😓', '😔', '😕', '😖', '😞', '😟',
    '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😨', '😩', '😪',
    '😫', '😭', '😰', '😱', '😳', '😵', '😶', '😾',
}
# Substrings that CountEmoticons tallies as positive markers in a text.
positive_emoticons = {
    # Character-sequence emoticons
    '=))', ';)', '^^', '<3', ':-)', ':)', ':D', ':o)', ':]', ':3', ':c)',
    ':>', '=]', '8)',
    # NOTE(review): CountEmoticons matches entries as substrings, so this bare
    # letter also counts every 'v' inside ordinary words -- confirm intended.
    'v',
    # Symbols and emoji
    '☀', '☺', '♡', '♥', '✌', '✨', '❣', '❤', '🌝', '🌷',
    '🌸', '🌺', '🌼', '🍓', '🎈', '🐅', '🐶', '🐾', '👉', '👌',
    '👍', '👏', '👻', '💃', '💄', '💋', '💌', '💎', '💐', '💓',
    '💕', '💖', '💗', '💙', '💚', '💛', '💜', '💞',
}
class Lowercase(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases every item of its input."""

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged.

        Both arguments are accepted and ignored.
        """
        return self

    def transform(self, x):
        """Return a new list with ``.lower()`` applied to each item of *x*."""
        return [text.lower() for text in x]
class RemoveTone(BaseEstimator, TransformerMixin):
    """Stateless transformer that runs each string through ``unidecode``."""

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def remove_tone(self, s):
        """Return the result of ``unidecode.unidecode`` applied to *s*."""
        return unidecode.unidecode(s)

    def transform(self, x):
        """Return a new list with ``remove_tone`` applied to each item of *x*."""
        return list(map(self.remove_tone, x))
class CountEmoticons(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each text into emoticon counts.

    Each input string becomes a ``(positive_count, negative_count)`` tuple
    based on the module-level ``positive_emoticons`` and
    ``negative_emoticons`` sets.
    """

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def count_emoticon(self, s):
        """Count emoticon occurrences in *s*.

        Every entry of each set is counted independently with ``str.count``
        (non-overlapping substring matches), so an entry that is contained
        in another entry contributes to the total as well.

        Returns:
            A ``(positive_count, negative_count)`` tuple of ints.
        """
        positive_total = sum(s.count(mark) for mark in positive_emoticons)
        negative_total = sum(s.count(mark) for mark in negative_emoticons)
        return positive_total, negative_total

    def transform(self, x):
        """Return a list with one count tuple per item of *x*."""
        return list(map(self.count_emoticon, x))
class RemoveDuplicate(BaseEstimator, TransformerMixin):
    """Stateless transformer that collapses repeated letters in each string.

    Two case-insensitive passes are applied in order:

    1. a run of the same ASCII letter is reduced to its first character
       (``"ngonnnn"`` -> ``"ngon"``);
    2. a run of the same two-letter sequence is reduced to its first
       occurrence (``"hahaha"`` -> ``"ha"``).

    Any run of two or more is collapsed, so ordinary double letters are
    affected too (``"good"`` -> ``"god"``).
    """

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, x):
        """Return a new list with both collapsing passes applied to each item."""
        repeated_letter = re.compile(r'([a-z])\1+', flags=re.IGNORECASE)
        repeated_pair = re.compile(r'([a-z][a-z])\1+', flags=re.IGNORECASE)

        collapsed = []
        for text in x:
            # Order matters: single letters first, then two-letter groups.
            text = repeated_letter.sub(r'\1', text)
            collapsed.append(repeated_pair.sub(r'\1', text))
        return collapsed
class Tokenrize(BaseEstimator, TransformerMixin):
    """Stateless transformer that tokenizes text and masks punctuation tokens.

    Each input string is passed through ``tokenize(s, format='text')`` and
    the result is post-processed by :meth:`pun_num`.
    """

    def pun_num(self, s):
        """Replace standalone punctuation tokens in *s* with ``'punc'``.

        A token is a maximal run of non-whitespace characters. A token is
        masked when it is found in ``string.punctuation`` and is not the
        full stop ``'.'``, which is kept as-is. All other tokens and all
        whitespace are returned unchanged.

        Only the matching tokens themselves are rewritten. Previously the
        code called ``s.replace(token, 'punc')``, which also rewrote the same
        character inside unrelated tokens, e.g. ``"3,5 trieu , ok"`` became
        ``"3punc5 trieu punc ok"`` instead of ``"3,5 trieu punc ok"``.

        Args:
            s: Whitespace-separated token string.

        Returns:
            The string with standalone punctuation tokens masked.
        """
        def mask(match):
            token = match.group(0)
            # Same predicate as before: a substring test against
            # string.punctuation, with '.' exempted.
            if token != '.' and token in string.punctuation:
                return 'punc'
            return token

        # \S+ visits exactly the tokens str.split() would yield while leaving
        # the original whitespace untouched.
        return re.sub(r'\S+', mask, s)

    def transform(self, x):
        """Return a new list with each item of *x* tokenized and masked.

        NOTE(review): assumes ``tokenize(..., format='text')`` returns a
        single whitespace-separated string -- confirm against underthesea.
        """
        return [self.pun_num(tokenize(s, format='text')) for s in x]

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self