-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFeatureExtraction.py
110 lines (65 loc) · 2.26 KB
/
FeatureExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# coding: utf-8
# In[162]:
"""
@author anoopppanyam
"""
import re
import string

import numpy as np
import pandas as pd
import spacy
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
# In[163]:
# Read the training set and split it into feature columns and target.
trainingdf = pd.read_csv('training_set.csv', encoding="ISO-8859-1")
X_train = trainingdf.drop(['Engagements'], axis=1)
# NOTE(review): the target is selected by position (first CSV column);
# presumably that column is 'Engagements' -- confirm against the data file.
Y_train = trainingdf.iloc[:, 0]
# In[32]:
# Create temporal features from the post creation timestamp.
X_train['Created'] = pd.to_datetime(X_train['Created'], format='%m/%d/%Y %I:%M:%S %p')
X_train['Weekday'] = X_train['Created'].dt.weekday  # Monday=0 ... Sunday=6
X_train['Hour'] = X_train['Created'].dt.hour
# Binary post-type flag: 1 when Type is 'Photo', 0 for anything else.
X_train['Post_Type'] = np.where(X_train['Type'] == 'Photo', 1, 0)
# In[140]:
# Add caption length feature.
# len(series) is the number of rows, which would broadcast one constant to
# every row; the vectorised .str.len() gives each row its own caption length.
# Missing descriptions are counted as empty captions (length 0).
X_train['Caption_Len'] = X_train['Description'].fillna('').astype(str).str.len()
# Setting up text analysis -- bag of words.
# Filter out punctuation (except @ and #) and all stop words.
punctuation = re.sub('[#@]', '', string.punctuation)
stop_words = STOP_WORDS
# The matcher needs the pipeline's vocab, so the pipeline has to exist first.
# It is built once here instead of on every tokenize() call.
nlp = English()
matcher = Matcher(nlp.vocab)
# A '#' token followed by one ASCII token is treated as a hashtag.
# NOTE(review): Matcher.add(key, None, pattern), Span.merge() and the
# "-PRON-" lemma are spaCy 2.x conventions -- confirm the pinned version.
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])


def tokenize(text):
    """Split *text* into lower-cased lemmas for the bag-of-words model.

    Hashtags ('#' plus the following ASCII token) are merged into a single
    token before lemmatisation. Stop words and punctuation other than
    '@' and '#' are removed from the result.

    Args:
        text: The raw caption string to tokenize.

    Returns:
        A list of token strings.
    """
    doc = nlp(text)
    # Collect every hashtag span before merging any of them, so the match
    # offsets are all resolved against the unmodified document.
    hashtags = [doc[start:end] for _match_id, start, end in matcher(doc)]
    for span in hashtags:
        span.merge()
    # Pronouns carry the placeholder lemma "-PRON-"; keep their surface form.
    lemmas = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
    ]
    # `punctuation` is a string, so `in` is a substring test: it also drops
    # tokens that became empty after strip().
    return [
        word for word in lemmas
        if word not in stop_words and word not in punctuation
    ]
# Custom scikit-learn style transformer that cleans raw text
class predictors(TransformerMixin):
    """Stateless transformer that runs ``clean_text`` over each input item."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return this transformer unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(item)`` for every item in ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this transformer exposes no parameters."""
        return dict()
# Basic text normalisation helper
def clean_text(text):
    """Return *text* without leading/trailing whitespace, in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()
# Generate features without normalization (unigram counts only).
ct_vector = CountVectorizer(tokenizer=tokenize, ngram_range=(1, 1))
# The builtin str replaces np.str, which was only an alias for it and has
# been removed from NumPy. The resulting document-term matrix is kept in a
# variable so it is not thrown away when this runs as a plain script.
description_counts = ct_vector.fit_transform(X_train['Description'].apply(str))
# In[ ]:
# In[ ]:
# In[ ]: