# XGBPredictor

# -*- coding: utf-8 -*-
"""
@author anooppanyam

"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import spacy
from textblob import TextBlob as tb
from spacy.tokens import Doc
from spacy.matcher import Matcher
import string
import re
from functools import reduce
from operator import and_, or_, contains
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import math


# Read in dataframe and isolate features and target variable
df = pd.read_csv('training_set.csv', encoding="CP1250")
features = df.drop(['Engagements'], axis=1)
target = df[['Engagements']]

# Add features and label encode categorical variables

le = LabelEncoder()
features['Created'] = pd.to_datetime(df['Created'])
features['Weekday'] = features['Created'].dt.weekday
features['Hour'] = features['Created'].dt.hour
features['Length'] = features.Description.str.len()

# Minute-of-day of every post, computed once and shared by both deltas below.
# NOTE(review): minute-of-day wraps at midnight, so a delta between posts made
# on different days is not the true elapsed time -- confirm this is intended.
minute_of_day = features['Created'].dt.hour * 60 + features['Created'].dt.minute
# diff(periods=-3) compares each row with the row three positions below it.
time_delta = minute_of_day.diff(periods=-3)
follower_delta = features['Followers at Posting'].diff(periods=-3)
# A zero time delta turns the ratio into +/-inf (or NaN for 0/0). fillna(0)
# below only removes NaN, so infinities are mapped to NaN first; otherwise
# they would survive into the scaled feature matrix.
features['dFollowers'] = (follower_delta / time_delta).replace([np.inf, -np.inf], np.nan)
features['dTime'] = time_delta
features['Type'] = le.fit_transform(features['Type'])

# Resolve null values
features['Description'] = features['Description'].fillna('')
features[['Length', 'dFollowers', 'dTime']] = features[['Length', 'dFollowers', 'dTime']].fillna(0)


# Add sentiment: run TextBlob once per description and split the result into
# its first component ('Sentiment') and second component ('Subjectivity').
sentiments = [tb(str(text)).sentiment for text in features['Description']]
features['Sentiment'] = [sentiment[0] for sentiment in sentiments]
features['Subjectivity'] = [sentiment[1] for sentiment in sentiments]

# Setting up text analysis -- Bag of words


# Shared text-processing state used by PreprocessAndTokenize below.
# Filter out punctuation except @ or # and all stop words.
nlp = spacy.load('en_core_web_lg')
# string.punctuation with '#' and '@' removed. This is a plain string, so a
# later `x in punctuation` test is a substring test, not a set lookup.
punctuation = re.sub('[#@]', '', string.punctuation)
# NOTE(review): reached by attribute path rather than an explicit import;
# presumably this relies on spacy.load() above having imported spacy.lang.en,
# so keep it after the load -- verify before reordering.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
matcher = Matcher(nlp.vocab)
# 'HASHTAG' pattern: a '#' token immediately followed by one ASCII token.
# The second positional argument (None) is presumably the on-match callback
# slot of the spaCy 2.x Matcher.add signature -- TODO confirm.
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

                               
# Filtering out encoding errors 


unwanted = {'–','‘', '’', '“','”','•', '…', '\ufeff1', '\'', '.', '/', '\\'}


def containsAny(str, set):
    """Return True if at least one element of ``set`` occurs in ``str``.

    The parameter names shadow the builtins ``str`` and ``set``; they are kept
    unchanged so existing positional and keyword callers keep working.

    Args:
        str: the container to search (a string in this file).
        set: an iterable of candidate items (substrings when ``str`` is a
            string).

    Returns:
        bool: True as soon as one candidate is found, False if none is found
        or if ``set`` is empty.
    """
    # any() short-circuits on the first hit and, unlike reduce() without an
    # initial value, returns False instead of raising on an empty iterable.
    return any(item in str for item in set)

# Stemmer 

stemmer = LancasterStemmer()


# Aggressively lemmatize and stem to reduce sparsity
def PreprocessAndTokenize(text):
    """Turn ``text`` into a list of filtered, stemmed terms.

    Steps, in order:
      1. every literal ``??`` is replaced by the placeholder `` EMOJI ``;
      2. the text is parsed with the module-level spaCy pipeline ``nlp`` and
         every ``matcher`` hit is merged into a single token so hashtags stay
         together;
      3. each token is lemmatized, lower-cased, stripped and stemmed with the
         Lancaster stemmer (tokens whose lemma is ``-PRON-`` are stemmed from
         their lower-cased surface form instead);
      4. terms found in ``stop_words`` or ``punctuation`` are dropped;
      5. terms containing any ``unwanted`` character are dropped, as are terms
         that contain a digit or are shorter than four characters -- unless
         the term contains ``@``, which exempts it from the digit/length rule.

    Args:
        text: the raw description string.

    Returns:
        list of str: the surviving stemmed terms in document order.
    """
    # NOTE(review): presumably '??' is what emoji look like after the CP1250
    # read of the CSV -- verify against the raw data.
    text = re.sub('[?][?]', ' EMOJI ', text)
    doc = nlp(text)

    # Collect every matched span before merging any of them.
    hashtags = [doc[start:end] for _match_id, start, end in matcher(doc)]
    for span in hashtags:
        # NOTE(review): Span.merge() is the spaCy 2.0-era API; later releases
        # provide Doc.retokenize() for this -- confirm the pinned version.
        span.merge()

    # Lemmatize first and then stem
    stems = [
        stemmer.stem(token.lemma_.lower().strip())
        if token.lemma_ != "-PRON-"
        else stemmer.stem(token.lower_)
        for token in doc
    ]
    # `punctuation` is a plain string, so `in` is a substring test here.
    stems = [
        term for term in stems
        if (term not in stop_words) and (term not in punctuation)
    ]
    return [
        term for term in stems
        if not containsAny(term, unwanted)
        and ('@' in term
             or (len(term) >= 4 and not any(ch.isdigit() for ch in term)))
    ]
    

# Create a count B.O.W model
# The descriptions are coerced with the builtin str: np.str was only an alias
# for it and is no longer available in current NumPy releases.
descriptions = features['Description'].apply(str)
# ngram_range is passed as the documented (min_n, max_n) tuple.
# NOTE(review): get_feature_names() is the older scikit-learn accessor; newer
# releases expose get_feature_names_out() instead -- confirm the pinned version.
ctv = CountVectorizer(tokenizer=PreprocessAndTokenize, analyzer='word', ngram_range=(1, 1))
X_ct = ctv.fit_transform(descriptions)
addedfeatures = pd.DataFrame(X_ct.toarray(), columns=ctv.get_feature_names())


# Create a Term-Frequency - Inverse Document Frequency weighted model
tfv = TfidfVectorizer(tokenizer=PreprocessAndTokenize, analyzer='word', ngram_range=(1, 1))
X_tf = tfv.fit_transform(descriptions)
# toarray() (a plain ndarray) is used for both frames instead of mixing it
# with todense().
newfeats = pd.DataFrame(X_tf.toarray(), columns=tfv.get_feature_names())


# Filter out the top 150 scores


def top_tfidf_feats(row, features, top_n=150):
    """Return the ``top_n`` highest scores in ``row`` with their feature names.

    Args:
        row: 1-D sequence of scores, one entry per feature.
        features: sequence of feature names, index-aligned with ``row``.
        top_n: maximum number of rows to return.

    Returns:
        pandas.DataFrame with columns ``feature`` and ``tfidf``, ordered from
        the highest score to the lowest. An empty ``row`` yields an empty
        frame that still carries both columns.
    """
    # argsort is ascending, so reverse it before taking the first top_n.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works when top_feats is
    # empty; assigning df.columns afterwards raises on a zero-column frame.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=150):
    """Return the features with the highest mean tf-idf score across documents.

    Args:
        Xtr: document-term matrix exposing ``toarray()`` and row indexing.
        features: sequence of feature names, index-aligned with the columns
            of ``Xtr``.
        grp_ids: optional row indices identifying the documents to average
            over. ``None`` or an empty selection means all rows.
        min_tfidf: scores below this threshold are zeroed before averaging.
        top_n: maximum number of features to return.

    Returns:
        The frame produced by ``top_tfidf_feats`` for the per-feature means.
    """
    # Explicit None/size test instead of truthiness: `if grp_ids:` raises for
    # NumPy index arrays and would treat the single index 0 as "no selection".
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Suppress weak scores so they do not contribute to the mean.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)


# Extract top 150 most important features
# NOTE(review): get_feature_names() is the older scikit-learn accessor; newer
# releases expose get_feature_names_out() instead -- confirm the pinned version.
top_features = top_mean_feats(X_tf, tfv.get_feature_names())['feature'].tolist()
# .copy() gives newdf its own data, so the column assignments below do not
# write through a slice of newfeats.
newdf = newfeats[top_features].copy()


# Drop unnecessary features
features.drop(['Created', 'Description'], axis=1, inplace=True)

# Merge extracted features with main database (rows are aligned on the index)
for column in features.columns:
    newdf[column] = features[column]

# Feature scale
scaler = preprocessing.MinMaxScaler(copy=False, feature_range=(0,1))
X = scaler.fit_transform(newdf)
X = pd.DataFrame(X, columns=newdf.columns)


# Feature selection
# normalize expects a boolean; the previous value 'l1' was merely a truthy
# string, so True states what was actually being requested.
# NOTE(review): the `normalize` argument is gone from recent scikit-learn
# releases -- confirm the pinned version or drop it (X is already scaled).
lvsc = Lasso(alpha=0.05, copy_X=False, normalize=True, max_iter=10000).fit(X, target)
model = SelectFromModel(lvsc, prefit=True)
# Apply the selector to the same scaled matrix the Lasso was fitted on.
X_new = model.transform(X)
print(X_new.shape)
# NOTE(review): X_new is only printed; the split below still uses the full X.
# Confirm whether the selected columns were meant to be used for training.


# split into train and test models
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size=0.25, random_state=55)

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate, GridSearchCV

# Cross validate and tune parameters before final training

def modelfit(alg, dtrain, dtest, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` by cross-validation, fit ``alg`` and report.

    When ``useTrainCV`` is true, ``xgb.cv`` is run on the training data and
    ``alg``'s ``n_estimators`` is set to the number of rows in the CV result.
    ``alg`` is then fitted on ``dtrain``, accuracy and AUC on the training
    data are printed, and the booster's feature scores are plotted as a bar
    chart.

    Args:
        alg: xgboost scikit-learn style estimator; it is modified in place.
        dtrain: training frame, indexed with ``predictors`` and 'Disbursed'.
        dtest: accepted but not used by this function.
        useTrainCV: whether to run the cross-validation step first.
        cv_folds: number of folds passed to ``xgb.cv``.
        early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.

    Returns:
        None.

    NOTE(review): this function reads the module-level names ``predictors``,
    ``target`` and ``plt``. ``predictors`` and ``plt`` are not defined in the
    visible part of the file, and ``target`` is a DataFrame there while the
    fit and report steps use the column 'Disbursed' -- confirm before calling.
    NOTE(review): 'auc', accuracy_score and predict_proba are classification
    facilities, but the file imports XGBRegressor -- confirm the model type.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # verbose_eval replaces the removed show_progress keyword of xgb.cv.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # One row per boosting round kept by early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    # Print model report (print() calls are valid on Python 2 and 3 alike):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))

    # get_booster() is the current accessor; booster() was its old name.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')