-
Notifications
You must be signed in to change notification settings - Fork 0
/
RFM.py
76 lines (63 loc) · 2.76 KB
/
RFM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import datetime as dt
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
### Functions ###
def outlier_thresholds(dataframe, variable):
quartile1 = dataframe[variable].quantile(0.01)
quartile3 = dataframe[variable].quantile(0.99)
interquantile_range = quartile3 - quartile1
up_limit = quartile3 + 1.5 * interquantile_range
low_limit = quartile1 - 1.5 * interquantile_range
return low_limit, up_limit
def replace_with_thresholds(dataframe, variable):
low_limit, up_limit = outlier_thresholds(dataframe, variable)
dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
### EDA & PREP ###
df_ = pd.read_excel("datasets/online_retail_II.xlsx",
sheet_name="Year 2010-2011")
df= df_.copy()
df.isnull().any()
df.isnull().sum()
df.dropna(inplace=True)
df["Description"].nunique()
df["Description"].value_counts()
df.groupby("Description").agg({"Quantity": "sum"}).sort_values("Quantity", ascending=False).head()
df = df[~df["Invoice"].str.contains("C", na=False)]
df = df[(df['Quantity'] > 0)]
df = df[(df['Price'] > 0)]
replace_with_thresholds(df, 'Price')
replace_with_thresholds(df, 'Quantity')
df["TotalPrice"] = df["Quantity"]*df["Price"]
df.describe([0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).T
### RFM Metrics ###
df["InvoiceDate"].max()
today_date = dt.datetime(2011, 12, 11)
rfm = df.groupby('Customer ID').agg({'InvoiceDate': lambda InvoiceDate: (today_date - InvoiceDate.max()).days,
'Invoice': lambda Invoice: Invoice.nunique(),
'TotalPrice': lambda TotalPrice: TotalPrice.sum()})
rfm.columns = ["recency", "frequency", "monetary"]
rfm = rfm[(rfm["monetary"] > 0) & (rfm["frequency"] > 0)]
### RFM Scores ###
rfm["recency_score"] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
rfm["frequency_score"] = pd.cut(rfm["frequency"].rank(method= "first"), 5 , labels=[1, 2, 3, 4, 5])
rfm["monetary_score"] = pd.qcut(rfm['monetary'], 5, labels=[1, 2, 3, 4, 5])
rfm["RFM_SCORE"] = (rfm['recency_score'].astype(str) +
rfm['frequency_score'].astype(str))
### RFM Segments ###
seg_map = {
r'[1-2][1-2]': 'Hibernating',
r'[1-2][3-4]': 'At_Risk',
r'[1-2]5': 'Cant_Loose',
r'3[1-2]': 'About_to_Sleep',
r'33': 'Need_Attention',
r'[3-4][4-5]': 'Loyal_Customers',
r'41': 'Promising',
r'51': 'New_Customers',
r'[4-5][2-3]': 'Potential_Loyalists',
r'5[4-5]': 'Champions'
}
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)
rfm[["segment", "recency", "frequency", "monetary"]].groupby("segment").agg(["mean", "count"])