-
Notifications
You must be signed in to change notification settings - Fork 16
/
model.py
73 lines (64 loc) · 2.46 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count
class EASE:
    """Item-item recommender fitted in closed form from a user-item matrix.

    ``fit`` builds a sparse user-item matrix ``X``, derives a dense item-item
    weight matrix ``B`` with a zero diagonal, and caches the dense score
    matrix ``X @ B``.  ``predict`` then returns the top-``k`` unseen items
    for each requested user.
    """

    def __init__(self):
        # Encoders map raw user/item identifiers to contiguous integer
        # indices used as row/column positions of the interaction matrix.
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on ``df`` and return the encoded id arrays.

        Returns:
            ``(users, items)``: integer codes for the ``user_id`` and
            ``item_id`` columns, aligned with the rows of ``df``.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'user_id'])
        items = self.item_enc.fit_transform(df.loc[:, 'item_id'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used

        Stores ``self.X`` (sparse interactions), ``self.B`` (item-item
        weights) and ``self.pred`` (dense user-by-item scores).
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['rating'].to_numpy() / df['rating'].max()
        )

        # NOTE(review): csr_matrix sums entries that share the same
        # (row, col), so repeated (user_id, item_id) rows in ``df`` produce
        # weights larger than a single interaction. De-duplicate upstream if
        # a strictly binary matrix is required.
        X = csr_matrix((values, (users, items)))
        self.X = X

        # Regularized Gram matrix: X^T X + lambda * I.
        G = X.T.dot(X).toarray()
        diag_indices = np.diag_indices(G.shape[0])
        G[diag_indices] += lambda_

        # Divide every column j of the inverse by -P[j, j], then force the
        # diagonal to zero so an item never contributes to its own score.
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diag_indices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """Return the top-``k`` unseen items for each of ``users``.

        Args:
            train: DataFrame with ``user_id`` and ``item_id`` columns; used
                to find which items each user has already interacted with.
                It is not modified.
            users: raw user ids to produce recommendations for. Only users
                that have at least one row in ``train`` yield output.
            items: raw item ids allowed as recommendation candidates.
            k: maximum number of items to return per user.

        Returns:
            DataFrame with columns ``user_id``, ``item_id`` (raw ids) and
            ``score``; empty if no requested user appears in ``train``.
        """
        items = self.item_enc.transform(items)

        # Copy so the helper columns are not written onto a slice of the
        # caller's frame (avoids pandas' SettingWithCopyWarning).
        dd = train.loc[train.user_id.isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd.item_id)
        dd['cu'] = self.user_enc.transform(dd.user_id)

        jobs = [
            (user, group, self.pred[user, :], items, k)
            for user, group in dd.groupby('cu')
        ]
        if not jobs:
            # pd.concat raises on an empty list; return an empty result with
            # the same columns instead.
            return pd.DataFrame({"user_id": [], "item_id": [], "score": []})

        # No point in starting more workers than there are users to score.
        with Pool(min(cpu_count(), len(jobs))) as p:
            user_preds = p.starmap(self.predict_for_user, jobs)

        df = pd.concat(user_preds)
        df['item_id'] = self.item_enc.inverse_transform(df['item_id'])
        df['user_id'] = self.user_enc.inverse_transform(df['user_id'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """Pick the best-scoring items a single user has not interacted with.

        Args:
            user: encoded user index, copied into the ``user_id`` column.
            group: DataFrame whose ``ci`` column holds the encoded items the
                user has already interacted with.
            pred: 1-D array of scores indexed by encoded item id.
            items: encoded item ids allowed as candidates.
            k: maximum number of rows to return; clamped to the number of
                available candidates.

        Returns:
            DataFrame with columns ``user_id``, ``item_id`` and ``score``,
            sorted by ``score`` descending. Empty when there are no
            candidates or ``k`` is not positive.
        """
        items = np.asarray(items)
        unseen = ~np.isin(items, group['ci'].to_numpy())
        candidates = items[unseen].astype(np.intp, copy=False)
        scores = np.take(pred, candidates)

        # argpartition raises if asked for more elements than exist, so
        # never request more than the number of candidates.
        n_top = max(0, min(int(k), candidates.size))
        if n_top:
            top = np.argpartition(scores, -n_top)[-n_top:]
        else:
            top = np.empty(0, dtype=np.intp)

        result = pd.DataFrame(
            {
                "user_id": np.full(top.size, user),
                "item_id": np.take(candidates, top),
                "score": np.take(scores, top),
            }
        )
        return result.sort_values('score', ascending=False)