forked from fuxuemingzhu/MovieLens-Recommender
-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
165 lines (142 loc) · 6.74 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
"""
Calculate user and item similarity matrices.
Created on 2018-04-15
@author: fuxuemingzhu
"""
import collections
import math
from collections import defaultdict
from utils import LogTime
def calculate_user_similarity(trainset, use_iif_similarity=False):
    """
    Calculate user similarity matrix by building movie-users inverse table.
    Similarity is only accumulated between users that share at least one movie.

    :param trainset: mapping of user -> collection of that user's movies.
        Only the movies themselves are used; any score attached to them is ignored.
    :param use_iif_similarity: if True, use the User-IIF variant: every shared
        movie contributes 1 / log(1 + number of users of that movie) instead
        of 1, so a very popular movie adds less to the similarity.
    :return: tuple (usersim_mat, movie_popular, movie_count) where
        usersim_mat is a dict user1 -> defaultdict {user2: similarity},
        movie_popular is a defaultdict movie -> number of users having it,
        movie_count is the number of distinct movies in the trainset.
    """
    # build inverse table for item-users
    # key=movieID, value=set of userIDs who have seen this movie
    print('building movie-users inverse table...')
    movie2users = collections.defaultdict(set)
    movie_popular = defaultdict(int)
    for user, movies in trainset.items():
        for movie in movies:
            movie2users[movie].add(user)
            movie_popular[movie] += 1
    print('building movie-users inverse table success.')

    # save the total movie number, which will be used in evaluation
    movie_count = len(movie2users)
    print('total movie number = %d' % movie_count)

    # count co-rated items between users
    print('generate user co-rated movies similarity matrix...')
    # usersim_mat is a sparse two-dim table stored as nested dicts:
    # usersim_mat[user1][user2] first holds the (weighted) co-occurrence count
    # and is normalised into a similarity in the second pass below.
    usersim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for users in movie2users.values():
        # The contribution of one movie is identical for every pair of its
        # users, so compute it once per movie instead of once per pair.
        # len(users) >= 1 here, hence log(1 + len(users)) is never zero.
        if use_iif_similarity:
            weight = 1 / math.log(1 + len(users))
        else:
            weight = 1
        for user1 in users:
            # every user that appears gets a row, even without any neighbour
            related = usersim_mat.get(user1)
            if related is None:
                related = usersim_mat[user1] = defaultdict(int)
            for user2 in users:
                if user1 != user2:
                    related[user2] += weight
        # log steps and times (one step per movie).
        movie2users_time.count_time()
    print('generate user co-rated movies similarity matrix success.')
    movie2users_time.finish()

    # calculate user-user similarity matrix
    print('calculate user-user similarity matrix...')
    # record the time the calculation has spent.
    usersim_mat_time = LogTime(print_step=1000)
    for user1, related_users in usersim_mat.items():
        len_user1 = len(trainset[user1])
        for user2, count in related_users.items():
            # similarity = count / sqrt(len(user1 movies) * len(user2 movies));
            # replacing the value of an existing key is safe while iterating.
            related_users[user2] = count / math.sqrt(len_user1 * len(trainset[user2]))
        # log steps and times (one step per user).
        usersim_mat_time.count_time()
    print('calculate user-user similarity matrix success.')
    usersim_mat_time.finish()
    return usersim_mat, movie_popular, movie_count
def calculate_item_similarity(trainset, use_iuf_similarity=False):
    """
    Calculate item similarity matrix from the movies each user has in common.
    Similarity is only accumulated between movies that share at least one user.

    :param trainset: mapping of user -> collection of that user's movies.
        Only the movies themselves are used; any score attached to them is ignored.
    :param use_iuf_similarity: if True, use the Item-IUF variant: every user
        contributes 1 / log(1 + number of that user's movies) instead of 1,
        so a user who has a lot of movies adds less to the similarity.
    :return: tuple (movie_sim_mat, movie_popular, movie_count) where
        movie_sim_mat is a dict movie1 -> defaultdict {movie2: similarity},
        and movie_popular / movie_count are the values returned by
        calculate_movie_popular(trainset).
    """
    movie_popular, movie_count = calculate_movie_popular(trainset)

    # count co-occurrence of items inside each user's movie collection
    print('generate items co-rated similarity matrix...')
    # movie_sim_mat is a sparse two-dim table stored as nested dicts:
    # movie_sim_mat[movie1][movie2] first holds the (weighted) co-occurrence
    # count and is normalised into a similarity in the second pass below.
    movie_sim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for movies in trainset.values():
        # The contribution of one user is identical for every pair of their
        # movies, so compute it once per user instead of once per pair.
        # It is only needed when a pair can exist; the len() > 1 guard also
        # keeps a user without movies from evaluating 1 / log(1).
        if use_iuf_similarity and len(movies) > 1:
            weight = 1 / math.log(1 + len(movies))
        else:
            weight = 1
        for movie1 in movies:
            # every movie that appears gets a row, even without any neighbour
            related = movie_sim_mat.get(movie1)
            if related is None:
                related = movie_sim_mat[movie1] = defaultdict(int)
            for movie2 in movies:
                if movie1 != movie2:
                    related[movie2] += weight
        # log steps and times (one step per user).
        movie2users_time.count_time()
    print('generate items co-rated similarity matrix success.')
    movie2users_time.finish()

    # calculate item-item similarity matrix
    print('calculate item-item similarity matrix...')
    # record the time the calculation has spent.
    movie_sim_mat_time = LogTime(print_step=1000)
    for movie1, related_items in movie_sim_mat.items():
        len_movie1 = movie_popular[movie1]
        for movie2, count in related_items.items():
            # similarity = count / sqrt(popularity(movie1) * popularity(movie2));
            # replacing the value of an existing key is safe while iterating.
            related_items[movie2] = count / math.sqrt(len_movie1 * movie_popular[movie2])
        # log steps and times (one step per movie).
        movie_sim_mat_time.count_time()
    print('calculate item-item similarity matrix success.')
    movie_sim_mat_time.finish()
    return movie_sim_mat, movie_popular, movie_count
def calculate_movie_popular(trainset):
    """
    Count how often each movie occurs across all users of the trainset.

    :param trainset: mapping of user -> collection of that user's movies
    :return: tuple (movie_popular, movie_count) where movie_popular is a
        defaultdict movie -> number of occurrences and movie_count is the
        number of distinct movies.
    """
    print('counting movies number and popularity...')
    movie_popular = defaultdict(int)
    # the user key itself is irrelevant here, only the movie collections matter
    for watched in trainset.values():
        for movie_id in watched:
            movie_popular[movie_id] += 1
    print('counting movies number and popularity success.')

    # the total movie number is kept because it is used in evaluation
    movie_count = len(movie_popular)
    print('total movie number = %d' % movie_count)
    return movie_popular, movie_count