Skip to content

Commit

Permalink
完成Item-based CF
Browse files Browse the repository at this point in the history
  • Loading branch information
fuxuemingzhu committed Apr 17, 2018
1 parent 3e350a2 commit 33c0fbc
Show file tree
Hide file tree
Showing 7 changed files with 378 additions and 50 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ target/
# Movie rating data
ml-1m/
ml-1m.zip
ml-100k/
ml-100k.zip
jester_dataset_2.zip
jester_dataset_2/
dataset

# model
model/
Expand Down
151 changes: 151 additions & 0 deletions ItemCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,154 @@
Description : Item-based Collaborative filtering.
"""
import collections
from operator import itemgetter

import math

from collections import defaultdict

import similarity
import utils
from utils import LogTime


class ItemBasedCF:
    """
    Item-based Collaborative filtering.
    Top-N recommendation.

    ``fit`` obtains a movie-to-movie similarity matrix (loaded from a saved
    model or computed from the trainset). ``recommend`` scores every movie the
    user has not rated as the sum of ``similarity_factor * rating`` over the
    K most similar movies of each movie the user did rate.
    """

    def __init__(self, n_sim_movie=20, n_rec_movie=10, save_model=True):
        """
        Init ItemBasedCF with n_sim_movie and n_rec_movie.
        :param n_sim_movie: K, how many of the most similar movies are examined
            for each movie the user has rated.
        :param n_rec_movie: N, how many movies ``recommend`` returns at most.
        :param save_model: when True, ``fit`` saves a newly trained model.
        :return: None
        """
        self.n_sim_movie = n_sim_movie
        self.n_rec_movie = n_rec_movie
        self.trainset = None
        self.save_model = save_model
        # Filled in by fit(); declared here so every attribute exists on a
        # freshly constructed instance.
        self.movie_sim_mat = None
        self.movie_popular = None
        self.movie_count = None

    def _is_fitted(self):
        """Return True when the state read by recommend/test/predict is set and non-empty."""
        return bool(self.n_rec_movie and self.trainset
                    and self.movie_popular and self.movie_count)

    def fit(self, trainset):
        """
        Fit the trainset by calculate movie similarity matrix.

        A previously saved model is tried first. In that case the saved
        trainset replaces the ``trainset`` argument. If loading raises
        ``OSError`` a new model is computed from ``trainset`` and, when
        ``save_model`` is set, saved.
        :param trainset: train dataset
        :return: None
        """
        model_manager = utils.ModelManager()
        try:
            self.movie_sim_mat = model_manager.load_model('movie_sim_mat')
            self.movie_popular = model_manager.load_model('movie_popular')
            self.movie_count = model_manager.load_model('movie_count')
            self.trainset = model_manager.load_model('trainset')
            print('The model has saved before.\nLoad model success...')
        except OSError:
            print('No model saved before.\nTrain a new model...')
            self.movie_sim_mat, self.movie_popular, self.movie_count = \
                similarity.calculate_item_similarity(trainset=trainset)
            self.trainset = trainset
            print('Train a new model success.')
            if self.save_model:
                model_manager.save_model(self.movie_sim_mat, 'movie_sim_mat')
                model_manager.save_model(self.movie_popular, 'movie_popular')
                model_manager.save_model(self.movie_count, 'movie_count')
                model_manager.save_model(self.trainset, 'trainset')
                print('The new model has saved success.')

    def recommend(self, user):
        """
        Find K similar movies and recommend N movies for the user.
        :param user: The user we recommend movies to.
        :return: list of ``(movie, score)`` tuples, best score first, at most
            N entries. An empty list when the user is not in the trainset.
        :raises NotImplementedError: when the model has not been fitted.
        """
        # NotImplementedError (rather than ValueError as in test()) is kept so
        # existing callers that catch it keep working.
        if not self._is_fitted():
            raise NotImplementedError('ItemCF has not init or fit method has not called yet.')
        K = self.n_sim_movie
        N = self.n_rec_movie
        if user not in self.trainset:
            print('The user (%s) not in trainset.' % user)
            # Always hand back a list so callers can iterate the result.
            return []
        predict_score = collections.defaultdict(int)
        watched_movies = self.trainset[user]
        for movie, rating in watched_movies.items():
            # A movie without a similarity row contributes nothing; .get also
            # avoids inserting rows into a defaultdict-based matrix.
            similar_movies = self.movie_sim_mat.get(movie, {})
            top_k = sorted(similar_movies.items(), key=itemgetter(1), reverse=True)[0:K]
            for related_movie, similarity_factor in top_k:
                if related_movie in watched_movies:
                    continue
                # predict the user's "interest" for each movie
                # the predict_score is sum(similarity_factor * rating)
                predict_score[related_movie] += similarity_factor * rating
        # return the N best score movies
        return sorted(predict_score.items(), key=itemgetter(1), reverse=True)[0:N]

    def test(self, testset):
        """
        Test the recommendation system by recommending scores to all users in testset.

        Prints four metrics computed over every user of the trainset:
        precision (hits / (N * users)), recall (hits / ratings in testset for
        those users), coverage (distinct recommended movies / movie_count) and
        popularity (mean of log(1 + movie_popular) per recommendation slot).
        :param testset: test dataset
        :return: None
        :raises ValueError: when the model has not been fitted.
        """
        if not self._is_fitted():
            raise ValueError('ItemCF has not init or fit method has not called yet.')
        self.testset = testset
        print('Test recommendation system start...')
        N = self.n_rec_movie
        # variables for precision and recall
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        # record the calculate time has spent.
        test_time = LogTime(print_step=1000)
        for user in self.trainset:
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            # N slots are counted per user even if fewer movies were returned.
            rec_count += N
            test_count += len(test_movies)
            # log steps and times.
            test_time.count_time()
        # rec_count and movie_count are non-zero here (guaranteed by the
        # fitted check); test_count is zero when the testset holds no rating
        # for any trainset user, so recall needs its own guard.
        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print('Test recommendation system success.')
        test_time.finish()

        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
              (precision, recall, coverage, popularity))

    def predict(self, testset):
        """
        Predict scores of movies to all users in testset.

        For every user of the trainset, keeps the recommended movies that also
        appear in that user's entry of ``testset``.
        :param testset: test dataset
        :return: `dict` : recommend list for each user.
        :raises ValueError: when the model has not been fitted.
        """
        if not self._is_fitted():
            raise ValueError('ItemCF has not init or fit method has not called yet.')
        movies_recommend = defaultdict(list)
        print('Predict scores start...')
        # record the calculate time has spent.
        predict_time = LogTime(print_step=500)
        for user in self.trainset:
            test_movies = testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    movies_recommend[user].append(movie)
            # log steps and times.
            predict_time.count_time()
        print('Predict scores success.')
        predict_time.finish()
        return movies_recommend
20 changes: 10 additions & 10 deletions UserCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def fit(self, trainset):
:param trainset: train dataset
:return: None
"""
model_manager = utils.ModelManager()
try:
print('The model has saved before.\nBegin loading model...')
self.user_sim_mat = utils.load_model('user_sim_mat')
self.movie_popular = utils.load_model('movie_popular')
self.movie_count = utils.load_model('movie_count')
self.trainset = utils.load_model('trainset')
self.user_sim_mat = model_manager.load_model('user_sim_mat')
self.movie_popular = model_manager.load_model('movie_popular')
self.movie_count = model_manager.load_model('movie_count')
print('Load model success.')
except OSError:
print('No model saved before.\nTrain a new model...')
Expand All @@ -54,10 +54,9 @@ def fit(self, trainset):
self.trainset = trainset
print('Train a new model success.')
if self.save_model:
utils.save_model(self.user_sim_mat, 'user_sim_mat')
utils.save_model(self.movie_popular, 'movie_popular')
utils.save_model(self.movie_count, 'movie_count')
utils.save_model(self.trainset, 'trainset')
model_manager.save_model(self.user_sim_mat, 'user_sim_mat')
model_manager.save_model(self.movie_popular, 'movie_popular')
model_manager.save_model(self.movie_count, 'movie_count')
print('The new model has saved success.')

def recommend(self, user):
Expand All @@ -79,11 +78,12 @@ def recommend(self, user):
# record the calculate time has spent.
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
for movie, rating in self.trainset[similar_user].items():
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
predict_score[movie] += similarity_factor
# the predict_score is sum(similarity_factor * rating)
predict_score[movie] += similarity_factor * rating
# log steps and times.
# print('Recommend movies to user success.')
# return the N best score movies
Expand Down
73 changes: 55 additions & 18 deletions dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,37 @@
import os
import itertools
import random

ml_1m = {
'url' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
'path' : 'ml-1m/ratings.dat',
'reader_params': dict(line_format='user item rating timestamp',
rating_scale=(1, 5),
sep='::')
from collections import namedtuple

# Description of one built-in rating dataset:
#   url           - where the archive can be downloaded from
#   path          - location of the ratings file that load_dataset opens
#   sep           - field separator handed to parse_line
#   reader_params - extra reader settings (not read by the code visible here)
BuiltinDataset = namedtuple('BuiltinDataset', 'url path sep reader_params')

# Built-in datasets, keyed by the name accepted by DataSet.load_dataset.
BUILTIN_DATASETS = {
    'ml-100k': BuiltinDataset(
        url='http://files.grouplens.org/datasets/movielens/ml-100k.zip',
        path='data/ml-100k/u.data',
        sep='\t',
        reader_params={
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '\t',
        },
    ),
    'ml-1m': BuiltinDataset(
        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        path='data/ml-1m/ratings.dat',
        sep='::',
        reader_params={
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '::',
        },
    ),
    'jester': BuiltinDataset(
        url='http://eigentaste.berkeley.edu/dataset/jester_dataset_2.zip',
        path='data/jester/jester_ratings.dat',
        sep='\t\t',
        reader_params={
            'line_format': 'user item rating',
            'rating_scale': (-10, 10),
        },
    ),
}

# modify the random seed will change dataset spilt.
Expand All @@ -36,34 +60,46 @@ def __init__(self):
pass

@classmethod
def load_ml_1m(cls):
"""Load ml-1m dataset.
def load_dataset(cls, name='ml-100k'):
"""Load a built-in dataset.
:param name:string: The name of the built-in dataset to load.
Accepted values are 'ml-100k', 'ml-1m', and 'jester'.
Default is 'ml-100k'.
:return: ratings for each line.
"""
if not os.path.isfile(ml_1m['path']):
raise OSError("Dataset ml-1m could not be found in this project. Please download it from " + ml_1m[
'url'] + ' manually and unzip it to this directory.')
with open(ml_1m['path']) as f:
ratings = [cls.parse_line(line) for line in itertools.islice(f, 0, None)]
print("Load ml-1m dataset success.")
try:
dataset = BUILTIN_DATASETS[name]
except KeyError:
raise ValueError('unknown dataset ' + name +
'. Accepted values are ' +
', '.join(BUILTIN_DATASETS.keys()) + '.')
if not os.path.isfile(dataset.path):
raise OSError(
"Dataset data/" + name + " could not be found in this project.\n"
"Please download it from " + dataset.url +
' manually and unzip it to data/ directory.')
with open(dataset.path) as f:
ratings = [cls.parse_line(line, dataset.sep) for line in itertools.islice(f, 0, None)]
print("Load " + name + " dataset success.")
return ratings

@classmethod
def parse_line(cls, line: str):
def parse_line(cls, line: str, sep: str):
"""
Parse a line.
Ratings as ensured to positive integers.
the separator in rating.data is `::`.
:param sep: the separator between fields. Example : ``';'``.
:param line: The line to parse
:return: tuple: User id, item id, rating score.
The timestamp will be ignored cause it wasn't used in Collaborative filtering.
"""
user, movie, rate, _ = line.strip('\r\n').split("::")
user, movie, rate, _ = line.strip('\r\n').split(sep)
return user, movie, rate

@classmethod
Expand All @@ -75,14 +111,15 @@ def train_test_split(cls, ratings, test_size=0.2):
The rating file should be a instance of DataSet.
:param ratings: raw dataset
:param test_size: the percentage of test size.
:return: train_set and test_set
"""
train, test = collections.defaultdict(dict), collections.defaultdict(dict)
trainset_len = 0
testset_len = 0
for user, movie, rate in ratings:
if random.random() < test_size:
if random.random() <= test_size:
test[user][movie] = int(rate)
testset_len += 1
else:
Expand Down
36 changes: 31 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,44 @@
@author: fuxuemingzhu
"""
import utils
from ItemCF import ItemBasedCF
from UserCF import UserBasedCF
from dataset import DataSet
from utils import LogTime

if __name__ == '__main__':
ratings = DataSet.load_ml_1m()
train, test = DataSet.train_test_split(ratings, test_size=0.3)
usercf = UserBasedCF()
usercf.fit(train)
main_time = LogTime("Main Function")
dataset_name = 'ml-100k'
model_manager = utils.ModelManager(dataset_name)
try:
train = model_manager.load_model('trainset')
test = model_manager.load_model('testset')
except OSError:
ratings = DataSet.load_dataset(name=dataset_name)
train, test = DataSet.train_test_split(ratings, test_size=0.3)
model_manager.save_model(train, 'trainset')
model_manager.save_model(test, 'testset')
'''Do you want to clean workspace and retrain model again?'''
'''if you want to change test_size or retrain model, please set clean_workspace True'''
# utils.clean_workspace(False)
# usercf = UserBasedCF()
# usercf.fit(train)
# recommend100 = usercf.recommend('100')
# recommend88 = usercf.recommend('88')
# recommend89 = usercf.recommend('89')
# print("recommend for userid = 100:\n", recommend100)
# print("recommend for userid = 88:\n", recommend88)
# print("recommend for userid = 89:\n", recommend89)
usercf.test(test)
# usercf.test(test)
itemcf = ItemBasedCF()
itemcf.fit(train)
# recommend100 = itemcf.recommend('100')
# recommend88 = itemcf.recommend('88')
# recommend89 = itemcf.recommend('89')
# print("recommend for userid = 100:\n", recommend100)
# print("recommend for userid = 88:\n", recommend88)
# print("recommend for userid = 89:\n", recommend89)
itemcf.test(test)

main_time.finish()
Loading

0 comments on commit 33c0fbc

Please sign in to comment.