diff --git a/README.md b/README.md
index 5c34702..f7cdc34 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,10 @@
 to provide a rich set of components from which you can construct a
 customized recommender system from a set of algorithms.
 
+In this fork, I (gabrielspmoreira) have made the following changes:
+* Fixed compatibility issues with the latest scikit-learn release (the deprecated `scikits.learn.base` import of `BaseEstimator`, and the `_set_params` method renamed to `set_params`)
+* Implemented a hybrid method that combines Collaborative Filtering with Global Baseline estimates (based on the users' and items' average preferences), inspired by the [Implementing Collaborative Filtering class](https://class.coursera.org/mmds-001/lecture/95) from the [Coursera Mining Massive Datasets course](https://www.coursera.org/course/mmds).
+
 ## Usage
 
 For Usage and Instructions checkout the [Crab Wiki](https://github.com/muricoca/crab/wiki)
@@ -14,7 +18,7 @@ The project was started in 2010 by Marcel Caraciolo as a M.S.C related project,
 and since then many people interested joined to help in the project. It is
 currently maintained by a team of volunteers, members of the Muriçoca Labs.
 
-## Authors
+## Original Authors
 
  Marcel Caraciolo (marcel@muricoca.com)
 
diff --git a/scikits/crab/base.py b/scikits/crab/base.py
index b49c70d..435e6fa 100644
--- a/scikits/crab/base.py
+++ b/scikits/crab/base.py
@@ -8,7 +8,7 @@
 # Bruno Melo
 # License: BSD Style.
 
-from scikits.learn.base import BaseEstimator
+from sklearn.base import BaseEstimator
 
 
 class BaseRecommender(BaseEstimator):
diff --git a/scikits/crab/metrics/classes.py b/scikits/crab/metrics/classes.py
index f416561..9f9f8dc 100644
--- a/scikits/crab/metrics/classes.py
+++ b/scikits/crab/metrics/classes.py
@@ -21,7 +21,7 @@
 from metrics import recall_score
 from metrics import f1_score
 from sampling import SplitSampling
-from scikits.learn.base import clone
+from sklearn.base import clone
 from ..models.utils import ItemNotFoundError, UserNotFoundError
 
 
diff --git a/scikits/crab/recommenders/knn/classes.py b/scikits/crab/recommenders/knn/classes.py
index bf96af3..6896d38 100644
--- a/scikits/crab/recommenders/knn/classes.py
+++ b/scikits/crab/recommenders/knn/classes.py
@@ -135,7 +135,7 @@ def recommend(self, user_id, how_many=None, **params):
             Desired number of recommendations (default=None ALL)
 
         '''
-        self._set_params(**params)
+        self.set_params(**params)
 
         candidate_items = self.all_other_items(user_id)
 
@@ -597,7 +597,7 @@ def recommend(self, user_id, how_many=None, **params):
 
         '''
-        self._set_params(**params)
+        self.set_params(**params)
 
         candidate_items = self.all_other_items(user_id, **params)
 
@@ -706,3 +706,121 @@ def recommended_because(self, user_id, item_id, how_many=None, **params):
                 for ind in sorted_preferences]
 
         return top_n_recs
+
+
+#=====================
+#User Based Recommender combined with Global Baseline estimates.
+#Based on the Collaborative Filtering class from the Coursera Mining Massive
+#Datasets course, available at https://class.coursera.org/mmds-001/lecture/95
+class UserBasedRecommenderCombinedWithGlobalBaseline(UserBasedRecommender):
+    #Global preference mean, cached lazily on first use
+    global_preferences_mean = None
+
+    def _get_global_preferences_mean(self):
+        if self.global_preferences_mean is None:
+            self.global_preferences_mean = np.nanmean(self.model.index)
+        return self.global_preferences_mean
+
+    def get_global_baseline_estimate(self, user_id, item_id):
+        '''
+        Parameters
+        ----------
+        user_id: int or string
+            User for which recommendations are to be computed.
+
+        item_id: int or string
+            ID of the item for which to estimate the preference.
+
+        Returns
+        -------
+        Returns an estimated preference based on the Global Baseline
+        strategy, where the user's average preference and the item's
+        average preference are combined with the global preference
+        average to produce an estimate of the user-item preference.
+        '''
+        global_mean = self._get_global_preferences_mean()
+        user_preferences = self.model.preferences_from_user(user_id, order_by_id=False)
+        item_preferences = self.model.preferences_for_item(item_id, order_by_id=False)
+        #Average preference given by the user and received by the item
+        user_preferences_mean = sum(map(lambda i: i[1], user_preferences)) / float(len(user_preferences))
+        item_preferences_mean = sum(map(lambda i: i[1], item_preferences)) / float(len(item_preferences))
+
+        #Global mean corrected by the user's and the item's deviations from it
+        baseline_estimate = global_mean + (user_preferences_mean - global_mean) \
+            + (item_preferences_mean - global_mean)
+
+        return baseline_estimate
+
+    def estimate_preference(self, user_id, item_id, **params):
+        '''
+        Parameters
+        ----------
+        user_id: int or string
+            User for which recommendations are to be computed.
+
+        item_id: int or string
+            ID of the item for which to estimate the preference.
+
+        Returns
+        -------
+        Returns an estimated preference computed by Collaborative Filtering
+        combined with the Global Baseline estimate for the user and item.
+        This hybrid method copes better with the user and item cold-start
+        problems.
+        '''
+        preference = self.model.preference_value(user_id, item_id)
+        if not np.isnan(preference):
+            return preference
+
+        n_similarity = params.pop('n_similarity', 'user_similarity')
+        distance = params.pop('distance', self.similarity.distance)
+        nhood_size = params.pop('nhood_size', None)
+
+        nearest_neighbors = self.neighborhood_strategy.user_neighborhood(user_id,
+                self.model, n_similarity, distance, nhood_size, **params)
+
+        similarities = np.array([self.similarity.get_similarity(user_id, to_user_id)
+                for to_user_id in nearest_neighbors]).flatten()
+
+        prefs = np.array([self.model.preference_value(to_user_id, item_id)
+                for to_user_id in nearest_neighbors])
+
+        prefs_baseline = np.array([self.get_global_baseline_estimate(to_user_id, item_id)
+                for to_user_id in nearest_neighbors])
+
+        #Keep only the neighbors with a defined preference for the item,
+        #applying the same mask to the three aligned arrays
+        mask = ~np.isnan(prefs)
+        prefs = prefs[mask]
+        similarities = similarities[mask]
+        prefs_baseline = prefs_baseline[mask]
+
+        #Similarity-weighted sum of the neighbors' deviations from their
+        #own baseline estimates, ignoring undefined similarities
+        prefs_sim_gb = np.sum((prefs[~np.isnan(similarities)] - prefs_baseline[~np.isnan(similarities)]) *
+                similarities[~np.isnan(similarities)])
+
+        total_similarity = np.sum(similarities)
+
+        #Throw out the estimate if it was based on no data points,
+        #of course, but also if based on just one. This is a bit
+        #of a band-aid on the 'stock' item-based algorithm for
+        #the moment. The reason is that in this case the estimate
+        #is, simply, the user's rating for one item that happened
+        #to have a defined similarity. The similarity score doesn't
+        #matter, and that seems like a bad situation.
+        if total_similarity == 0.0 or \
+           not similarities[~np.isnan(similarities)].size:
+            return np.nan
+
+        #User-item baseline corrected by the average neighbor deviation
+        baseline = self.get_global_baseline_estimate(user_id, item_id)
+        estimated_with_gb = baseline + (prefs_sim_gb / total_similarity)
+
+        if self.capper:
+            #Clamp the estimate to the model's preference range
+            max_p = self.model.maximum_preference_value()
+            min_p = self.model.minimum_preference_value()
+            estimated_with_gb = max(min_p, min(max_p, estimated_with_gb))
+
+        return estimated_with_gb
\ No newline at end of file
diff --git a/scikits/crab/recommenders/svd/classes.py b/scikits/crab/recommenders/svd/classes.py
index 36be33a..5f9bcb5 100644
--- a/scikits/crab/recommenders/svd/classes.py
+++ b/scikits/crab/recommenders/svd/classes.py
@@ -286,7 +286,7 @@ def recommend(self, user_id, how_many=None, **params):
             Desired number of recommendations (default=None ALL)
 
         '''
-        self._set_params(**params)
+        self.set_params(**params)
 
         candidate_items = self.all_other_items(user_id)
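
Below is a minimal usage sketch of the new hybrid recommender; it is not part of the diff itself. It assumes this fork is installed as `scikits.crab` and that the standard crab classes (`MatrixPreferenceDataModel`, `UserSimilarity`, `pearson_correlation`) are importable as in the upstream tutorial; the toy ratings dictionary is made up for illustration. The baseline arithmetic it exercises: with a global mean of 3.2, a user whose average is 3.5 and an item whose average is 4.0 yield a baseline of 3.2 + 0.3 + 0.8 = 4.3, which the CF step then corrects by the similarity-weighted deviations of the neighbors.

```python
# Usage sketch (Python 2, matching the codebase). The ratings dict is a
# made-up toy example; only the class names come from this fork.
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn.classes import \
    UserBasedRecommenderCombinedWithGlobalBaseline

ratings = {
    'Alice': {'item1': 5.0, 'item2': 3.0, 'item4': 4.0},
    'Bob':   {'item1': 3.0, 'item2': 1.0, 'item3': 2.0, 'item4': 3.0},
    'Carol': {'item1': 4.0, 'item2': 3.0, 'item3': 4.0, 'item4': 3.0},
}

model = MatrixPreferenceDataModel(ratings)
similarity = UserSimilarity(model, pearson_correlation)
#capper=True clamps estimates to the model's preference range
recommender = UserBasedRecommenderCombinedWithGlobalBaseline(
    model, similarity, capper=True)

#Hybrid estimate: user-item baseline plus the similarity-weighted
#average of the neighbors' deviations from their own baselines
print recommender.estimate_preference('Alice', 'item3')
print recommender.recommend('Alice')
```

Note that `estimate_preference` returns the stored preference when one already exists, so the hybrid path (and hence the cold-start benefit) only kicks in for unseen user-item pairs.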