hybrid_filtering_tfidf_svd.py
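"""Streamlit app: hybrid movie recommender.

Combines content-based filtering (TF-IDF over a metadata "soup" of keywords,
cast, director, and genres) with collaborative filtering (a Surprise SVD model
trained on the small MovieLens ratings file) to produce personalised
recommendations.
"""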
import streamlit as st
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
# Load datasets
@st.cache_data()
def load_data():
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')
links_small = pd.read_csv('data/links_small.csv')
md = pd.read_csv('data/movies_metadata.csv')
ratings = pd.read_csv('data/ratings_small.csv')
return credits, keywords, links_small, md, ratings
credits, keywords, links_small, md, ratings = load_data()
# Preprocess data
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)
links_small = links_small.dropna(subset=['tmdbId']) # Drop rows with NaN tmdbId
links_small['tmdbId'] = links_small['tmdbId'].astype('int')
# Some ids in movies_metadata.csv are malformed; coerce them to int or NaN
def convert_int(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return np.nan

md['id'] = md['id'].apply(convert_int)
md = md.drop([19730, 29503, 35587])  # Rows with malformed ids that become NaN after conversion
md['id'] = md['id'].astype('int')
# Merge datasets
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
smd = md[md['id'].isin(links_small['tmdbId'])].copy()  # Subset to movies in links_small; copy to avoid SettingWithCopyWarning
# Parse data
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))
# Extract director
def get_director(x):
for i in x:
if i['job'] == 'Director':
return i['name']
return np.nan
smd['director'] = smd['crew'].apply(get_director)
# Process cast, director, and keywords
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + ' ' + smd['tagline']
smd['description'] = smd['description'].fillna('')
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3])  # Keep only the top three billed cast members
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])  # Repeat the director three times to give it more weight in the soup
# Filter keywords
stemmer = SnowballStemmer('english')
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]
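# Keep only keywords that occur more than once across the corpus (those remaining in s)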
def filter_keywords(x):
words = []
for i in x:
if i in s:
words.append(i)
return words
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# Create the metadata "soup": keywords, cast, director, and genres combined per movie
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(map(str, x)))  # Join the list of tokens into a single string
# Build TF-IDF matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(smd['soup'])
# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
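# cosine_sim[i][j] is the cosine similarity between the TF-IDF soups of movies i and j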
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
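# indices maps a movie title to its row position in smd (and hence into cosine_sim)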
# Create mappings between movie IDs and titles
id_to_title = dict(zip(smd['id'], smd['title'].str.strip()))
title_to_id = dict(zip(smd['title'].str.strip(), smd['id']))
# Function to find the closest matching movie title
def find_matching_movie(title):
return process.extractOne(title, title_to_id.keys())[0]
# Build the Surprise dataset from the ratings and use the full data as the trainset
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
# Initialize SVD algorithm and train the model
svd = SVD()
svd.fit(trainset)
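# The trained model can now estimate a rating for any (userId, movieId) pair via svd.predict()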
# Map MovieLens movieId <-> TMDB id for the hybrid function
id_map = links_small[['movieId', 'tmdbId']].copy()
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
indices_map = id_map.set_index('id')
# Hybrid recommendation function
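# Two-stage approach: first shortlist the 25 movies most similar to the selected title
# by TF-IDF cosine similarity, then re-rank that shortlist by the rating the SVD model
# predicts for the given user, returning the top 10.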
def hybrid(userId, title):
    # Content-based step: shortlist the 25 movies most similar to the selected title
    idx = indices[title]
    if isinstance(idx, pd.Series):  # Duplicate titles map to several rows; take the first
        idx = idx.iloc[0]
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]  # Skip position 0, which is the movie itself
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date', 'id']].copy()
    # Collaborative step: re-rank the shortlist by the SVD-predicted rating for this user
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10).reset_index(drop=True)
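# Example usage (hypothetical values): hybrid(1, 'The Dark Knight') would return a
# DataFrame of up to 10 movies similar to that title, ordered by the rating predicted
# for user 1, assuming the title exists in smd.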
def calculate_genre_percentage(userId):
    # Get the MovieLens ids of the movies this user has rated
    user_movies = ratings[ratings['userId'] == userId]
    user_movie_ids = user_movies['movieId'].tolist()
    # Translate MovieLens movieIds to TMDB ids before looking up genres in smd
    user_tmdb_ids = links_small[links_small['movieId'].isin(user_movie_ids)]['tmdbId']
    user_genres = smd[smd['id'].isin(user_tmdb_ids)]['genres'].sum()
    # Count genre occurrences as a percentage of all genre tags
    genre_counts = pd.Series(user_genres).value_counts(normalize=True) * 100
    return genre_counts
# Streamlit UI
st.title('Movie Recommendation System')
# Dropdown menu for movie title
movie_title = st.selectbox('Select a movie title:', titles)
# Look up the TMDB id for the selected title, falling back to fuzzy matching
selected_movie_id = title_to_id.get(movie_title)
if selected_movie_id is None:
    closest_match = find_matching_movie(movie_title)
    selected_movie_id = title_to_id.get(closest_match)
    if selected_movie_id:
        st.write(f"Selected Movie: {closest_match} - ID: {selected_movie_id}")
    else:
        st.write(f"Movie '{movie_title}' not found or not in the training set.")
# User input for userId
user_id = st.number_input('Enter your User ID:', min_value=1, step=1)
# Calculate genre interests for the selected user (map MovieLens movieIds to TMDB ids first)
user_ratings = ratings[ratings['userId'] == user_id]
user_tmdb_ids = links_small[links_small['movieId'].isin(user_ratings['movieId'])]['tmdbId']
user_movies = smd[smd['id'].isin(user_tmdb_ids)]
genre_counts = user_movies['genres'].apply(pd.Series).stack().value_counts(normalize=True)
genre_table = pd.DataFrame({'Genre': genre_counts.index, 'Interest': genre_counts.values})
genre_table['Interest'] = genre_table['Interest'].map(lambda x: f"{x:.2%}")
# Button to recommend movies
if st.button('Recommend Movies'):
if selected_movie_id:
hybrid_recommendations = hybrid(user_id, movie_title)
if not hybrid_recommendations.empty:
hybrid_recommendations = hybrid_recommendations.reset_index(drop=True)
st.write(f"Top hybrid recommendations for user {user_id} and movie '{movie_title}':")
for i, rec in hybrid_recommendations.iterrows():
st.write(f"{i+1}. {rec['title']} (Release Date: {rec['release_date']}, "
f"Vote Count: {rec['vote_count']}, Vote Average: {rec['vote_average']:.2f}, "
f"Estimated Rating: {rec['est']:.2f})")
else:
st.write(f"No hybrid recommendations found for '{movie_title}'.")
else:
st.write(f"Movie '{movie_title}' not found or not in the training set.")
# Button to show genre interests
if st.button('Show Genre Interest'):
    genre_percentage = calculate_genre_percentage(user_id)
    genre_percentage = genre_percentage.rename('Interest (%)')  # Name the Series so the table column is labelled
    st.write(f"\nGenre interest percentage for User {user_id}:")
    st.table(genre_percentage)