forked from SurajRKU/SE_PROJECT_GRP_12
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_based_recommendation.py
76 lines (58 loc) · 2.9 KB
/
plot_based_recommendation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Load movie metadata from the CSV file "movies_metadata.csv" and return its first `movie_length` rows.
def get_data(movie_length, csv_path="movies_metadata.csv"):
    """Load movie metadata from a CSV file and keep only the leading rows.

    Args:
        movie_length: Number of rows to keep from the top of the file.
            ``None`` keeps every row.
        csv_path: Location of the metadata CSV. Defaults to the file name the
            script has always used, so existing callers are unaffected.

    Returns:
        A ``pandas.DataFrame`` holding the first ``movie_length`` rows. It is
        an independent copy, so callers may add or overwrite columns without
        pandas warning about writes to a slice of another frame.
    """
    # low_memory=False makes pandas parse the file in a single pass, so each
    # column gets one consistently inferred dtype.
    metadata = pd.read_csv(csv_path, low_memory=False)
    return metadata.iloc[:movie_length].copy()
# Build the TF-IDF matrix of the movie overviews, compute the pairwise cosine similarity between all movies from it, and save the similarity matrix to disk.
def compute_tfidfmatrix(metadata, output_path="cosine_similarity_10k"):
    """Compute and persist the overview-based similarity matrix.

    Args:
        metadata: DataFrame with an ``"overview"`` text column. It is only
            read; missing overviews are replaced on a local copy of the
            column rather than written back into the caller's frame.
        output_path: File name handed to ``numpy.savez``. Defaults to the
            name the function has always written to.

    Returns:
        The square similarity matrix (one row and one column per movie) that
        was also stored under the key ``"matrix"`` in the saved archive.
    """
    # English stop words such as 'the' and 'a' are removed before weighting.
    tfidf = TfidfVectorizer(stop_words="english")
    # Missing overviews become empty documents instead of NaN, which the
    # vectorizer cannot tokenize.
    overviews = metadata["overview"].fillna("")
    tfidf_matrix = tfidf.fit_transform(overviews)
    # With the vectorizer's default L2 normalisation, the plain dot product
    # computed by linear_kernel is the cosine similarity.
    cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
    np.savez(output_path, matrix=cosine_similarity)
    return cosine_similarity
# Look up the given title and return the titles of the 10 movies most similar to it according to the similarity matrix (None if the title is unknown).
def get_recommendations(title, indices, cosine_sim, titles=None, top_n=10):
    """Return the movies most similar to ``title``.

    Args:
        title: Movie title to look up.
        indices: Mapping from title to row position (the script builds it as
            a ``pandas.Series`` indexed by title).
        cosine_sim: Square similarity matrix; row ``i`` holds the similarity
            of movie ``i`` to every movie.
        titles: Optional ``pandas.Series`` of titles in row order. When
            omitted, the module-level ``metadata["title"]`` is used, which is
            how the script has always called this function.
        top_n: Maximum number of recommendations to return.

    Returns:
        ``None`` when ``title`` is not in ``indices``; otherwise a
        ``pandas.Series`` of up to ``top_n`` titles ordered from most to
        least similar, never containing the queried movie itself.

    Raises:
        NameError: If ``titles`` is omitted and no module-level ``metadata``
            exists (for example when the function is imported elsewhere).
    """
    if title not in indices:
        return None

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions; use the first occurrence instead of crashing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    if titles is None:
        # Backward-compatible fallback to the global set up by the script.
        titles = metadata["title"]

    # Drop the queried movie explicitly. Relying on it sorting first breaks
    # when another movie ties with it (e.g. identical or empty overviews).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # The sort is stable, so ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return titles.iloc[movie_indices]
if __name__ == "__main__":
    # Interactive entry point: load the metadata and the precomputed
    # similarity matrix, then answer title queries until the user stops.
    #
    # `metadata` is deliberately a module-level name: get_recommendations
    # reads it as a global when resolving row positions back to titles.
    metadata = get_data(movie_length=5000)

    # The similarity matrix is expected to exist on disk already.
    # compute_tfidfmatrix(metadata) can regenerate one, but check that the
    # file name it writes matches the one loaded here.
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
    # objects; only load archives produced locally by this project.
    with np.load("cosine_similarity_5k.npz", allow_pickle=True) as data:
        cosine_similarity = data["matrix"]

    # Map each title to its row position. Series.drop_duplicates() would
    # compare the row positions (already unique), so deduplicate on the
    # title labels instead and keep the first occurrence of each title.
    indices = pd.Series(metadata.index, index=metadata["title"])
    indices = indices[~indices.index.duplicated(keep="first")]

    while True:
        movie = input("Name of movie from movie list: ")
        recommendations = get_recommendations(movie, indices, cosine_similarity)
        if recommendations is None:
            print("Given movie not in database, try again")
            continue
        print("Following are the recommended movies if you like : ", movie)
        print(recommendations)
        response = input("Do you want to continue?(yes/no)")
        if response.strip().lower() == "no":
            break