-
Notifications
You must be signed in to change notification settings - Fork 1
/
souce_code.py
163 lines (123 loc) · 4.67 KB
/
souce_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# -*- coding: utf-8 -*-
"""Mini_Project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1RJPzowjB2WGLAwpwDOnAmyBnNWe5J5id
"""
import pandas as pd
# Load the raw PS5 games dataset from the Colab working directory.
df = pd.read_csv('/content/ps5.csv')

# --- Initial inspection ------------------------------------------------
# Display the first few rows of the dataset
print(df.head())
# Get summary statistics
print(df.describe())
# Get information about the dataset.
# DataFrame.info() writes its report to stdout on its own and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()
# Check for missing values
print(df.isnull().sum())

# --- Cleaning ----------------------------------------------------------
# Drop every row that has a missing value in any column
# (alternative: df = df.ffill() to forward-fill instead of dropping).
df = df.dropna()
# Remove exact duplicate rows
df = df.drop_duplicates()

# --- Release-date features ---------------------------------------------
# Inspect the releaseDate column before converting it
print(df['releaseDate'].head())
print(df['releaseDate'].dtype)
# Convert releaseDate to datetime; errors='coerce' turns unparseable values
# into NaT instead of raising.
df['releaseDate'] = pd.to_datetime(df['releaseDate'], errors='coerce')
# Number of entries that could not be converted (now NaT)
print(df['releaseDate'].isnull().sum())
# Extract year and month from releaseDate
df['releaseYear'] = df['releaseDate'].dt.year
df['releaseMonth'] = df['releaseDate'].dt.month
# Verify the new columns
print(df[['releaseDate', 'releaseYear', 'releaseMonth']].head())
import seaborn as sns
import matplotlib.pyplot as plt
# --- Rating trend across release years ----------------------------------
# seaborn returns the matplotlib Axes it drew on; label it directly.
plt.figure(figsize=(10, 6))
trend_ax = sns.lineplot(data=df, x='releaseYear', y='starRating/averageRating')
trend_ax.set_title('User Ratings Over the Years')
trend_ax.set_xlabel('Release Year')
trend_ax.set_ylabel('Average Rating')
plt.show()

# --- Rating distribution per release month ------------------------------
plt.figure(figsize=(10, 6))
month_ax = sns.boxplot(data=df, x='releaseMonth', y='starRating/averageRating')
month_ax.set_title('User Ratings by Release Month')
month_ax.set_xlabel('Release Month')
month_ax.set_ylabel('Average Rating')
plt.show()

# --- Ten publishers with the highest mean rating ------------------------
mean_rating_by_publisher = df.groupby('publisherName')['starRating/averageRating'].mean()
top_publishers = mean_rating_by_publisher.sort_values(ascending=False).head(10)
print(top_publishers)

# Bar chart of those publishers; tick labels are tilted so the publisher
# names overlap less.
plt.figure(figsize=(12, 6))
publisher_ax = sns.barplot(x=top_publishers.index, y=top_publishers.values)
publisher_ax.set_title('Top Publishers by Average Rating')
publisher_ax.set_xlabel('Publisher')
publisher_ax.set_ylabel('Average Rating')
plt.xticks(rotation=45)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Small hand-written sample frame used for the age-restriction analysis.
# NOTE(review): assigning to `df` here replaces the frame loaded from the CSV
# earlier in the script, so everything below runs on these two sample rows.
# Presumably the real dataset was intended -- confirm with the author.
data = {
    'url': ['http://example.com', 'http://example2.com'],
    'id': ['1', '2'],
    'publisherName': ['Publisher1', 'Unknown'],
    'releaseDate': ['2022-01-01', '2022-02-01'],
    'name': ['Item1', 'Item2'],
    'isAgeRestricted': [True, False],
    'activeCtaId': ['cta1', 'cta2'],
    'starRating/averageRating': [4.5, 3.8],
    'starRating/totalRatingsCount': [100, 50],
}
df = pd.DataFrame(data)

# Mean star rating for each value of the isAgeRestricted flag
ratings_grouped_by_restriction = df.groupby('isAgeRestricted')['starRating/averageRating']
age_restrictions = ratings_grouped_by_restriction.mean()
print(age_restrictions)

# Bar chart of the two group means, labelled through the returned Axes
restriction_ax = age_restrictions.plot.bar(color=['blue', 'orange'])
restriction_ax.set_xlabel('Is Age Restricted')
restriction_ax.set_ylabel('Average Star Rating')
restriction_ax.set_title('Average Star Rating by Age Restriction')
restriction_ax.set_xticks([0, 1])
restriction_ax.set_xticklabels(['False', 'True'], rotation=0)
plt.show()

# Column names of the frame
print(df.columns)
# Columns stored as object dtype (i.e. string-valued features)
print(df.select_dtypes(include=['object']).columns)
# dtype of the target variable
print(df.dtypes['starRating/averageRating'])
import joblib
import pandas as pd
# Load the pre-trained rating model from disk.
# NOTE(security): joblib.load is pickle-based and can execute arbitrary code
# while loading; only load model files from a trusted source.
model = joblib.load('ps5_game_rating_predictor.pkl')

# One sample game to score; every value is a single-element list so the dict
# maps directly onto a one-row DataFrame.
new_data = {
    'url': ['https://store.playstation.com/en-us/product/UP0101-PPSA19225_00-0159266583099383'],
    'id': ['1'],
    'publisherName': ['Konami Digital Entertainment, Inc.'],  # sample publisher
    'releaseDate': ['2024-07-20'],  # sample release date
    'name': ['Sample Game'],  # sample game name
    'isAgeRestricted': [False],  # sample age restriction
    'activeCtaId': ['cta1'],  # sample activeCtaId
    'starRating/totalRatingsCount': [50],  # sample total ratings count
}

# Columns that carry no numeric signal and are removed before prediction.
non_numeric_cols = ['url', 'id', 'publisherName', 'name', 'releaseDate', 'activeCtaId']

# Feature columns handed to the model, in order.
# This list must match exactly the features used during training.
training_features = [
    'isAgeRestricted',
]

# Build the one-row frame, drop the non-numeric columns and convert the
# boolean flag to an integer.
new_df = (
    pd.DataFrame(new_data)
    .drop(columns=non_numeric_cols)
    .astype({'isAgeRestricted': int})
)
# Keep only the training features
new_df = new_df.loc[:, training_features]

# Predict and report the rating for the sample row
prediction = model.predict(new_df)
print(f'Predicted Average Rating: {prediction[0]}')
# TODO: correctThisOne -- marker left by the original author on this section.