-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcustom-functions.R
235 lines (213 loc) · 8.06 KB
/
custom-functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# For aggregation
library(plyr)
# For evaluation
library("Metrics")
# Returns a functions that takes user row input as parameter
# removeSeen: Wheter to recommend already seen interactions
# numberOfRecommendations: how many items to recommend
PopularRecommender <- function(urm, removeSeen, numberOfRecommendations) {
mostPopularItems <- MostPopularItems(urm)
function(userRow) {
RecommendForUser(userRow, mostPopularItems, removeSeen, numberOfRecommendations)
}
}
# Returns a functions that takes user row input as parameter
# removeSeen: Wheter to recommend already seen interactions
# movies metadata
# numberOfRecommendations: how many items to recommend
PopularByGenreRecommender <- function(urm, removeSeen, numberOfRecommendations) {
movies <- LoadMovies()
mostPopularByGenre <- MostPopularByGenre(urm, movies)
moviesWithGenre <- ddply(movies, "MovieId", function(row) {
# Extract genre vector
Genre <- row$Genre[[1]]
data.frame(Genre)
})
function(userRow) {
userRatings <- urm[urm$UserId == userRow$UserId,]
# Could be filtered by 3 here to be more flexible
goodRatings <- FilterGoodRatings(userRatings, 3)
# Anotate ratings with genre
ratingsWithGenre <- merge(goodRatings, moviesWithGenre, by.x="ItemId", by.y="MovieId")
itemCountPerGenre <- count(ratingsWithGenre, "Genre")
itemCountPerGenre <- arrange(itemCountPerGenre, desc(freq))
totalCount <- sum(itemCountPerGenre$freq)
itemCountPerGenre$share <- ceiling(5 * itemCountPerGenre$freq / totalCount)
moviesTaken <- ddply(itemCountPerGenre, "Genre", function(genreRow) {
genre <- genreRow$Genre
ItemId <- head(GetItemIdsVector(mostPopularByGenre[mostPopularByGenre$Genre == genre,]),genreRow$share)
data.frame(ItemId)
})
# Extract recommended items
recommendedItems <- as.vector(moviesTaken$ItemId)
# TODO: For each genre where the share is greater than 0 take some movies
#GetItemIdsVector(mostPopularByGenre[mostPopularByGenre$Genre == "Drama",])
RecommendForUser(userRow, recommendedItems, removeSeen, numberOfRecommendations)
}
}
# Recommend n items removing those seen if indicated
RecommendForUser <- function(userRow, recommendedItems, removeSeen, numberOfRecommendations) {
if (removeSeen) {
itemsToRecommend <- RemoveSeenItems(userRow, recommendedItems)
} else {
itemsToRecommend <- recommendedItems
}
# Take as many as recommendations as requested
head(recommendedItems, numberOfRecommendations)
}
# Return most popular items by count of goodRatings
# Expects a User Rating Matrix data frame
# with the columns "UserId", "ItemId" and "Rating"
MostPopularItems <- function(urm) {
goodRatings <- FilterGoodRatings(urm)
# Ordering movies by popularity
mostPopularMovies <- OrderItemsByCount(goodRatings)
}
# As per competition forum
FilterGoodRatings <- function(urm, threshold = 4) {
goodRatings <- urm[urm$Rating >= threshold,]
row.names(goodRatings) <- NULL
return(goodRatings)
}
# Expects a dataFrame with "ItemId" repeated
# Returns a vector ordered by count
OrderItemsByCount <- function(df) {
# Ordering items by count
itemCount <- CountItems(df)
itemsByCount <- arrange(itemCount, desc(freq))
# Only items as a vector
return(as.vector(itemsByCount[["ItemId"]]))
}
CountItems <- function(df) {
count(df, "ItemId")
}
# First parameter should have an ItemIds column with a singleton list with a vector
# Second should be the recommendations available to this user as a vector of item ids
RemoveSeenItems <- function(userRow, recommendations)
{
seenItems <- GetItemIdsVector(userRow)
# Remove from recommendations
setdiff(recommendations, seenItems)
}
# Expect the user ratings matrix and returns the
# most popular movies for each genre
MostPopularByGenre <- function(urm, movies) {
goodRatings <- FilterGoodRatings(urm)
# Expand movies by genre with a row per movie and gender
moviesByGenre <- ddply(movies, "MovieId", function(row) {
# Extract genre vector
Genre <- row$Genre[[1]]
data.frame(Genre)
})
# Anotate ratings with genre
ratingsWithGenre <- merge(goodRatings, moviesByGenre, by.x="ItemId", by.y="MovieId")
# Order Items by popularity for each Genre
itemPopularityByGenre <- ddply(ratingsWithGenre, "Genre", function(df) {
itemCount <- OrderItemsByCount(df)
data.frame(ItemId = itemCount)
})
# Aggregate into genre and vector
popularByGenre <- SummariseItemsBy(itemPopularityByGenre, "Genre")
}
# Seen items for evaluation or not
FilterSeenItems <- function(urm, forEvaluation) {
if (evaluation) {
seenItems = ItemsSeenByNonTestUsers(urm)
} else {
seenItems = ItemsSeenByTestUsers(urm)
}
}
ItemsSeenByTestUsers <- function(urm) {
testUserIds <- LoadTestUserIds()
testRatings <- merge(testUserIds, urm)
SummariseItemsBy(testRatings, "UserId")
}
ItemsSeenByNonTestUsers <- function(urm) {
testUserIds <- LoadTestUserIds()
nonTestRatings <- urm[! urm$UserId %in% testUserIds$UserId,]
SummariseItemsBy(nonTestRatings, "UserId")
}
LoadTestUserIds <- function() {
ReadCsvData("test")
}
# Aggregate items as a vector grouping by a column
SummariseItemsBy <- function(df, groupBy) {
ddply(df, groupBy, summarise, ItemIds = list(ItemId))
}
LoadURM <- function() {
# Get movie metadata
ReadCsvData("train")
}
LoadMovies <- function() {
# Get movie metadata
movies <- ReadCsvData("movieMetaCorrected")
# Split genre string by '|' grouping by MovieId
movies$Genre <- by(movies, movies$MovieId, FUN=function(row) {
strsplit(as.character(row$Genre), '|', fixed=TRUE)
})
return(movies)
}
ReadCsvData <- function(filename) {
#Deafult separator is space
completeFilename <- paste("data/",filename,".csv",sep='')
read.csv(completeFilename)
}
# Function used to generate the recommendation data frame
# seenItems: Items already seen by the user
# recommendationFunction: receiving a user row
# submission: boolean to indicated wheter to prepare for submission or evaluation
GenerateRecommendations <- function(seenItems, recommendationFunction, submission) {
recommendations <- ddply(seenItems, "UserId", function(row) {
ItemId <- recommend(row)
if (length(ItemId) == 0)
{
ItemId = c(1)
}
data.frame(ItemId)
})
# Aggregating recommendations per user
# As a space separated string for submission
if(submission)
{
ddply(recommendations, "UserId", summarise,
RecommendedMovieIds = paste(ItemId, collapse = " "))
} else {
ddply(recommendations, "UserId", summarise,
ItemIds = list(ItemId))
}
}
# df is a data frame with a "ItemIds" Column as a list with a single vector element
GetItemIdsVector <- function(userRow) {
userRow$ItemIds[[1]]
}
# print map or generate submission file
GenerateOutput <- function(urm, recommendedPerUser, forEvaluation) {
if(forEvaluation) {
CalculateMap(urm, recommendedPerUser)
} else {
WriteSubmission(recommendedPerUser)
}
}
# Print the MAP at the indicated number of recommendations
CalculateMap <- function(urm, recommendedPerUser) {
# Filter relevant items per user
goodRatings <- FilterGoodRatings(urm)
relevantItems <- ItemsSeenByNonTestUsers(goodRatings)
relevantItemsPerUser = dlply(relevantItems, "UserId", GetItemIdsVector)
# Aggregate predictions per user
predictedPerUser = dlply(recommendedPerUser, "UserId", GetItemIdsVector)
# Compute number of recommendations done
samplerecommendation = GetItemIdsVector(recommendedPerUser[1,])
numberOfRecommendations = length(samplerecommendation)
# Mean average precision from metrics package
mapk(numberOfRecommendations,relevantItemsPerUser, predictedPerUser)
}
WriteSubmission <- function(recommendedPerUser) {
# submission in output format
submission <- recommendedPerUser
# Drop vector of Ids for the output
submission$ItemIds <- NULL
write.table(submission,
file="submissions/submission.csv",
sep=",", quote=FALSE, row.names=FALSE)
}