Commit

Merge branch 'release/v0.6.0'
evfro committed Jun 3, 2018
2 parents addb914 + 90d3b2b commit 15530ec
Showing 15 changed files with 400 additions and 191 deletions.
31 changes: 24 additions & 7 deletions polara/datasets/movielens.py
@@ -1,4 +1,5 @@
from io import BytesIO
import numpy as np
import pandas as pd

try:
@@ -8,9 +9,14 @@


def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
                       split_genres=True, mdb_mapping=False):
                       split_genres=True, mdb_mapping=False, get_tags=False, include_time=False):
    '''Downloads MovieLens data and stores it in a pandas dataframe.
    '''
    fields = ['userid', 'movieid', 'rating']

    if include_time:
        fields.append('timestamp')

    if not local_file:
        # downloading data
        from requests import get
@@ -20,7 +26,7 @@ def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
    else:
        zip_contents = local_file

    ml_data = ml_genres = mapping = None
    ml_data = ml_genres = ml_tags = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
@@ -33,9 +39,7 @@ def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
                zdata = zdata.replace(b'::', delimiter.encode())
                # makes data compatible with pandas c-engine
                # returns string objects instead of bytes in that case
            ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header, engine='c',
                                  names=['userid', 'movieid', 'rating', 'timestamp'],
                                  usecols=['userid', 'movieid', 'rating'])
            ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header, engine='c', names=fields, usecols=fields)

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
@@ -51,14 +55,27 @@ def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,

            ml_genres = get_split_genres(genres_data) if split_genres else genres_data

        if get_tags:
            zip_file = zip_files[zip_files.str.contains('/tags')].iat[0]  # not genome
            zdata = zfile.read(zip_file)
            if not is_new_format:
                # make data compatible with pandas c-engine
                # pandas returns string objects instead of bytes in that case
                delimiter = '^'
                zdata = zdata.replace(b'::', delimiter.encode())
            fields[2] = 'tag'
            ml_tags = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header,
                                  engine='c', encoding='latin1',
                                  names=fields, usecols=range(len(fields)))

        if mdb_mapping and is_new_format:
            # imdb and tmdb mapping - exists only in ml-latest or 20m datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata, sep=',', header=0, engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [data for data in [ml_data, ml_genres, mapping] if data is not None]
    res = [data for data in [ml_data, ml_genres, ml_tags, mapping] if data is not None]
    if len(res)==1: res = res[0]
    return res

@@ -75,7 +92,7 @@ def filter_short_head(data, threshold=0.01):
    short_head.sort_values(ascending=False, inplace=True)

    ratings_perc = short_head.cumsum()*1.0/short_head.sum()
    movies_perc = pd.np.arange(1, len(short_head)+1, dtype=pd.np.float64) / len(short_head)
    movies_perc = np.arange(1, len(short_head)+1, dtype='f8') / len(short_head)

    long_tail_movies = ratings_perc[movies_perc > threshold].index
    return long_tail_movies
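
Editor's note: the snippet below is a usage sketch for the extended loader, not part of the commit. The 'ml-10m.zip' path is hypothetical; any old-format MovieLens archive with ratings and tags files should behave the same. With both ratings and tags requested, the function returns the available dataframes as a list in the order [ratings, genres, tags, mapping], skipping the ones not requested.

from polara.datasets.movielens import get_movielens_data

# hypothetical local archive path
ratings, tags = get_movielens_data(local_file='ml-10m.zip',
                                   get_ratings=True,
                                   get_tags=True,
                                   include_time=True)
# include_time=True adds the 'timestamp' column to the ratings frame
print(ratings.columns.tolist())  # ['userid', 'movieid', 'rating', 'timestamp']
print(tags.columns.tolist())     # ['userid', 'movieid', 'tag', 'timestamp']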
29 changes: 21 additions & 8 deletions polara/evaluation/evaluation_engine.py
@@ -5,9 +5,23 @@
except NameError:
    pass

from math import sqrt
import pandas as pd


def sample_ci(df, coef=2.776, level=None):  # 95% CI for sample under Student's t-test
    # http://www.stat.yale.edu/Courses/1997-98/101/confint.htm
    # example from http://onlinestatbook.com/2/estimation/mean.html
    nlevels = df.index.nlevels
    if (nlevels == 1) & (level is None):
        n = df.shape[0]
    elif (nlevels == 2) & (level is not None):
        n = df.index.levshape[1-level]
    else:
        raise ValueError
    return coef * df.std(level=level, ddof=1) / sqrt(n)


def save_scores(scores, dataset_name, experiment_name, save_folder=None):
    experiment_keys = scores.keys()
    save_folder = save_folder or 'results'
@@ -49,9 +63,10 @@ def set_topk(models, topk):
        model.topk = topk


def build_models(models):
def build_models(models, force=True):
    for model in models:
        model.build()
        if not model._is_ready or force:
            model.build()


def consolidate(scores, params, metrics):
@@ -90,14 +105,13 @@ def holdout_test_pair(model1, model2, holdout_sizes=[1], metrics=['hits']):
    return consolidate(holdout_scores, holdout_sizes, metrics)


def holdout_test(models, holdout_sizes=[1], metrics=['hits']):
def holdout_test(models, holdout_sizes=[1], metrics=['hits'], force_build=True):
    holdout_scores = []
    data = models[0].data
    assert all([model.data is data for model in models[1:]])  # check that data is shared across models

    build_models(models)
    build_models(models, force_build)
    for i in holdout_sizes:
        print(i, end=' ')
        data.holdout_size = i
        data.update()

@@ -107,17 +121,16 @@ def holdout_test(models, holdout_sizes=[1], metrics=['hits']):
    return consolidate(holdout_scores, holdout_sizes, metrics)


def topk_test(models, topk_list=[10], metrics=['hits']):
def topk_test(models, topk_list=[10], metrics=['hits'], force_build=True):
    topk_scores = []
    data = models[0].data
    assert all([model.data is data for model in models[1:]])  # check that data is shared across models

    data.update()
    topk_list = list(reversed(sorted(topk_list)))  # start from max topk and rollback

    build_models(models)
    build_models(models, force_build)
    for topk in topk_list:
        print(topk, end=' ')
        metric_scores = evaluate_models(models, metrics, topk)
        topk_scores.append(metric_scores)

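Editor's note: the new sample_ci returns the half-width of a Student-t confidence interval, coef * std(ddof=1) / sqrt(n). A toy sanity check, not from the commit (the metric values are illustrative):

import pandas as pd
from polara.evaluation.evaluation_engine import sample_ci

# five repeated runs of one metric; with a flat index, n is the row count
scores = pd.DataFrame({'hits': [0.31, 0.28, 0.35, 0.30, 0.33]})
ci = sample_ci(scores)  # default coef=2.776 is the 95% t-value for df=4
print(ci['hits'])       # 2.776 * scores['hits'].std(ddof=1) / sqrt(5)

Relatedly, the new force_build flag lets repeated holdout_test or topk_test calls pass force_build=False so that models already marked ready via _is_ready are not rebuilt.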
34 changes: 34 additions & 0 deletions polara/evaluation/pipelines.py
@@ -0,0 +1,34 @@
from __future__ import print_function

from operator import mul as mul_op
from functools import reduce
from itertools import product
from random import choice


def random_chooser():
    while True:
        values = yield
        yield choice(values)


def random_grid(params, n=60, grid_cache=None):
    if not isinstance(n, int):
        raise TypeError('n must be an integer, not {}'.format(type(n)))
    if n < 0:
        raise ValueError('n should be >= 0')

    grid = grid_cache or set()
    max_n = reduce(mul_op, [len(vals) for vals in params.values()])
    n = min(n if n > 0 else max_n, max_n)
    param_chooser = random_chooser()
    try:
        while len(grid) < n:
            level_choice = []
            for v in params.values():
                next(param_chooser)
                level_choice.append(param_chooser.send(v))
            grid.add(tuple(level_choice))
    except KeyboardInterrupt:
        print('Interrupted by user. Providing current results.')
    return grid
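
Editor's note: random_grid samples distinct points from the Cartesian product of the parameter lists, stopping at n points (or at the full grid size if that is smaller). A usage sketch, with hypothetical parameter names:

from polara.evaluation.pipelines import random_grid

params = {'rank': [10, 20, 30], 'regularization': [0.01, 0.1, 1.0]}
grid = random_grid(params, n=4)  # 4 distinct points out of the 9 possible
for rank, reg in grid:
    print(rank, reg)  # tuple order follows the key order of params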
102 changes: 42 additions & 60 deletions polara/lib/hosvd.py
@@ -6,46 +6,40 @@
    pass

import numpy as np
from numba import jit


@jit(nopython=True, nogil=True)
def double_tensordot(idx, val, U, V, new_shape1, new_shape2, ten_mode0, ten_mode1, ten_mode2, res):
    I = idx.shape[0]
    J = new_shape1
    K = new_shape2
    for i in range(I):
        i0 = idx[i, ten_mode0]
        i1 = idx[i, ten_mode1]
        i2 = idx[i, ten_mode2]
        for j in range(J):
            for k in range(K):
                res[i0, j, k] += val[i] * U[i1, j] * V[i2, k]


def tensordot2(idx, val, shape, U, V, modes):
    ten_mode1, mat_mode1 = modes[0]
    ten_mode2, mat_mode2 = modes[1]

    ten_mode0, = [x for x in (0, 1, 2) if x not in (ten_mode1, ten_mode2)]
    new_shape = (shape[ten_mode0], U.shape[1-mat_mode1], V.shape[1-mat_mode2])
    res = np.zeros(new_shape)

    if mat_mode1 == 1:
        vU = U.T
    else:
        vU = U

    if mat_mode2 == 1:
        vV = V.T
    else:
        vV = V

    double_tensordot(idx, val, vU, vV, new_shape[1], new_shape[2], ten_mode0, ten_mode1, ten_mode2, res)
from scipy.sparse.linalg import svds
from numba import njit


@njit(nogil=True)
def double_tensordot(idx, val, u, v, mode0, mode1, mode2, res):
    new_shape1 = u.shape[1]
    new_shape2 = v.shape[1]
    for i in range(len(val)):
        i0 = idx[i, mode0]
        i1 = idx[i, mode1]
        i2 = idx[i, mode2]
        vi = val[i]
        for j in range(new_shape1):
            for k in range(new_shape2):
                res[i0, j, k] += vi * u[i1, j] * v[i2, k]


def tensordot2(idx, val, shape, U, V, modes, dtype=None):
    mode1, mat_mode1 = modes[0]
    mode2, mat_mode2 = modes[1]

    u = U.T if mat_mode1 == 1 else U
    v = V.T if mat_mode2 == 1 else V

    mode0, = [x for x in (0, 1, 2) if x not in (mode1, mode2)]
    new_shape = (shape[mode0], U.shape[1-mat_mode1], V.shape[1-mat_mode2])

    res = np.zeros(new_shape, dtype=dtype)
    double_tensordot(idx, val, u, v, mode0, mode1, mode2, res)
    return res


def tucker_als(idx, val, shape, core_shape, iters=25, growth_tol=0.01, batch_run=False):
def tucker_als(idx, val, shape, core_shape, iters=25, growth_tol=0.01, batch_run=False, seed=None):
    '''
    The function computes the Tucker ALS decomposition of a sparse tensor
    provided in COO format. Usage:
@@ -55,53 +49,41 @@ def log_status(msg):
        if not batch_run:
            print(msg)

    if not (idx.flags.c_contiguous and val.flags.c_contiguous):
        raise ValueError('Input arrays must be C-contiguous.')

    # TODO: implement this check in the future
    # if np.any((idx[1:, 0] - idx[:-1, 0]) < 0):
    #     print('Warning! Index array must be sorted by first column in ascending order.')
    random_state = np.random if seed is None else np.random.RandomState(seed)

    r0, r1, r2 = core_shape

    u1 = np.random.rand(shape[1], r1)
    u1 = random_state.rand(shape[1], r1)
    u1 = np.linalg.qr(u1, mode='reduced')[0]

    u2 = np.random.rand(shape[2], r2)
    u2 = random_state.rand(shape[2], r2)
    u2 = np.linalg.qr(u2, mode='reduced')[0]

    u1 = np.ascontiguousarray(u1)
    u2 = np.ascontiguousarray(u2)

    g_norm_old = 0

    for i in range(iters):
        log_status('Step %i of %i' % (i+1, iters))
        u0 = tensordot2(idx, val, shape, u2, u1, ((2, 0), (1, 0)))\
             .reshape(shape[0], r1*r2)
        uu = np.linalg.svd(u0, full_matrices=0)[0]
        u0 = np.ascontiguousarray(uu[:, :r0])
        uu = svds(u0, k=r0, return_singular_vectors='u')[0]
        u0 = np.ascontiguousarray(uu[:, ::-1])

        u1 = tensordot2(idx, val, shape, u2, u0, ((2, 0), (0, 0)))\
             .reshape(shape[1], r0*r2)
        uu = np.linalg.svd(u1, full_matrices=0)[0]
        u1 = np.ascontiguousarray(uu[:, :r1])
        uu = svds(u1, k=r1, return_singular_vectors='u')[0]
        u1 = np.ascontiguousarray(uu[:, ::-1])

        u2 = tensordot2(idx, val, shape, u1, u0, ((1, 0), (0, 0)))\
             .reshape(shape[2], r0*r1)
        uu, ss, vv = np.linalg.svd(u2, full_matrices=0)
        u2 = np.ascontiguousarray(uu[:, :r2])
        uu, ss, vv = svds(u2, k=r2)
        u2 = np.ascontiguousarray(uu[:, ::-1])

        g_norm_new = np.linalg.norm(ss[:r2])
        g_norm_new = np.linalg.norm(ss)
        g_growth = (g_norm_new - g_norm_old) / g_norm_new
        g_norm_old = g_norm_new
        log_status('growth of the core: %f' % g_growth)
        if g_growth < growth_tol:
            log_status('Core is no longer growing. Norm of the core: %f' % g_norm_old)
            break

    g = ss[:r2, np.newaxis] * vv[:r2, :]
    g = np.ascontiguousarray((ss[:, np.newaxis] * vv)[::-1, :])
    g = g.reshape(r2, r1, r0).transpose(2, 1, 0)
    log_status('Done')
    return u0, u1, u2, g
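
Editor's note: a minimal smoke test for the updated tucker_als, not part of the commit. It assumes only what is visible above: COO input as a C-contiguous nnz-by-3 index array plus a value array, and the new seed argument for reproducible initialization. The random tensor here is synthetic.

import numpy as np
from polara.lib.hosvd import tucker_als

shape = (20, 15, 10)
nnz = 200
rs = np.random.RandomState(0)
# one (i, j, k) index triple per row; tucker_als requires C-contiguous arrays
idx = np.ascontiguousarray(np.stack([rs.randint(0, s, nnz) for s in shape], axis=1))
val = np.ascontiguousarray(np.ones(nnz))

u0, u1, u2, g = tucker_als(idx, val, shape, core_shape=(4, 3, 2), iters=5, seed=42)
print(u0.shape, u1.shape, u2.shape, g.shape)  # (20, 4) (15, 3) (10, 2) (4, 3, 2)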
6 changes: 3 additions & 3 deletions polara/lib/optimize.py
@@ -1,7 +1,7 @@
import numpy as np
from numba import jit
from numba import njit

@jit(nopython=True, nogil=True)
@njit(nogil=True)
def sgd_step(users_idx, items_idx, feedbacks, P, Q, eta, lambd):
    cum_error = 0
    for k, a in enumerate(feedbacks):
@@ -22,7 +22,7 @@ def sgd_step(users_idx, items_idx, feedbacks, P, Q, eta, lambd):
        cum_error += e*e
    return cum_error

@jit(nopython=True, nogil=True)
@njit(nogil=True)
def sgd_step_biased(users_idx, items_idx, feedbacks, P, Q, b_user, b_item, mu, eta, lambd):
    cum_error = 0
    for k, a in enumerate(feedbacks):
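Editor's note: the kernel bodies are collapsed in this view, so the driver below is a sketch under the assumption, consistent with the visible lines, that sgd_step updates the factor matrices P and Q in place and returns the accumulated squared error. All data here is synthetic.

import numpy as np
from polara.lib.optimize import sgd_step

rs = np.random.RandomState(0)
users = rs.randint(0, 20, 100)           # user index per interaction
items = rs.randint(0, 30, 100)           # item index per interaction
ratings = rs.rand(100) * 5               # toy feedback values
P = rs.normal(scale=0.1, size=(20, 5))   # user factors, rank 5
Q = rs.normal(scale=0.1, size=(30, 5))   # item factors, rank 5

for epoch in range(10):
    sq_err = sgd_step(users, items, ratings, P, Q, 0.01, 0.1)  # eta, lambda
    print(epoch, np.sqrt(sq_err / len(ratings)))  # per-epoch RMSE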
