-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_utils.py
199 lines (170 loc) · 6.04 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
from copy import deepcopy
import random
from time import time
import torch.utils.data as data
def create_adj_mat(mat, user_num, item_num, path):
    """Build, normalize and save the user-item bipartite adjacency matrix.

    A square matrix ``A`` of side ``user_num + item_num`` is assembled as
    ``[[0, R], [R^T, 0]]`` with ``R = mat`` and then symmetrically
    normalized as ``D^-1/2 * A * D^-1/2``, where ``D`` is the diagonal
    matrix holding the row sums of ``A``.

    Args:
        mat: sparse interaction matrix; it is assigned into the
            ``(user_num, item_num)`` upper-right block, so it must have
            that shape.
        user_num: number of rows of ``mat``.
        item_num: number of columns of ``mat``.
        path: file prefix; the result is saved with ``scipy.sparse.save_npz``
            to ``path + '_s_pre_adj_mat.npz'``.

    Returns:
        The normalized adjacency matrix in CSR format.
    """
    t1 = time()
    size = user_num + item_num
    # LIL supports efficient block (slice) assignment, so build there directly.
    adj_mat = sp.lil_matrix((size, size), dtype=np.float32)
    adj_mat[:user_num, user_num:] = mat
    adj_mat[user_num:, :user_num] = mat.T
    adj_mat = adj_mat.tocsr()
    print('already create adjacency matrix', adj_mat.shape, time() - t1)

    t2 = time()
    rowsum = np.array(adj_mat.sum(1))
    # Rows that sum to zero (isolated nodes) yield inf here; that is expected
    # and handled on the next line, so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        d_inv = np.power(rowsum, -0.5).flatten()
    d_inv[np.isinf(d_inv)] = 0.
    d_mat_inv = sp.diags(d_inv)
    # D^-1/2 * A * D^-1/2
    norm_adj_mat = d_mat_inv.dot(adj_mat).dot(d_mat_inv).tocsr()
    print('generate single-normalized adjacency matrix.')
    print('already normalize adjacency matrix in %.4fs' % (time() - t2))

    sp.save_npz(path + '_s_pre_adj_mat.npz', norm_adj_mat)
    return norm_adj_mat
def _read_rating_frame(file_path):
    """Read a tab-separated rating file into a user/item/noisy DataFrame."""
    return pd.read_csv(
        file_path,
        sep='\t', header=None, names=['user', 'item', 'noisy'],
        usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.int32})


def _read_test_pairs(file_path, user_num, item_num):
    """Read ``user<TAB>item`` lines into a per-user dict and a DOK matrix."""
    items_by_user = {}
    mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    with open(file_path, 'r') as fd:
        for line in fd:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing on int('').
            if not line.strip():
                continue
            arr = line.split('\t')
            u = int(arr[0])
            i = int(arr[1])
            items_by_user.setdefault(u, []).append(i)
            mat[u, i] = 1.0
    return items_by_user, mat


def load_all(dataset, data_path):
    """Load the train / validation / test splits of a dataset.

    Four files are read, all named ``data_path + dataset + suffix`` with
    the suffixes ``.train.rating``, ``.valid.rating``, ``.test.negative``
    and ``.test.noisy``. ``data_path`` is concatenated as-is, so it must
    already end with a path separator.

    Args:
        dataset: dataset name used as the file-name stem.
        data_path: directory prefix of the data files.

    Returns:
        A 13-tuple, in this order:
        ``train_data_list`` (``[user, item]`` pairs),
        ``valid_data_list`` (``[user, item]`` pairs),
        ``test_data_pos`` (user -> items from ``.test.negative``),
        ``test_data_noisy`` (user -> items from ``.test.noisy``),
        ``user_pos`` (user -> items seen in train followed by validation),
        ``user_num``, ``item_num``,
        ``train_mat`` (DOK matrix with 1.0 at each training pair),
        ``valid_data_pos`` (user -> validation items),
        ``clean_test_mat``, ``noisy_test_mat`` (DOK matrices of the two
        test files),
        ``train_data_noisy``, ``valid_data_noisy`` (third column of the
        train / validation files, aligned with the pair lists).
    """
    train_rating = data_path + '{}.train.rating'.format(dataset)
    valid_rating = data_path + '{}.valid.rating'.format(dataset)
    test_negative = data_path + '{}.test.negative'.format(dataset)
    test_noisy = data_path + '{}.test.noisy'.format(dataset)

    ################# load training data #################
    train_frame = _read_rating_frame(train_rating)

    if dataset == "adressa":
        # Fixed sizes for this dataset rather than sizes inferred from the
        # training split (presumably some ids only occur in other splits --
        # TODO confirm).
        user_num = 212231
        item_num = 6596
    else:
        user_num = train_frame['user'].max() + 1
        item_num = train_frame['item'].max() + 1
    print("user, item num")
    print(user_num, item_num)

    # load ratings as a dok matrix
    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    train_data_list = []
    train_data_noisy = []
    for user, item, noisy in train_frame.values.tolist():
        train_mat[user, item] = 1.0
        train_data_list.append([user, item])
        train_data_noisy.append(noisy)

    ################# load validation data #################
    valid_frame = _read_rating_frame(valid_rating)
    valid_data_pos = {}
    valid_data_list = []
    valid_data_noisy = []
    for user, item, noisy in valid_frame.values.tolist():
        valid_data_pos.setdefault(user, []).append(item)
        valid_data_list.append([user, item])
        valid_data_noisy.append(noisy)

    # Items each user interacted with: training pairs first, then validation.
    user_pos = {}
    for user, item in train_data_list:
        user_pos.setdefault(user, []).append(item)
    for user, item in valid_data_list:
        user_pos.setdefault(user, []).append(item)

    ################# load testing data #################
    test_data_pos, clean_test_mat = _read_test_pairs(
        test_negative, user_num, item_num)
    test_data_noisy, noisy_test_mat = _read_test_pairs(
        test_noisy, user_num, item_num)

    return train_data_list, valid_data_list, test_data_pos, test_data_noisy, user_pos, user_num, \
        item_num, train_mat, valid_data_pos, clean_test_mat, noisy_test_mat, train_data_noisy, valid_data_noisy
class NCFData(data.Dataset):
    """Dataset yielding ``(user, item, label, noisy_label)`` samples.

    Note that the labels are only useful when training; they are therefore
    added in the ``ng_sample()`` function, which also draws the negative
    samples. Until ``ng_sample()`` is called every label is 0.

    ``is_training == 2`` denotes testing: no negative sampling is allowed
    and ``__getitem__`` serves the raw ``features`` only. Any other value
    serves the positives followed by the sampled negatives.
    """

    def __init__(self, features,
                 num_item, train_mat=None, NSR=0, is_training=0, noisy_or_not=None):
        """Store the positive pairs and the sampling configuration.

        Args:
            features: list of ``[user, item]`` positive pairs.
            num_item: negatives are drawn uniformly from ``range(num_item)``.
            train_mat: container supporting ``(user, item) in train_mat``;
                pairs found in it are never used as negatives.
            NSR: number of negatives sampled per positive pair.
            is_training: mode flag; the value 2 means testing.
            noisy_or_not: per-pair flags aligned with ``features``. Both
                ``ng_sample()`` and ``__getitem__`` require a list here
                even though the default is ``None``.
        """
        super(NCFData, self).__init__()
        self.features_ps = features
        self.features_fill = features
        self.noisy_or_not = noisy_or_not
        self.noisy_or_not_fill = noisy_or_not
        self.num_item = num_item
        self.train_mat = train_mat
        self.NSR = NSR
        self.is_training = is_training
        self.labels = [0 for _ in range(len(features))]
        self.labels_fill = self.labels

    def ng_sample(self):
        """Draw ``NSR`` fresh negatives per positive and rebuild the sample lists.

        Positives get label 1 and keep their ``noisy_or_not`` flag; sampled
        negatives get label 0 and flag 1.
        """
        assert self.is_training != 2, 'no need to sampling when testing'

        self.features_ng = []
        for x in self.features_ps:
            u = x[0]
            for _ in range(self.NSR):
                # Rejection sampling: redraw until the pair is not a known
                # interaction. This never terminates if the user has
                # interacted with every item.
                j = np.random.randint(self.num_item)
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1 for _ in range(len(self.features_ps))]
        labels_ng = [0 for _ in range(len(self.features_ng))]
        self.noisy_or_not_fill = self.noisy_or_not + [1 for _ in range(len(self.features_ng))]
        self.features_fill = self.features_ps + self.features_ng
        assert len(self.noisy_or_not_fill) == len(self.features_fill)
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        """Return the number of samples that ``__getitem__`` can serve.

        After ``ng_sample()`` this equals ``(NSR + 1) * len(features)``.
        Before it, or in test mode, only the positives exist, so reporting
        the larger number would make index-based iteration run out of range.
        """
        if self.is_training != 2:
            return len(self.features_fill)
        return len(self.features_ps)

    def __getitem__(self, idx):
        """Return ``(user, item, label, noisy_label)`` for sample ``idx``."""
        if self.is_training != 2:
            features = self.features_fill
            labels = self.labels_fill
            noisy_or_not = self.noisy_or_not_fill
        else:
            features = self.features_ps
            labels = self.labels
            noisy_or_not = self.noisy_or_not

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        noisy_label = noisy_or_not[idx]
        return user, item, label, noisy_label