-
Notifications
You must be signed in to change notification settings - Fork 5
/
NeighAggre_main.py
102 lines (85 loc) · 4.45 KB
/
NeighAggre_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from __future__ import division
from __future__ import print_function
import os
import glob
import time
import random
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import pickle
from utils import load_data, accuracy, new_load_data
from GCN_models import GCN
from sklearn.utils import shuffle
from evaluation import RECALL_NDCG
# Blank out the visible CUDA device list before any CUDA call is made
# (presumably to force CPU execution -- confirm before enabling GPUs).
os.environ['CUDA_VISIBLE_DEVICES'] = ' '

# Name of this baseline; also used as the output sub-directory under ./features.
method_name = 'NeighAggre'
# Fraction of all nodes whose true features may be used for aggregation.
train_fts_ratio = 0.4*0.2
# Cut-offs at which Recall@K / NDCG@K are reported.
topK_list = [10, 20, 50]

# Training settings
parser = argparse.ArgumentParser()
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='Disables CUDA training.')
# The help text lists 'pubmed' as well: it is the default and has its own
# evaluation branch further down the script.
parser.add_argument('--dataset', type=str, default='pubmed',
                    help='cora, citeseer, steam, pubmed')
parser.add_argument('--seed', type=int, default=72, help='Random seed.')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Seed every random number generator the script touches so runs are repeatable.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
# Load the graph and the ground-truth node features.
print('loading dataset: {}'.format(args.dataset))
# Note: node_class_lbls and the three discarded return values
# (node_idx_train, node_idx_val, node_idx_test) are only used for evaluation.
adj, true_features, node_class_lbls, _, _, _ = new_load_data(
    args.dataset, norm_adj=False, generative_flag=True)
adj = adj.to_dense()

# Optional dumps of the raw inputs (disabled):
# pickle.dump(adj.numpy(), open(os.path.join(os.getcwd(), 'features', 'NeighAggre',
#                                            '{}_sp_adj.pkl'.format(args.dataset)), 'wb'))
#
# pickle.dump(true_features.data.numpy(), open(os.path.join(os.getcwd(), 'features', 'NeighAggre',
#                                                           '{}_true_features.pkl'.format(args.dataset)), 'wb'))

# Generate one-hot features for all nodes (a sparse identity matrix), which
# means no real node feature is used.
num_nodes = adj.shape[0]
diag_positions = np.arange(num_nodes)
indices = torch.LongTensor(np.stack([diag_positions, diag_positions], axis=0))
values = torch.FloatTensor(np.ones(indices.shape[1]))
features = torch.sparse.FloatTensor(indices, values, torch.Size([num_nodes, num_nodes]))
# Split the nodes into the subset whose features are known (train) and the
# subsets whose features have to be generated (validation / test).
#   train: first train_fts_ratio * N shuffled nodes
#   vali : shuffled nodes in [0.4 * N, 0.5 * N)
#   test : shuffled nodes in [0.5 * N, N)
# Shuffled nodes between train_fts_ratio * N and 0.4 * N belong to no split.
shuffled_nodes = shuffle(np.arange(adj.shape[0]), random_state=args.seed)
train_fts_idx = torch.LongTensor(shuffled_nodes[:int(train_fts_ratio*adj.shape[0])])
vali_fts_idx = torch.LongTensor(shuffled_nodes[int(0.4*adj.shape[0]):int((0.4+0.1)*adj.shape[0])])
test_fts_idx = torch.LongTensor(shuffled_nodes[int((0.4+0.1)*adj.shape[0]):])

# Persist the three index tensors. Make sure the target directory exists and
# close every file handle deterministically instead of leaking it.
split_dir = os.path.join(os.getcwd(), 'features', method_name)
if not os.path.isdir(split_dir):
    os.makedirs(split_dir)
for split_name, split_idx in (('train', train_fts_idx),
                              ('vali', vali_fts_idx),
                              ('test', test_fts_idx)):
    split_path = os.path.join(split_dir, '{}_{}_{}_fts_idx.pkl'.format(
        args.dataset, train_fts_ratio, split_name))
    with open(split_path, 'wb') as split_file:
        pickle.dump(split_idx, split_file)
# Find the neighbours of every test node and aggregate their raw features.
# Only train nodes have known features, so edges to every other node are
# zeroed out before the mean is taken.
# Allocate only the (num_test, N) slice that is needed, with the dtype and
# device of adj, rather than a full N x N zero matrix that is then sliced.
mask_adj = adj.new_zeros((test_fts_idx.shape[0], adj.shape[1]))
mask_adj[:, train_fts_idx] = adj[test_fts_idx, :][:, train_fts_idx]

# Mean over the train neighbours; the tiny epsilon keeps test nodes without any
# train neighbour from dividing by zero (their aggregated features stay zero).
neighbour_count = torch.reshape(mask_adj.sum(1)+1e-24, shape=[-1, 1])
aggregation_fts = torch.mm(mask_adj, true_features)/neighbour_count

# Full-size feature matrix: generated rows for test nodes, zeros everywhere else.
save_fts = torch.zeros_like(true_features)
save_fts[test_fts_idx] = aggregation_fts

print('Saving generated features and true features......')
gene_fts_path = os.path.join(os.getcwd(), 'features', method_name,
                             'gene_fts_train_ratio_{}_{}.pkl'.format(args.dataset, train_fts_ratio))
# Use a context manager so the file is flushed and closed before evaluation starts.
with open(gene_fts_path, 'wb') as gene_fts_file:
    pickle.dump(save_fts, gene_fts_file)
print('test for label propagation......')

# Convert predictions and ground truth for the test nodes to NumPy arrays.
# Tensor.cpu() leaves a tensor that already lives on the CPU unchanged, so one
# code path covers both the CUDA and the CPU case.
aggregation_fts = aggregation_fts.cpu().numpy()
gt_fts = true_features[test_fts_idx].cpu().numpy()

if args.dataset in ['cora', 'citeseer', 'steam']:
    # These datasets are scored with ranking metrics at each cut-off in topK_list.
    for topK in topK_list:
        avg_recall, avg_ndcg = RECALL_NDCG(aggregation_fts, gt_fts, topN=topK)
        print('tpoK: {}, recall: {}, ndcg: {}'.format(topK, avg_recall, avg_ndcg))
elif args.dataset in ['pubmed']:
    # pubmed is scored with the mean, over test nodes, of the per-row L2 error
    # divided by the L2 norm of the true feature row.
    row_error = np.linalg.norm(aggregation_fts - gt_fts, axis=1)
    row_scale = np.linalg.norm(gt_fts, axis=1)
    NL2 = np.mean(row_error/(row_scale))
    print('normalized L2 distance: {:.8f}'.format(NL2))

print('method: {}, dataset: {}, ratio: {}'.format(method_name, args.dataset, train_fts_ratio))