forked from lgalke/gnn-pretraining-evaluation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinfo.py
executable file
·105 lines (78 loc) · 3.09 KB
/
info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
""" Print dataset info with subgraph train-test split """
import argparse
import torch
import dgl
from dgl.data import register_data_args, load_data
def main(args):
    """Print node/edge statistics for a dataset under a subgraph train-test split.

    The validation nodes are merged into the training set. The training
    graph is whatever ``g.subgraph`` builds from the training nodes; nodes and
    (non-self-loop) edges of the full graph that are not part of it are
    reported as "unseen".

    Args:
        args: Parsed command-line namespace. It is passed to
            ``dgl.data.load_data`` and must also provide ``dataset`` (shown
            in the report header) and ``invert``. With ``invert`` set
            (setting 'B') the complement of train|val becomes the training
            set and the former train|val set becomes the test set; otherwise
            (setting 'A') the dataset's own test mask is kept.

    Returns:
        None. All results are written to stdout.
    """
    data = load_data(args)
    features = torch.FloatTensor(data.features)
    # Use boolean masks: on uint8 tensors `~` is a bitwise NOT in current
    # PyTorch (1 -> 254, 0 -> 255), which would make every entry of the
    # inverted mask nonzero instead of flipping it.
    train_mask = torch.as_tensor(data.train_mask, dtype=torch.bool)
    val_mask = torch.as_tensor(data.val_mask, dtype=torch.bool)
    test_mask = torch.as_tensor(data.test_mask, dtype=torch.bool)
    in_feats = features.shape[1]
    n_classes = data.num_labels

    # We don't use a validation set, so fold it into the training set.
    train_mask = train_mask | val_mask

    if args.invert:
        # This is different from swapping the train and test masks,
        # because train | test does not cover the whole dataset.
        train_mask, test_mask = ~train_mask, train_mask
        setting = 'B'
    else:
        setting = 'A'

    g = dgl.DGLGraph(data.graph)
    all_nodes = g.nodes()
    # .item() keeps every derived edge count a plain Python int.
    self_edges = g.has_edges_between(all_nodes, all_nodes).sum().item()
    print("Orig data has %d self edges" % self_edges)
    n_edges = g.number_of_edges() - self_edges  # Don't count self edges here

    # The graph is symmetric when every edge (u, v) has a reverse edge (v, u).
    # Like the self-edge count above, this treats the result of
    # has_edges_between as one 0/1 flag per queried pair.
    src, dst = g.all_edges()
    reverse_hits = g.has_edges_between(dst, src).sum().item()
    is_symmetric = reverse_hits == len(src)
    print("Is symmetric:", is_symmetric)
    if not is_symmetric:
        print("WARN the input graph is non-symmetric")

    train_nodes = torch.arange(g.number_of_nodes())[train_mask]
    g_train = g.subgraph(train_nodes)
    # Sanity check: the training subgraph must not compare equal to the
    # full graph.
    assert g != g_train
    g_train.set_n_initializer(dgl.init.zero_initializer)

    train_all_nodes = g_train.nodes()
    self_edges_train = g_train.has_edges_between(
        train_all_nodes, train_all_nodes).sum().item()
    print("Self edges in train set", self_edges_train)
    n_edges_train = g_train.number_of_edges() - self_edges_train

    n_nodes = g.number_of_nodes()
    n_nodes_train = g_train.number_of_nodes()
    unseen_nodes = n_nodes - n_nodes_train
    # Only real unseen edges, not the included train edges
    unseen_edges = n_edges - n_edges_train

    # The "undirected" figures halve the edge counts.
    print("""---- Data statistics: %s, Setting %s----
#Full Graph Nodes %d
#Full Graph Edges %d (undirected: %d)
#Classes %d
#Features %d
#Train samples %d
#Train edges %d (undirected: %d)
#Unseen nodes %d
#Unseen edges %d (undirected: %d)
#Test nodes %d
#Label rate %.3f""" %
          (args.dataset,
           setting,
           n_nodes, n_edges, n_edges // 2,
           n_classes, in_feats,
           n_nodes_train,
           n_edges_train, n_edges_train // 2,
           unseen_nodes,
           unseen_edges, unseen_edges // 2,
           test_mask.sum().item(),
           n_nodes_train / n_nodes
           )
          )
if __name__ == '__main__':
    # Command-line entry point: register_data_args (from dgl.data) adds its
    # own arguments to the parser, and --invert is this script's extra flag.
    cli = argparse.ArgumentParser()
    register_data_args(cli)
    cli.add_argument(
        '--invert',
        action='store_true',
        default=False,
        help="Invert train and test set",
    )
    main(cli.parse_args())