From b347590a3713b12467d5f9fe19696d187a4158cc Mon Sep 17 00:00:00 2001 From: "xiang song(charlie.song)" Date: Sat, 1 Aug 2020 19:25:50 +0800 Subject: [PATCH] [Dataset] Citation graph (#1902) * citation graph * GCN example use new citation dataset * mxnet gat * trigger * Fix * Fix gat * fix * Fix tensorflow dgi * Fix appnp, graphsage for mxnet * fix monet and sgc for mxnet * Fix tagcn * update sgc, appnp Co-authored-by: Ubuntu --- examples/mxnet/appnp/appnp.py | 52 +- examples/mxnet/gat/train.py | 57 +- examples/mxnet/gcn/gcn_concat.py | 52 +- examples/mxnet/gcn/train.py | 52 +- examples/mxnet/graphsage/main.py | 52 +- examples/mxnet/monet/citation.py | 59 +- examples/mxnet/sgc/sgc.py | 50 +- examples/mxnet/tagcn/train.py | 55 +- examples/pytorch/appnp/train.py | 55 +- examples/pytorch/gat/train.py | 59 +- examples/pytorch/gcn/gcn_mp.py | 54 +- examples/pytorch/gcn/train.py | 56 +- examples/pytorch/sgc/sgc.py | 52 +- examples/tensorflow/dgi/train.py | 41 +- examples/tensorflow/gat/README.md | 1 + examples/tensorflow/gat/train.py | 40 +- examples/tensorflow/gcn/README.md | 5 +- examples/tensorflow/gcn/gcn_builtin.py | 37 +- examples/tensorflow/gcn/gcn_mp.py | 44 +- examples/tensorflow/gcn/train.py | 36 +- python/dgl/data/__init__.py | 4 +- python/dgl/data/citation_graph.py | 841 +++++++++++++++++++------ python/dgl/data/rdf.py | 49 +- 23 files changed, 1157 insertions(+), 646 deletions(-) diff --git a/examples/mxnet/appnp/appnp.py b/examples/mxnet/appnp/appnp.py index 82d045992eb8..c1dc3e548c5c 100644 --- a/examples/mxnet/appnp/appnp.py +++ b/examples/mxnet/appnp/appnp.py @@ -4,8 +4,9 @@ import mxnet as mx from mxnet import nd, gluon from mxnet.gluon import nn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgl.nn.mxnet.conv import APPNPConv class APPNP(nn.Block): @@ -57,13 +58,29 @@ def 
evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = nd.array(data.features) - labels = nd.array(data.labels) - train_mask = nd.array(data.train_mask) - val_mask = nd.array(data.val_mask) - test_mask = nd.array(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -78,24 +95,9 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - ctx = mx.cpu() - else: - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # graph preprocess and calculate normalization factor - g = DGLGraph(data.graph) - n_edges = g.number_of_edges() # add self loop - g.add_edges(g.nodes(), g.nodes()) - g.set_n_initializer(dgl.init.zero_initializer) - g.set_e_initializer(dgl.init.zero_initializer) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # create APPNP model model = APPNP(g, diff --git a/examples/mxnet/gat/train.py b/examples/mxnet/gat/train.py index e8f82a4e065d..4981a83f4b9f 100644 --- a/examples/mxnet/gat/train.py +++ b/examples/mxnet/gat/train.py @@ -14,8 +14,10 @@ import mxnet as mx from mxnet import gluon import numpy as np -from dgl import 
DGLGraph -from dgl.data import register_data_args, load_data + +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from gat import GAT from utils import EarlyStopping @@ -34,33 +36,38 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) - features = mx.nd.array(data.features) - labels = mx.nd.array(data.labels) - mask = mx.nd.array(np.where(data.train_mask == 1)) - test_mask = mx.nd.array(np.where(data.test_mask == 1)) - val_mask = mx.nd.array(np.where(data.val_mask == 1)) + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + mask = g.ndata['train_mask'] + mask = mx.nd.array(np.nonzero(mask.asnumpy())[0], ctx=ctx) + val_mask = g.ndata['val_mask'] + val_mask = mx.nd.array(np.nonzero(val_mask.asnumpy())[0], ctx=ctx) + test_mask = g.ndata['test_mask'] + test_mask = mx.nd.array(np.nonzero(test_mask.asnumpy())[0], ctx=ctx) in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() - if args.gpu < 0: - ctx = mx.cpu() - else: - ctx = mx.gpu(args.gpu) - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - mask = mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - # create graph - g = data.graph - # add self-loop - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) - g.add_edges(g.nodes(), g.nodes()) - g = g.to(ctx) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # 
create model heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads] model = GAT(g, @@ -100,7 +107,7 @@ def main(args): val_accuracy = evaluate(model, features, labels, val_mask) print("Validation Accuracy {:.4f}".format(val_accuracy)) if args.early_stop: - if stopper.step(val_accuracy, model): + if stopper.step(val_accuracy, model): break print() diff --git a/examples/mxnet/gcn/gcn_concat.py b/examples/mxnet/gcn/gcn_concat.py index 8ee88501a9d5..1d9efd1dcda9 100644 --- a/examples/mxnet/gcn/gcn_concat.py +++ b/examples/mxnet/gcn/gcn_concat.py @@ -11,8 +11,8 @@ from mxnet import gluon import dgl import dgl.function as fn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset class GCNLayer(gluon.Block): @@ -75,16 +75,29 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) - if args.self_loop: - data.graph.add_edges_from([(i,i) for i in range(len(data.graph))]) + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) - features = mx.nd.array(data.features) - labels = mx.nd.array(data.labels) - train_mask = mx.nd.array(data.train_mask) - val_mask = mx.nd.array(data.val_mask) - test_mask = mx.nd.array(data.test_mask) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -99,21 +112,10 
@@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - cuda = False - ctx = mx.cpu(0) - else: - cuda = True - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # create GCN model - g = DGLGraph(data.graph) + # add self loop + if args.self_loop: + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # normalization in_degs = g.in_degrees().astype('float32') out_degs = g.out_degrees().astype('float32') diff --git a/examples/mxnet/gcn/train.py b/examples/mxnet/gcn/train.py index 7daf2f024f51..ed9a1c1e2faa 100644 --- a/examples/mxnet/gcn/train.py +++ b/examples/mxnet/gcn/train.py @@ -6,7 +6,8 @@ from mxnet import gluon import dgl -from dgl.data import register_data_args, load_data +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from gcn import GCN #from gcn_mp import GCN @@ -19,13 +20,29 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = mx.nd.array(data.features) - labels = mx.nd.array(data.labels) - train_mask = mx.nd.array(data.train_mask) - val_mask = mx.nd.array(data.val_mask) - test_mask = mx.nd.array(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = 
g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -40,25 +57,10 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - cuda = False - ctx = mx.cpu(0) - else: - cuda = True - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # create GCN model - g = data.graph + # add self loop if args.self_loop: - g.remove_edges_from(nx.selfloop_edges(g)) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = dgl.graph(g).to(ctx) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # normalization degs = g.in_degrees().astype('float32') norm = mx.nd.power(degs, -0.5) diff --git a/examples/mxnet/graphsage/main.py b/examples/mxnet/graphsage/main.py index a04db488e058..d646cc656379 100644 --- a/examples/mxnet/graphsage/main.py +++ b/examples/mxnet/graphsage/main.py @@ -11,8 +11,9 @@ import mxnet as mx from mxnet import nd, gluon from mxnet.gluon import nn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgl.nn.mxnet.conv import SAGEConv @@ -52,13 +53,29 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = nd.array(data.features) - labels = nd.array(data.labels) - train_mask = nd.array(data.train_mask) - val_mask = nd.array(data.val_mask) - test_mask = nd.array(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if 
args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -73,22 +90,9 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - ctx = mx.cpu(0) - else: - ctx = mx.gpu(args.gpu) - print("use cuda:", args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # graph preprocess and calculate normalization factor - g = data.graph - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) + # add self loop + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # create GraphSAGE model diff --git a/examples/mxnet/monet/citation.py b/examples/mxnet/monet/citation.py index 214ae61b0290..a544e95d55db 100644 --- a/examples/mxnet/monet/citation.py +++ b/examples/mxnet/monet/citation.py @@ -5,8 +5,9 @@ import mxnet as mx from mxnet import gluon, nd from mxnet.gluon import nn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgl.nn.mxnet.conv import GMMConv @@ -59,13 +60,29 @@ def evaluate(model, features, pseudo, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = nd.array(data.features) - labels = nd.array(data.labels) - train_mask = nd.array(data.train_mask) - val_mask = nd.array(data.val_mask) - test_mask = nd.array(data.test_mask) + if args.dataset == 'cora': + data = 
CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -80,29 +97,19 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - ctx = mx.cpu(0) - else: - ctx = mx.gpu(args.gpu) - print("use cuda:", args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # graph preprocess and calculate normalization factor - g = data.graph - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) + # add self loop + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) + n_edges = g.number_of_edges() us, vs = g.edges() + us = us.asnumpy() + vs = vs.asnumpy() pseudo = [] for i in range(g.number_of_edges()): pseudo.append([ - 1 / np.sqrt(g.in_degree(us[i].asscalar())), - 1 / np.sqrt(g.in_degree(vs[i].asscalar())) + 1 / np.sqrt(g.in_degree(us[i])), + 1 / np.sqrt(g.in_degree(vs[i])) ]) pseudo = nd.array(pseudo, ctx=ctx) diff --git a/examples/mxnet/sgc/sgc.py b/examples/mxnet/sgc/sgc.py index fae6595ea99b..0e7e8bd06d7e 100644 --- a/examples/mxnet/sgc/sgc.py +++ b/examples/mxnet/sgc/sgc.py @@ -10,8 +10,9 @@ import mxnet as mx from mxnet import nd, gluon from mxnet.gluon import nn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import 
register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgl.nn.mxnet.conv import SGConv @@ -22,13 +23,29 @@ def evaluate(model, g, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = nd.array(data.features) - labels = nd.array(data.labels) - train_mask = nd.array(data.train_mask) - val_mask = nd.array(data.val_mask) - test_mask = nd.array(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -43,22 +60,9 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - ctx = mx.cpu(0) - else: - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # graph preprocess and calculate normalization factor - g = DGLGraph(data.graph) - n_edges = g.number_of_edges() # add self loop - g.add_edges(g.nodes(), g.nodes()) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # create SGC model model = SGConv(in_feats, diff --git a/examples/mxnet/tagcn/train.py b/examples/mxnet/tagcn/train.py index 361b80df5641..b990ed13fe0d 100644 --- a/examples/mxnet/tagcn/train.py +++ b/examples/mxnet/tagcn/train.py @@ -4,8 +4,9 @@ import 
mxnet as mx from mxnet import gluon -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from tagcn import TAGCN @@ -16,12 +17,29 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = mx.nd.array(data.features) - labels = mx.nd.array(data.labels) - train_mask = mx.nd.array(data.train_mask) - val_mask = mx.nd.array(data.val_mask) - test_mask = mx.nd.array(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + g = g.to(ctx) + + features = g.ndata['feat'] + labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx) + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -36,26 +54,9 @@ def main(args): val_mask.sum().asscalar(), test_mask.sum().asscalar())) - if args.gpu < 0: - cuda = False - ctx = mx.cpu(0) - else: - cuda = True - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - # graph preprocess and calculate normalization factor - g = data.graph # add self loop - if args.self_loop: - g.remove_edges_from(nx.selfloop_edges(g)) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = DGLGraph(g) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # create TAGCN model 
model = TAGCN(g, diff --git a/examples/pytorch/appnp/train.py b/examples/pytorch/appnp/train.py index ed5cb2544b90..a09e6f623593 100644 --- a/examples/pytorch/appnp/train.py +++ b/examples/pytorch/appnp/train.py @@ -3,8 +3,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset import dgl from appnp import APPNP @@ -22,17 +22,27 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + else: + cuda = True + g = g.to(args.gpu) + + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -47,27 +57,10 @@ def main(args): val_mask.int().sum().item(), test_mask.int().sum().item())) - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = 
test_mask.cuda() - - # graph preprocess and calculate normalization factor - g = DGLGraph(data.graph) n_edges = g.number_of_edges() # add self loop - g.add_edges(g.nodes(), g.nodes()) - g.set_n_initializer(dgl.init.zero_initializer) - g.set_e_initializer(dgl.init.zero_initializer) - - if args.gpu >= 0: - g = g.to(args.gpu) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # create APPNP model model = APPNP(g, diff --git a/examples/pytorch/gat/train.py b/examples/pytorch/gat/train.py index c9e602fb9b8f..45551905f00b 100644 --- a/examples/pytorch/gat/train.py +++ b/examples/pytorch/gat/train.py @@ -14,8 +14,10 @@ import time import torch import torch.nn.functional as F -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset + from gat import GAT from utils import EarlyStopping @@ -37,23 +39,33 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + else: + cuda = True + g = g.to(args.gpu) + + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = 
g.ndata['test_mask'] num_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() print("""----Data statistics------' #Edges %d - #Classes %d + #Classes %d #Train samples %d #Val samples %d #Test samples %d""" % @@ -62,24 +74,9 @@ def main(args): val_mask.int().sum().item(), test_mask.int().sum().item())) - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = test_mask.cuda() - - g = data.graph # add self loop - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) - g.add_edges(g.nodes(), g.nodes()) - if cuda: - g = g.to(args.gpu) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # create model heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads] @@ -129,7 +126,7 @@ def main(args): else: val_acc = evaluate(model, features, labels, val_mask) if args.early_stop: - if stopper.step(val_acc, model): + if stopper.step(val_acc, model): break print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | TrainAcc {:.4f} |" diff --git a/examples/pytorch/gcn/gcn_mp.py b/examples/pytorch/gcn/gcn_mp.py index 77822819feac..e62e5dcb7c5a 100644 --- a/examples/pytorch/gcn/gcn_mp.py +++ b/examples/pytorch/gcn/gcn_mp.py @@ -11,8 +11,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset def gcn_msg(edge): @@ -116,17 +117,27 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - 
val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + else: + cuda = True + g = g.to(args.gpu) + + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -141,24 +152,11 @@ def main(args): val_mask.int().sum().item(), test_mask.int().sum().item())) - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = test_mask.cuda() - - # graph preprocess and calculate normalization factor - g = data.graph - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) # add self loop - g.add_edges(g.nodes(), g.nodes()) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() + # normalization degs = g.in_degrees().float() norm = torch.pow(degs, -0.5) diff --git a/examples/pytorch/gcn/train.py b/examples/pytorch/gcn/train.py index 5f9461944f36..4ad484e04b1e 100644 --- a/examples/pytorch/gcn/train.py +++ b/examples/pytorch/gcn/train.py @@ -4,8 +4,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, 
PubmedGraphDataset from gcn import GCN #from gcn_mp import GCN @@ -23,17 +24,27 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + else: + cuda = True + g = g.to(args.gpu) + + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -48,27 +59,12 @@ def main(args): val_mask.int().sum().item(), test_mask.int().sum().item())) - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = test_mask.cuda() - - # graph preprocess and calculate normalization factor - g = data.graph # add self loop if args.self_loop: - g.remove_edges_from(nx.selfloop_edges(g)) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = DGLGraph(g) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() - if cuda: - g = g.to(args.gpu) + # normalization degs = g.in_degrees().float() norm = torch.pow(degs, -0.5) diff --git a/examples/pytorch/sgc/sgc.py 
b/examples/pytorch/sgc/sgc.py index 8b3c9a3d990f..8abaefbfde2f 100644 --- a/examples/pytorch/sgc/sgc.py +++ b/examples/pytorch/sgc/sgc.py @@ -11,8 +11,9 @@ import torch.nn as nn import torch.nn.functional as F import dgl.function as fn -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgl.nn.pytorch.conv import SGConv @@ -27,17 +28,27 @@ def evaluate(model, g, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] + if args.gpu < 0: + cuda = False + else: + cuda = True + g = g.to(args.gpu) + + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -52,22 +63,10 @@ def main(args): val_mask.int().sum().item(), test_mask.int().sum().item())) - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = test_mask.cuda() - - # graph preprocess and 
calculate normalization factor - g = DGLGraph(data.graph) n_edges = g.number_of_edges() # add self loop - g.add_edges(g.nodes(), g.nodes()) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) # create SGC model model = SGConv(in_feats, @@ -78,7 +77,6 @@ def main(args): if cuda: model.cuda() - g = g.to(args.gpu) loss_fcn = torch.nn.CrossEntropyLoss() # use optimizer diff --git a/examples/tensorflow/dgi/train.py b/examples/tensorflow/dgi/train.py index 33357cc9fcfd..555138b75904 100644 --- a/examples/tensorflow/dgi/train.py +++ b/examples/tensorflow/dgi/train.py @@ -4,8 +4,9 @@ import networkx as nx import tensorflow as tf from tensorflow.keras import layers -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from dgi import DGI, Classifier @@ -20,28 +21,36 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + + g = data[0] if args.gpu < 0: device = "/cpu:0" else: device = "/gpu:{}".format(args.gpu) + g = g.to(device) + with tf.device(device): - features = tf.convert_to_tensor(data.features, dtype=tf.float32) - labels = tf.convert_to_tensor(data.labels, dtype=tf.int64) - train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool) - val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool) - test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool) + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges 
= data.graph.number_of_edges() - # graph preprocess - g = data.graph # add self loop if args.self_loop: - g.remove_edges_from(nx.selfloop_edges(g)) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = DGLGraph(g) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # create DGI model @@ -67,7 +76,7 @@ def main(args): with tf.GradientTape() as tape: loss = dgi(features) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. for weight in dgi.trainable_weights: @@ -115,10 +124,10 @@ def main(args): preds = classifier(embeds) loss = loss_fcn(labels[train_mask], preds[train_mask]) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. 
- # In original code, there's no weight decay applied in this part + # In original code, there's no weight decay applied in this part # link: https://github.com/PetarV-/DGI/blob/master/execute.py#L121 # for weight in classifier.trainable_weights: # loss = loss + \ diff --git a/examples/tensorflow/gat/README.md b/examples/tensorflow/gat/README.md index f29603f6538c..761f20e491c7 100644 --- a/examples/tensorflow/gat/README.md +++ b/examples/tensorflow/gat/README.md @@ -14,6 +14,7 @@ Dependencies ```bash pip install tensorflow requests +DGLBACKEND=tensorflow ``` How to run diff --git a/examples/tensorflow/gat/train.py b/examples/tensorflow/gat/train.py index 6d283d772f18..c3387824df62 100644 --- a/examples/tensorflow/gat/train.py +++ b/examples/tensorflow/gat/train.py @@ -15,8 +15,9 @@ import networkx as nx import time import tensorflow as tf -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from gat import GAT from utils import EarlyStopping @@ -35,26 +36,34 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + g = data[0] if args.gpu < 0: device = "/cpu:0" else: device = "/gpu:{}".format(args.gpu) + g = g.to(device) with tf.device(device): - - features = tf.convert_to_tensor(data.features, dtype=tf.float32) - labels = tf.convert_to_tensor(data.labels, dtype=tf.int64) - train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool) - val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool) - test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool) + features = 
g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] num_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() print("""----Data statistics------' #Edges %d - #Classes %d + #Classes %d #Train samples %d #Val samples %d #Test samples %d""" % @@ -63,11 +72,8 @@ def main(args): val_mask.numpy().sum(), test_mask.numpy().sum())) - g = data.graph - # add self loop - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g).to(device) - g.add_edges(g.nodes(), g.nodes()) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # create model heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads] @@ -106,7 +112,7 @@ def main(args): loss_value = tf.reduce_mean(loss_fcn( labels=labels[train_mask], logits=logits[train_mask])) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. for weight in model.trainable_weights: @@ -175,5 +181,5 @@ def main(args): help="skip re-evaluate the validation set") args = parser.parse_args() print(args) - + main(args) diff --git a/examples/tensorflow/gcn/README.md b/examples/tensorflow/gcn/README.md index 3eafc4cd4d69..a71c2534a2f8 100644 --- a/examples/tensorflow/gcn/README.md +++ b/examples/tensorflow/gcn/README.md @@ -2,8 +2,8 @@ Graph Convolutional Networks (GCN) ============ - Paper link: [https://arxiv.org/abs/1609.02907](https://arxiv.org/abs/1609.02907) -- Author's code repo: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn). Note that the original code is -implemented with Tensorflow for the paper. +- Author's code repo: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn). 
Note that the original code is +implemented with Tensorflow for the paper. Dependencies ------------ @@ -12,6 +12,7 @@ Dependencies ``bash pip install tensorflow requests +export DGLBACKEND=tensorflow `` Codes diff --git a/examples/tensorflow/gcn/gcn_builtin.py b/examples/tensorflow/gcn/gcn_builtin.py index d5d6354833ca..2cf03a6f632a 100644 --- a/examples/tensorflow/gcn/gcn_builtin.py +++ b/examples/tensorflow/gcn/gcn_builtin.py @@ -4,9 +4,10 @@ import numpy as np import networkx as nx import tensorflow as tf -from dgl import DGLGraph import dgl.function as fn -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from tensorflow.keras import layers @@ -94,19 +95,28 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + g = data[0] if args.gpu < 0: device = "/cpu:0" else: device = "/gpu:{}".format(args.gpu) + g = g.to(device) with tf.device(device): - features = tf.convert_to_tensor(data.features, dtype=tf.float32) - labels = tf.convert_to_tensor(data.labels, dtype=tf.int64) - train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool) - val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool) - test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool) + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -121,12 +131,9 @@ def main(args): val_mask.numpy().sum(), test_mask.numpy().sum())) - # graph 
preprocess and calculate normalization factor - g = data.graph - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) - # # add self loop - g.add_edges(g.nodes(), g.nodes()) + # add self loop + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # # normalization degs = tf.cast(tf.identity(g.in_degrees()), dtype=tf.float32) @@ -159,7 +166,7 @@ def main(args): logits = model(features) loss_value = loss_fcn(labels[train_mask], logits[train_mask]) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. for weight in model.trainable_weights: diff --git a/examples/tensorflow/gcn/gcn_mp.py b/examples/tensorflow/gcn/gcn_mp.py index d75cbfed63af..4f5208c3b64d 100644 --- a/examples/tensorflow/gcn/gcn_mp.py +++ b/examples/tensorflow/gcn/gcn_mp.py @@ -4,9 +4,9 @@ import numpy as np import networkx as nx import tensorflow as tf -from dgl import DGLGraph -import dgl.function as fn -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from tensorflow.keras import layers def gcn_msg(edge): @@ -100,19 +100,28 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + g = data[0] if args.gpu < 0: device = "/cpu:0" else: device = "/gpu:{}".format(args.gpu) + g = g.to(device) with tf.device(device): - features = 
tf.convert_to_tensor(data.features, dtype=tf.float32) - labels = tf.convert_to_tensor(data.labels, dtype=tf.int64) - train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool) - val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool) - test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool) + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask = g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -127,12 +136,11 @@ def main(args): val_mask.numpy().sum(), test_mask.numpy().sum())) - # graph preprocess and calculate normalization factor - g = data.graph - g.remove_edges_from(nx.selfloop_edges(g)) - g = DGLGraph(g) - # # add self loop - g.add_edges(g.nodes(), g.nodes()) + # add self loop + if args.self_loop: + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) + n_edges = g.number_of_edges() n_edges = g.number_of_edges() # # normalization degs = tf.cast(tf.identity(g.in_degrees()), dtype=tf.float32) @@ -165,7 +173,7 @@ def main(args): logits = model(features) loss_value = loss_fcn(labels[train_mask], logits[train_mask]) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. for weight in model.trainable_weights: @@ -180,7 +188,7 @@ def main(args): acc = evaluate(model, features, labels, val_mask) print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " "ETputs(KTEPS) {:.2f}". 
format(epoch, np.mean(dur), loss_value.numpy().item(), - acc, n_edges / np.mean(dur) / 1000)) + acc, n_edges / np.mean(dur) / 1000)) acc = evaluate(model, features, labels, test_mask) print("Test Accuracy {:.4f}".format(acc)) @@ -203,6 +211,8 @@ def main(args): help="number of hidden gcn layers") parser.add_argument("--weight-decay", type=float, default=5e-4, help="Weight for L2 loss") + parser.add_argument("--self-loop", action='store_true', + help="graph self-loop (default=False)") args = parser.parse_args() print(args) diff --git a/examples/tensorflow/gcn/train.py b/examples/tensorflow/gcn/train.py index f51124029a4a..27bce76525b9 100644 --- a/examples/tensorflow/gcn/train.py +++ b/examples/tensorflow/gcn/train.py @@ -3,8 +3,9 @@ import numpy as np import networkx as nx import tensorflow as tf -from dgl import DGLGraph -from dgl.data import register_data_args, load_data +import dgl +from dgl.data import register_data_args +from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset from gcn import GCN @@ -19,19 +20,28 @@ def evaluate(model, features, labels, mask): def main(args): # load and preprocess dataset - data = load_data(args) + if args.dataset == 'cora': + data = CoraGraphDataset() + elif args.dataset == 'citeseer': + data = CiteseerGraphDataset() + elif args.dataset == 'pubmed': + data = PubmedGraphDataset() + else: + raise ValueError('Unknown dataset: {}'.format(args.dataset)) + g = data[0] if args.gpu < 0: device = "/cpu:0" else: device = "/gpu:{}".format(args.gpu) + g = g.to(device) with tf.device(device): - features = tf.convert_to_tensor(data.features, dtype=tf.float32) - labels = tf.convert_to_tensor(data.labels, dtype=tf.int64) - train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool) - val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool) - test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool) + features = g.ndata['feat'] + labels = g.ndata['label'] + train_mask = g.ndata['train_mask'] + val_mask 
= g.ndata['val_mask'] + test_mask = g.ndata['test_mask'] in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() @@ -46,12 +56,10 @@ def main(args): val_mask.numpy().sum(), test_mask.numpy().sum())) - # graph preprocess and calculate normalization factor - g = data.graph + # add self loop if args.self_loop: - g.remove_edges_from(nx.selfloop_edges(g)) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = DGLGraph(g).to(device) + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) n_edges = g.number_of_edges() # normalization degs = tf.cast(tf.identity(g.in_degrees()), dtype=tf.float32) @@ -85,7 +93,7 @@ def main(args): logits = model(features) loss_value = loss_fcn(labels[train_mask], logits[train_mask]) # Manually Weight Decay - # We found Tensorflow has a different implementation on weight decay + # We found Tensorflow has a different implementation on weight decay # of Adam(W) optimizer with PyTorch. And this results in worse results. # Manually adding weights to the loss to do weight decay solves this problem. for weight in model.trainable_weights: diff --git a/python/dgl/data/__init__.py b/python/dgl/data/__init__.py index 69b44ef1639d..a84b6f8e98be 100644 --- a/python/dgl/data/__init__.py +++ b/python/dgl/data/__init__.py @@ -17,6 +17,7 @@ from .gdelt import GDELT from .icews18 import ICEWS18 from .qm7b import QM7b +from .citation_graph import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset def register_data_args(parser): @@ -27,7 +28,6 @@ def register_data_args(parser): help= "The input dataset. 
Can be cora, citeseer, pubmed, syn(synthetic dataset) or reddit" ) - citegrh.register_args(parser) def load_data(args): @@ -37,8 +37,6 @@ def load_data(args): return citegrh.load_citeseer() elif args.dataset == 'pubmed': return citegrh.load_pubmed() - elif args.dataset == 'syn': - return citegrh.load_synthetic(args) elif args.dataset is not None and args.dataset.startswith('reddit'): return RedditDataset(self_loop=('self-loop' in args.dataset)) else: diff --git a/python/dgl/data/citation_graph.py b/python/dgl/data/citation_graph.py index 9ee313e98968..f09fa837bab7 100644 --- a/python/dgl/data/citation_graph.py +++ b/python/dgl/data/citation_graph.py @@ -11,18 +11,17 @@ import scipy.sparse as sp import os, sys -from .utils import download, extract_archive, get_download_dir, _get_dgl_url -from ..utils import retry_method_with_fix +from .utils import save_graphs, load_graphs, save_info, load_info, makedirs, _get_dgl_url +from .utils import generate_mask_tensor +from .utils import deprecate_property, deprecate_function +from .dgl_dataset import DGLBuiltinDataset from .. import convert from .. import batch from .. import backend as F +from ..convert import graph as dgl_graph +from ..convert import to_networkx -_urls = { - 'cora_v2' : 'dataset/cora_v2.zip', - 'citeseer' : 'dataset/citeseer.zip', - 'pubmed' : 'dataset/pubmed.zip', - 'cora_binary' : 'dataset/cora_binary.zip', -} +backend = os.environ.get('DGLBACKEND', 'pytorch') def _pickle_load(pkl_file): if sys.version_info > (3, 0): @@ -30,7 +29,7 @@ def _pickle_load(pkl_file): else: return pkl.load(pkl_file) -class CitationGraphDataset(object): +class CitationGraphDataset(DGLBuiltinDataset): r"""The citation graph dataset, including cora, citeseer and pubmeb. Nodes mean authors and edges mean citation relationships. @@ -38,8 +37,21 @@ class CitationGraphDataset(object): ----------- name: str name can be 'cora', 'citeseer' or 'pubmed'. + raw_dir : str + Raw file directory to download/contains the input data directory. 
+ Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. """ - def __init__(self, name): + _urls = { + 'cora_v2' : 'dataset/cora_v2.zip', + 'citeseer' : 'dataset/citeseer.zip', + 'pubmed' : 'dataset/pubmed.zip', + } + + def __init__(self, name, raw_dir=None, force_reload=False, verbose=True): assert name.lower() in ['cora', 'citeseer', 'pubmed'] # Previously we use the pre-processing in pygcn (https://github.com/tkipf/pygcn) @@ -47,18 +59,15 @@ def __init__(self, name): if name.lower() == 'cora': name = 'cora_v2' - self.name = name - self.dir = get_download_dir() - self.zip_file_path='{}/{}.zip'.format(self.dir, name) - self._load() - - def _download_and_extract(self): - download(_get_dgl_url(_urls[self.name]), path=self.zip_file_path) - extract_archive(self.zip_file_path, '{}/{}'.format(self.dir, self.name)) + url = _get_dgl_url(self._urls[name]) + super(CitationGraphDataset, self).__init__(name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose) - @retry_method_with_fix(_download_and_extract) - def _load(self): - """Loads input data from gcn/data directory + def process(self): + """Loads input data from data directory ind.name.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.name.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; @@ -70,13 +79,8 @@ def _load(self): ind.name.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.name.test.index => the indices of test instances in graph, for the inductive setting as list object. - - All objects above must be saved using python pickle module. - - :param name: Dataset name - :return: All data input files loaded (as well the training/test data). 
""" - root = '{}/{}'.format(self.dir, self.name) + root = self.raw_path objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(objnames)): @@ -114,37 +118,135 @@ def _load(self): val_mask = _sample_mask(idx_val, labels.shape[0]) test_mask = _sample_mask(idx_test, labels.shape[0]) - self.graph = graph - self.features = _preprocess_features(features) - self.labels = labels - self.onehot_labels = onehot_labels - self.num_labels = onehot_labels.shape[1] - self.train_mask = train_mask - self.val_mask = val_mask - self.test_mask = test_mask - - print('Finished data loading and preprocessing.') - print(' NumNodes: {}'.format(self.graph.number_of_nodes())) - print(' NumEdges: {}'.format(self.graph.number_of_edges())) - print(' NumFeats: {}'.format(self.features.shape[1])) - print(' NumClasses: {}'.format(self.num_labels)) - print(' NumTrainingSamples: {}'.format(len(np.nonzero(self.train_mask)[0]))) - print(' NumValidationSamples: {}'.format(len(np.nonzero(self.val_mask)[0]))) - print(' NumTestSamples: {}'.format(len(np.nonzero(self.test_mask)[0]))) + self._graph = graph + g = dgl_graph(graph) + + g.ndata['train_mask'] = generate_mask_tensor(train_mask) + g.ndata['val_mask'] = generate_mask_tensor(val_mask) + g.ndata['test_mask'] = generate_mask_tensor(test_mask) + g.ndata['label'] = F.tensor(labels) + g.ndata['feat'] = F.tensor(_preprocess_features(features), dtype=F.data_type_dict['float32']) + self._num_labels = onehot_labels.shape[1] + self._labels = labels + self._g = g + + if self.verbose: + print('Finished data loading and preprocessing.') + print(' NumNodes: {}'.format(self._g.number_of_nodes())) + print(' NumEdges: {}'.format(self._g.number_of_edges())) + print(' NumFeats: {}'.format(self._g.ndata['feat'].shape[1])) + print(' NumClasses: {}'.format(self.num_labels)) + print(' NumTrainingSamples: {}'.format( + F.nonzero_1d(self._g.ndata['train_mask']).shape[0])) + print(' NumValidationSamples: {}'.format( + 
F.nonzero_1d(self._g.ndata['val_mask']).shape[0])) + print(' NumTestSamples: {}'.format( + F.nonzero_1d(self._g.ndata['test_mask']).shape[0])) + + def has_cache(self): + graph_path = os.path.join(self.save_path, + self.save_name + '.bin') + info_path = os.path.join(self.save_path, + self.save_name + '.pkl') + if os.path.exists(graph_path) and \ + os.path.exists(info_path): + return True + + return False + + def save(self): + """save the graph list and the labels""" + graph_path = os.path.join(self.save_path, + self.save_name + '.bin') + info_path = os.path.join(self.save_path, + self.save_name + '.pkl') + save_graphs(str(graph_path), self._g) + save_info(str(info_path), {'num_labels': self.num_labels}) + + def load(self): + graph_path = os.path.join(self.save_path, + self.save_name + '.bin') + info_path = os.path.join(self.save_path, + self.save_name + '.pkl') + graphs, _ = load_graphs(str(graph_path)) + + info = load_info(str(info_path)) + self._g = graphs[0] + graph = graph.clone() + graph.pop('train_mask') + graph.pop('val_mask') + graph.pop('test_mask') + graph.pop('feat') + graph.pop('label') + graph = to_networkx(graph) + self._graph = nx.DiGraph(graph) + + self._num_labels = info['num_labels'] + self._g.ndata['train_mask'] = generate_mask_tensor(self._g.ndata['train_mask'].numpy()) + self._g.ndata['val_mask'] = generate_mask_tensor(self._g.ndata['val_mask'].numpy()) + self._g.ndata['test_mask'] = generate_mask_tensor(self._g.ndata['test_mask'].numpy()) + # hack for mxnet compatability + + if self.verbose: + print(' NumNodes: {}'.format(self._g.number_of_nodes())) + print(' NumEdges: {}'.format(self._g.number_of_edges())) + print(' NumFeats: {}'.format(self._g.ndata['feat'].shape[1])) + print(' NumClasses: {}'.format(self.num_labels)) + print(' NumTrainingSamples: {}'.format( + F.nonzero_1d(self._g.ndata['train_mask']).shape[0])) + print(' NumValidationSamples: {}'.format( + F.nonzero_1d(self._g.ndata['val_mask']).shape[0])) + print(' NumTestSamples: 
{}'.format( + F.nonzero_1d(self._g.ndata['test_mask']).shape[0])) def __getitem__(self, idx): assert idx == 0, "This dataset has only one graph" - g = convert.graph(self.graph) - g.ndata['train_mask'] = F.tensor(self.train_mask, F.bool) - g.ndata['val_mask'] = F.tensor(self.val_mask, F.bool) - g.ndata['test_mask'] = F.tensor(self.test_mask, F.bool) - g.ndata['label'] = F.tensor(self.labels, F.int64) - g.ndata['feat'] = F.tensor(self.features, F.float32) - return g + return self._g def __len__(self): return 1 + @property + def save_name(self): + return self.name + '_dgl_graph' + + @property + def num_labels(self): + return self._num_labels + + """ Citation graph is used in many examples + We preserve these properties for compatability. + """ + @property + def graph(self): + deprecate_property('dataset.graph', 'dataset.g') + return self._graph + + @property + def train_mask(self): + deprecate_property('dataset.train_mask', 'g.ndata[\'train_mask\']') + return F.asnumpy(self._g.ndata['train_mask']) + + @property + def val_mask(self): + deprecate_property('dataset.val_mask', 'g.ndata[\'val_mask\']') + return F.asnumpy(self._g.ndata['val_mask']) + + @property + def test_mask(self): + deprecate_property('dataset.test_mask', 'g.ndata[\'test_mask\']') + return F.asnumpy(self._g.ndata['test_mask']) + + @property + def labels(self): + deprecate_property('dataset.label', 'g.ndata[\'label\']') + return F.asnumpy(self._g.ndata['label']) + + @property + def features(self): + deprecate_property('dataset.feat', 'g.ndata[\'feat\']') + return self._g.ndata['feat'] + def _preprocess_features(features): """Row-normalize feature matrix and convert to tuple representation""" rowsum = np.asarray(features.sum(1)) @@ -167,139 +269,436 @@ def _sample_mask(idx, l): mask[idx] = 1 return mask -def load_cora(): - data = CitationGraphDataset('cora') - return data +class CoraGraphDataset(CitationGraphDataset): + r""" Cora citation network dataset. + + .. 
deprecated:: 0.5.0 + `graph` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + `train_mask` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + >>> train_mask = graph.ndata['train_mask'] + `val_mask` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + >>> val_mask = graph.ndata['val_mask'] + `test_mask` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + >>> test_mask = graph.ndata['test_mask'] + `labels` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + >>> labels = graph.ndata['label'] + `feat` is deprecated, it is replaced by: + >>> dataset = CoraGraphDataset() + >>> graph = dataset[0] + >>> feat = graph.ndata['feat'] + + Nodes mean paper and edges mean citation + relationships. Each node has a predefined + feature with 1433 dimensions. The dataset is + designed for the node classification task. + The task is to predict the category of + certain paper. + + Statistics + ---------- + Nodes: 2708 + Edges: 10556 + Number of Classes: 7 + Label Split: Train: 140 ,Valid: 500, Test: 1000 -def load_citeseer(): - data = CitationGraphDataset('citeseer') - return data + Parameters + ---------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. + + Attributes + ---------- + num_labels: int + Number of label classes + graph: networkx.DiGraph + Graph structure + train_mask: Numpy array + Mask of training nodes + val_mask: Numpy array + Mask of validation nodes + test_mask: Numpy array + Mask of test nodes + labels: Numpy array + Ground truth labels of each node + features: Tensor + Node features + + Notes + ----- + The node feature is row-normalized. 
+ + Examples + -------- + >>> dataset = CoraGraphDataset() + >>> g = dataset.graph + >>> num_class = g.num_labels + >>> + >>> # get node feature + >>> feat = g.ndata['feat'] + >>> + >>> # get data split + >>> train_mask = g.ndata['train_mask'] + >>> val_mask = g.ndata['val_mask'] + >>> test_mask = g.ndata['test_mask'] + >>> + >>> # get labels + >>> label = g.ndata['label'] + >>> + >>> # Train, Validation and Test -def load_pubmed(): - data = CitationGraphDataset('pubmed') - return data + """ + def __init__(self, raw_dir=None, force_reload=False, verbose=True): + name = 'cora' -class GCNSyntheticDataset(object): - def __init__(self, - graph_generator, - num_feats=500, - num_classes=10, - train_ratio=1., - val_ratio=0., - test_ratio=0., - seed=None): - rng = np.random.RandomState(seed) - # generate graph - self.graph = graph_generator(seed) - num_nodes = self.graph.number_of_nodes() - - # generate features - #self.features = rng.randn(num_nodes, num_feats).astype(np.float32) - self.features = np.zeros((num_nodes, num_feats), dtype=np.float32) - - # generate labels - self.labels = rng.randint(num_classes, size=num_nodes) - onehot_labels = np.zeros((num_nodes, num_classes), dtype=np.float32) - onehot_labels[np.arange(num_nodes), self.labels] = 1. 
- self.onehot_labels = onehot_labels - self.num_labels = num_classes - - # generate masks - ntrain = int(num_nodes * train_ratio) - nval = int(num_nodes * val_ratio) - ntest = int(num_nodes * test_ratio) - mask_array = np.zeros((num_nodes,), dtype=np.int32) - mask_array[0:ntrain] = 1 - mask_array[ntrain:ntrain+nval] = 2 - mask_array[ntrain+nval:ntrain+nval+ntest] = 3 - rng.shuffle(mask_array) - self.train_mask = (mask_array == 1).astype(np.int32) - self.val_mask = (mask_array == 2).astype(np.int32) - self.test_mask = (mask_array == 3).astype(np.int32) - - print('Finished synthetic dataset generation.') - print(' NumNodes: {}'.format(self.graph.number_of_nodes())) - print(' NumEdges: {}'.format(self.graph.number_of_edges())) - print(' NumFeats: {}'.format(self.features.shape[1])) - print(' NumClasses: {}'.format(self.num_labels)) - print(' NumTrainingSamples: {}'.format(len(np.nonzero(self.train_mask)[0]))) - print(' NumValidationSamples: {}'.format(len(np.nonzero(self.val_mask)[0]))) - print(' NumTestSamples: {}'.format(len(np.nonzero(self.test_mask)[0]))) + super(CoraGraphDataset, self).__init__(name, raw_dir, force_reload, verbose) def __getitem__(self, idx): - return self + r"""Gets the graph object + + Parameters + ----------- + idx: int + Item index, CoraGraphDataset has only one graph object + + Return + ------ + dgl.DGLGraph + graph structure, node features and labels. + - ndata['train_mask']: mask for training node set + - ndata['val_mask']: mask for validation node set + - ndata['test_mask']: mask for test node set + - ndata['feat']: node feature + - ndata['label']: ground truth labels + """ + return super(CoraGraphDataset, self).__getitem__(idx) def __len__(self): - return 1 + r"""The number of graphs in the dataset.""" + return super(CoraGraphDataset, self).__len__() + +class CiteseerGraphDataset(CitationGraphDataset): + r""" Citeseer citation network dataset. + + .. 
deprecated:: 0.5.0 + `graph` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + `train_mask` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + >>> train_mask = graph.ndata['train_mask'] + `val_mask` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + >>> val_mask = graph.ndata['val_mask'] + `test_mask` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + >>> test_mask = graph.ndata['test_mask'] + `labels` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + >>> labels = graph.ndata['label'] + `feat` is deprecated, it is replaced by: + >>> dataset = CiteseerGraphDataset() + >>> graph = dataset[0] + >>> feat = graph.ndata['feat'] + + Nodes mean scientific publications and edges + mean citation relationships. Each node has a + predefined feature with 3703 dimensions. The + dataset is designed for the node classification + task. The task is to predict the category of + certain publication. + + Statistics + ---------- + Nodes: 3327 + Edges: 9228 + Number of Classes: 6 + Label Split: Train: 120 ,Valid: 500, Test: 1000 -def get_gnp_generator(args): - n = args.syn_gnp_n - p = (2 * np.log(n) / n) if args.syn_gnp_p == 0. else args.syn_gnp_p - def _gen(seed): - return nx.fast_gnp_random_graph(n, p, seed, True) - return _gen - -class ScipyGraph(object): - """A simple graph object that uses scipy matrix.""" - def __init__(self, mat): - self._mat = mat - - def get_graph(self): - return self._mat - - def number_of_nodes(self): - return self._mat.shape[0] - - def number_of_edges(self): - return self._mat.getnnz() - -def get_scipy_generator(args): - n = args.syn_gnp_n - p = (2 * np.log(n) / n) if args.syn_gnp_p == 0. 
else args.syn_gnp_p - def _gen(seed): - return ScipyGraph(sp.random(n, n, p, format='coo')) - return _gen - -def load_synthetic(args): - ty = args.syn_type - if ty == 'gnp': - gen = get_gnp_generator(args) - elif ty == 'scipy': - gen = get_scipy_generator(args) - else: - raise ValueError('Unknown graph generator type: {}'.format(ty)) - return GCNSyntheticDataset( - gen, - args.syn_nfeats, - args.syn_nclasses, - args.syn_train_ratio, - args.syn_val_ratio, - args.syn_test_ratio, - args.syn_seed) - -def register_args(parser): - # Args for synthetic graphs. - parser.add_argument('--syn-type', type=str, default='gnp', - help='Type of the synthetic graph generator') - parser.add_argument('--syn-nfeats', type=int, default=500, - help='Number of node features') - parser.add_argument('--syn-nclasses', type=int, default=10, - help='Number of output classes') - parser.add_argument('--syn-train-ratio', type=float, default=.1, - help='Ratio of training nodes') - parser.add_argument('--syn-val-ratio', type=float, default=.2, - help='Ratio of validation nodes') - parser.add_argument('--syn-test-ratio', type=float, default=.5, - help='Ratio of testing nodes') - # Args for GNP generator - parser.add_argument('--syn-gnp-n', type=int, default=1000, - help='n in gnp random graph') - parser.add_argument('--syn-gnp-p', type=float, default=0.0, - help='p in gnp random graph') - parser.add_argument('--syn-seed', type=int, default=42, - help='random seed') - -class CoraBinary(object): + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. 
+ + Attributes + ---------- + num_labels: int + Number of label classes + graph: networkx.DiGraph + Graph structure + train_mask: Numpy array + Mask of training nodes + val_mask: Numpy array + Mask of validation nodes + test_mask: Numpy array + Mask of test nodes + labels: Numpy array + Ground truth labels of each node + features: Tensor + Node features + + Notes + ----- + The node feature is row-normalized. + + In citeseer dataset, there are some isolated nodes in the graph. + These isolated nodes are added as zero-vecs into the right position. + + Examples + -------- + >>> dataset = CiteseerGraphDataset() + >>> g = dataset[0] + >>> num_class = dataset.num_labels + >>> + >>> # get node feature + >>> feat = g.ndata['feat'] + >>> + >>> # get data split + >>> train_mask = g.ndata['train_mask'] + >>> val_mask = g.ndata['val_mask'] + >>> test_mask = g.ndata['test_mask'] + >>> + >>> # get labels + >>> label = g.ndata['label'] + >>> + >>> # Train, Validation and Test + + """ + def __init__(self, raw_dir=None, force_reload=False, verbose=True): + name = 'citeseer' + + super(CiteseerGraphDataset, self).__init__(name, raw_dir, force_reload, verbose) + + def __getitem__(self, idx): + r"""Gets the graph object + + Parameters + ----------- + idx: int + Item index, CiteseerGraphDataset has only one graph object + + Return + ------ + dgl.DGLGraph + graph structure, node features and labels. + - ndata['train_mask']: mask for training node set + - ndata['val_mask']: mask for validation node set + - ndata['test_mask']: mask for test node set + - ndata['feat']: node feature + - ndata['label']: ground truth labels + """ + return super(CiteseerGraphDataset, self).__getitem__(idx) + + def __len__(self): + r"""The number of graphs in the dataset.""" + return super(CiteseerGraphDataset, self).__len__() + +class PubmedGraphDataset(CitationGraphDataset): + r""" Pubmed citation network dataset. + + ..
deprecated:: 0.5.0 + `graph` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + `train_mask` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + >>> train_mask = graph.ndata['train_mask'] + `val_mask` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + >>> val_mask = graph.ndata['val_mask'] + `test_mask` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + >>> test_mask = graph.ndata['test_mask'] + `labels` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + >>> labels = graph.ndata['label'] + `feat` is deprecated, it is replaced by: + >>> dataset = PubmedGraphDataset() + >>> graph = dataset[0] + >>> feat = graph.ndata['feat'] + + Nodes mean scientific publications and edges + mean citation relationships. Each node has a + predefined feature with 500 dimensions. The + dataset is designed for the node classification + task. The task is to predict the category of + certain publication. + + Statistics + ---------- + Nodes: 19717 + Edges: 88651 + Number of Classes: 3 + Label Split: Train: 60 ,Valid: 500, Test: 1000 + + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. + + Attributes + ---------- + num_labels: int + Number of label classes + graph: networkx.DiGraph + Graph structure + train_mask: Numpy array + Mask of training nodes + val_mask: Numpy array + Mask of validation nodes + test_mask: Numpy array + Mask of test nodes + labels: Numpy array + Ground truth labels of each node + features: Tensor + Node features + + Notes + ----- + The node feature is row-normalized. 
+ + Examples + -------- + >>> dataset = PubmedGraphDataset() + >>> g = dataset[0] + >>> num_class = dataset.num_labels + >>> + >>> # get node feature + >>> feat = g.ndata['feat'] + >>> + >>> # get data split + >>> train_mask = g.ndata['train_mask'] + >>> val_mask = g.ndata['val_mask'] + >>> test_mask = g.ndata['test_mask'] + >>> + >>> # get labels + >>> label = g.ndata['label'] + >>> + >>> # Train, Validation and Test + + """ + def __init__(self, raw_dir=None, force_reload=False, verbose=True): + name = 'pubmed' + + super(PubmedGraphDataset, self).__init__(name, raw_dir, force_reload, verbose) + + def __getitem__(self, idx): + r"""Gets the graph object + + Parameters + ----------- + idx: int + Item index, PubmedGraphDataset has only one graph object + + Return + ------ + dgl.DGLGraph + graph structure, node features and labels. + - ndata['train_mask']: mask for training node set + - ndata['val_mask']: mask for validation node set + - ndata['test_mask']: mask for test node set + - ndata['feat']: node feature + - ndata['label']: ground truth labels + """ + return super(PubmedGraphDataset, self).__getitem__(idx) + + def __len__(self): + r"""The number of graphs in the dataset.""" + return super(PubmedGraphDataset, self).__len__() + +def load_cora(raw_dir=None, force_reload=False, verbose=True): + """Get CoraGraphDataset + + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. + + Return + ------- + CoraGraphDataset + """ + data = CoraGraphDataset(raw_dir, force_reload, verbose) + return data + +def load_citeseer(raw_dir=None, force_reload=False, verbose=True): + """Get CiteseerGraphDataset + + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory.
+ Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. + + Return + ------- + CiteseerGraphDataset + """ + data = CiteseerGraphDataset(raw_dir, force_reload, verbose) + return data + +def load_pubmed(raw_dir=None, force_reload=False, verbose=True): + """Get PubmedGraphDataset + + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. + + Return + ------- + PubmedGraphDataset + """ + data = PubmedGraphDataset(raw_dir, force_reload, verbose) + return data + +class CoraBinary(DGLBuiltinDataset): """A mini-dataset for binary classification task using Cora. After loaded, it has following members: @@ -307,20 +706,28 @@ class CoraBinary(object): graphs : list of :class:`~dgl.DGLGraph` pmpds : list of :class:`scipy.sparse.coo_matrix` labels : list of :class:`numpy.ndarray` + + Parameters + ----------- + raw_dir : str + Raw file directory to download/contains the input data directory. + Default: ~/.dgl/ + force_reload : bool + Whether to reload the dataset. Default: False + verbose: bool + Whether to print out progress information. Default: True. 
""" - def __init__(self): - self.dir = get_download_dir() - self.name = 'cora_binary' - self.zip_file_path='{}/{}.zip'.format(self.dir, self.name) - self._load() - - def _download_and_extract(self): - download(_get_dgl_url(_urls[self.name]), path=self.zip_file_path) - extract_archive(self.zip_file_path, '{}/{}'.format(self.dir, self.name)) - - @retry_method_with_fix(_download_and_extract) - def _load(self): - root = '{}/{}'.format(self.dir, self.name) + def __init__(self, raw_dir=None, force_reload=False, verbose=True): + name = 'cora_binary' + url = _get_dgl_url('dataset/cora_binary.zip') + super(CoraBinary, self).__init__(name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose) + + def process(self): + root = self.raw_path # load graphs self.graphs = [] with open("{}/graphs.txt".format(root), 'r') as f: @@ -328,13 +735,13 @@ def _load(self): for line in f.readlines(): if line.startswith('graph'): if len(elist) != 0: - self.graphs.append(convert.graph(elist)) + self.graphs.append(dgl_graph(elist)) elist = [] else: u, v = line.strip().split(' ') elist.append((int(u), int(v))) if len(elist) != 0: - self.graphs.append(convert.graph(elist)) + self.graphs.append(dgl_graph(elist)) with open("{}/pmpds.pkl".format(root), 'rb') as f: self.pmpds = _pickle_load(f) self.labels = [] @@ -353,12 +760,64 @@ def _load(self): assert len(self.graphs) == len(self.pmpds) assert len(self.graphs) == len(self.labels) + def has_cache(self): + graph_path = os.path.join(self.save_path, + self.save_name + '.bin') + if os.path.exists(graph_path): + return True + + return False + + def save(self): + """save the graph list and the labels""" + graph_path = os.path.join(self.save_path, + self.save_name + '.bin') + labels = {} + for i, label in enumerate(self.labels): + labels['{}'.format(i)] = F.tensor(label) + save_graphs(str(graph_path), self.graphs, labels) + if self.verbose: + print('Done saving data into cached files.') + + def load(self): + graph_path = 
os.path.join(self.save_path, + self.save_name + '.bin') + self.graphs, labels = load_graphs(str(graph_path)) + + self.labels = [] + for i in range(len(labels)): + self.labels.append(labels['{}'.format(i)].asnumpy()) + # load pmpds under self.raw_path + with open("{}/pmpds.pkl".format(self.raw_path), 'rb') as f: + self.pmpds = _pickle_load(f) + if self.verbose: + print('Done loading data into cached files.') + # sanity check + assert len(self.graphs) == len(self.pmpds) + assert len(self.graphs) == len(self.labels) + def __len__(self): return len(self.graphs) def __getitem__(self, i): + r"""Gets the i-th sample. + + Parameters + ----------- + i : int + The sample index. + + Returns + ------- + (dgl.DGLGraph, scipy.sparse.coo_matrix, int) + The graph, scipy sparse coo_matrix and its label. + """ return (self.graphs[i], self.pmpds[i], self.labels[i]) + @property + def save_name(self): + return self.name + '_dgl_graph' + @staticmethod def collate_fn(cur): graphs, pmpds, labels = zip(*cur) diff --git a/python/dgl/data/rdf.py b/python/dgl/data/rdf.py index 7d976572edee..32fe08ad0a91 100644 --- a/python/dgl/data/rdf.py +++ b/python/dgl/data/rdf.py @@ -249,6 +249,7 @@ def findidfn(ent): # save for compatability self._train_idx = F.tensor(train_idx) self._test_idx = F.tensor(test_idx) + self._labels = labels def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes): """Build the graphs @@ -638,17 +639,17 @@ def __getitem__(self, idx): Return ------- - dgl.DGLGraph - graph structure, node features and labels.
+ - ndata['train_mask']: mask for training node set + - ndata['test_mask']: mask for testing node set + - ndata['labels']: mask for labels """ return super(AIFBDataset, self).__getitem__(idx) def __len__(self): r"""The number of graphs in the dataset.""" - return super(AIFBDataset, self).__len__(idx) + return super(AIFBDataset, self).__len__() def parse_entity(self, term): if isinstance(term, rdf.Literal): @@ -801,17 +802,17 @@ def __getitem__(self, idx): Return ------- - dgl.DGLGraph - graph structure, node features and labels. - - ndata['train_mask']: mask for training node set - - ndata['test_mask']: mask for testing node set - - ndata['labels']: mask for labels + dgl.DGLGraph + graph structure, node features and labels. + - ndata['train_mask']: mask for training node set + - ndata['test_mask']: mask for testing node set + - ndata['labels']: mask for labels """ return super(MUTAGDataset, self).__getitem__(idx) def __len__(self): r"""The number of graphs in the dataset.""" - return super(MUTAGDataset, self).__len__(idx) + return super(MUTAGDataset, self).__len__() def parse_entity(self, term): if isinstance(term, rdf.Literal): @@ -980,17 +981,17 @@ def __getitem__(self, idx): Return ------- - dgl.DGLGraph - graph structure, node features and labels. - - ndata['train_mask']: mask for training node set - - ndata['test_mask']: mask for testing node set - - ndata['labels']: mask for labels + dgl.DGLGraph + graph structure, node features and labels. 
+ - ndata['train_mask']: mask for training node set + - ndata['test_mask']: mask for testing node set + - ndata['labels']: mask for labels """ return super(BGSDataset, self).__getitem__(idx) def __len__(self): r"""The number of graphs in the dataset.""" - return super(BGSDataset, self).__len__(idx) + return super(BGSDataset, self).__len__() def parse_entity(self, term): if isinstance(term, rdf.Literal): @@ -1155,17 +1156,17 @@ def __getitem__(self, idx): Return ------- - dgl.DGLGraph - graph structure, node features and labels. - - ndata['train_mask']: mask for training node set - - ndata['test_mask']: mask for testing node set - - ndata['labels']: mask for labels + dgl.DGLGraph + graph structure, node features and labels. + - ndata['train_mask']: mask for training node set + - ndata['test_mask']: mask for testing node set + - ndata['labels']: mask for labels """ return super(AMDataset, self).__getitem__(idx) def __len__(self): r"""The number of graphs in the dataset.""" - return super(AMDataset, self).__len__(idx) + return super(AMDataset, self).__len__() def parse_entity(self, term): if isinstance(term, rdf.Literal):