diff --git a/examples/pytorch/tree_lstm/train.py b/examples/pytorch/tree_lstm/train.py
index cdd29ffed537..4c1e9663fd88 100644
--- a/examples/pytorch/tree_lstm/train.py
+++ b/examples/pytorch/tree_lstm/train.py
@@ -8,7 +8,7 @@
 from torch.utils.data import DataLoader
 
 import dgl
-import dgl.data as data
+from dgl.data.tree import SST
 from tree_lstm import TreeLSTM
 
@@ -25,22 +25,22 @@ def main(args):
     if cuda:
         th.cuda.set_device(args.gpu)
 
-    trainset = data.SST()
+    trainset = SST()
     train_loader = DataLoader(dataset=trainset,
                               batch_size=args.batch_size,
-                              collate_fn=data.SST.batcher(device),
+                              collate_fn=SST.batcher(device),
                               shuffle=True,
                               num_workers=0)
-    devset = data.SST(mode='dev')
+    devset = SST(mode='dev')
     dev_loader = DataLoader(dataset=devset,
                             batch_size=100,
-                            collate_fn=data.SST.batcher(device),
+                            collate_fn=SST.batcher(device),
                             shuffle=False,
                             num_workers=0)
-    testset = data.SST(mode='test')
+    testset = SST(mode='test')
     test_loader = DataLoader(dataset=testset,
-                             batch_size=100, collate_fn=data.SST.batcher(device), shuffle=False, num_workers=0)
+                             batch_size=100, collate_fn=SST.batcher(device), shuffle=False, num_workers=0)
 
     model = TreeLSTM(trainset.num_vocabs,
                      args.x_size,
diff --git a/python/dgl/data/sbm.py b/python/dgl/data/sbm.py
index 9679b0961ac0..2a753b724c62 100644
--- a/python/dgl/data/sbm.py
+++ b/python/dgl/data/sbm.py
@@ -1,3 +1,4 @@
+"""Dataset for stochastic block model."""
 import math
 import os
 import pickle
diff --git a/python/dgl/data/tree.py b/python/dgl/data/tree.py
index 2043bc27a247..7af98da4e287 100644
--- a/python/dgl/data/tree.py
+++ b/python/dgl/data/tree.py
@@ -6,8 +6,6 @@
 from __future__ import absolute_import
 
 from collections import namedtuple, OrderedDict
-from nltk.tree import Tree
-from nltk.corpus.reader import BracketParseCorpusReader
 
 import networkx as nx
 import numpy as np
@@ -16,6 +14,8 @@
 import dgl.backend as F
 from dgl.data.utils import download, extract_archive, get_download_dir, _get_dgl_url
 
+__all__ = ['SSTBatch', 'SST']
+
 _urls = {
     'sst' : 'dataset/sst.zip',
 }
@@ -63,6 +63,7 @@ def __init__(self, mode='train', vocab_file=None):
         print('Dataset creation finished. #Trees:', len(self.trees))
 
     def _load(self):
+        from nltk.corpus.reader import BracketParseCorpusReader
         # load vocab file
         self.vocab = OrderedDict()
         with open(self.vocab_file, encoding='utf-8') as vf:
diff --git a/python/dgl/data/utils.py b/python/dgl/data/utils.py
index 96d126ece44b..e6bdb68d0a41 100644
--- a/python/dgl/data/utils.py
+++ b/python/dgl/data/utils.py
@@ -13,6 +13,8 @@ class requests_failed_to_import(object):
         pass
     requests = requests_failed_to_import
 
+__all__ = ['download', 'check_sha1', 'extract_archive', 'get_download_dir']
+
 def _get_dgl_url(file_url):
     """Get DGL online url for download."""
     dgl_repo_url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/'
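The four data-module diffs above replace the `dgl.data` namespace import with a direct `from dgl.data.tree import SST` and push the nltk import down into `SST._load()`, so nltk is only required when a dataset is actually parsed. Below is a minimal sketch of the new import path, mirroring the DataLoader calls in train.py above; the 'dev' split and CPU device are illustrative choices, not part of the patch.

    import torch as th
    from torch.utils.data import DataLoader
    from dgl.data.tree import SST   # no nltk needed until SST._load() runs

    device = th.device('cpu')       # illustrative; train.py picks CPU/GPU from args
    devset = SST(mode='dev')
    dev_loader = DataLoader(dataset=devset,
                            batch_size=100,
                            collate_fn=SST.batcher(device),
                            shuffle=False,
                            num_workers=0)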
diff --git a/python/dgl/frame.py b/python/dgl/frame.py
index 9647965502c0..688cf674f2df 100644
--- a/python/dgl/frame.py
+++ b/python/dgl/frame.py
@@ -765,7 +765,7 @@ def delete_rows(self, query):
         if isinstance(query, slice):
             query = range(query.start, query.stop)
         else:
-            query = query.tolist()
+            query = query.tonumpy()
 
         if isinstance(self._index_data, slice):
             self._index_data = range(self._index_data.start, self._index_data.stop)
@@ -861,51 +861,3 @@ def frame_like(other, num_rows):
     # now supports non-exist columns.
     newf._initializers = other._initializers
     return newf
-
-def merge_frames(frames, indices, max_index, reduce_func):
-    """Merge a list of frames.
-
-    The result frame contains `max_index` number of rows. For each frame in
-    the given list, its row is merged as follows:
-
-        merged[indices[i][row]] += frames[i][row]
-
-    Parameters
-    ----------
-    frames : iterator of dgl.frame.FrameRef
-        A list of frames to be merged.
-    indices : iterator of dgl.utils.Index
-        The indices of the frame rows.
-    reduce_func : str
-        The reduce function (only 'sum' is supported currently)
-
-    Returns
-    -------
-    merged : FrameRef
-        The merged frame.
-    """
-    # TODO(minjie)
-    assert False, 'Buggy code, disabled for now.'
-    assert reduce_func == 'sum'
-    assert len(frames) > 0
-    schemes = frames[0].schemes
-    # create an adj to merge
-    # row index is equal to the concatenation of all the indices.
-    row = sum([idx.tolist() for idx in indices], [])
-    col = list(range(len(row)))
-    n = max_index
-    m = len(row)
-    row = F.unsqueeze(F.tensor(row, dtype=F.int64), 0)
-    col = F.unsqueeze(F.tensor(col, dtype=F.int64), 0)
-    idx = F.cat([row, col], dim=0)
-    dat = F.ones((m,))
-    adjmat = F.sparse_tensor(idx, dat, [n, m])
-    ctx_adjmat = utils.CtxCachedObject(lambda ctx: F.to_context(adjmat, ctx))
-    merged = {}
-    for key in schemes:
-        # the rhs of the spmv is the concatenation of all the frame columns
-        feats = F.pack([fr[key] for fr in frames])
-        merged_feats = F.spmm(ctx_adjmat.get(F.get_context(feats)), feats)
-        merged[key] = merged_feats
-    merged = FrameRef(Frame(merged))
-    return merged
diff --git a/python/dgl/graph.py b/python/dgl/graph.py
index 5d86de831ddc..7826e78057fe 100644
--- a/python/dgl/graph.py
+++ b/python/dgl/graph.py
@@ -8,7 +8,7 @@
 import dgl
 from .base import ALL, is_all, DGLError, dgl_warning
 from . import backend as F
-from .frame import FrameRef, Frame, merge_frames
+from .frame import FrameRef, Frame
 from .graph_index import GraphIndex, create_graph_index
 from .runtime import ir, scheduler, Runtime
 from . import utils
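The `merge_frames` removal above is safe because the function was already asserted off ("Buggy code, disabled for now.") and graph.py was its only importer. The `delete_rows` change means a deletion query arriving as a `dgl.utils.Index` is now normalized with `tonumpy()` rather than the removed `tolist()`; raw tensors must therefore be wrapped with `toindex` first, as the test_frame.py change later in this patch does. A minimal sketch of that contract, with an illustrative 4-row frame; the Frame/FrameRef construction mirrors their use elsewhere in this patch:

    import torch as th
    from dgl.frame import Frame, FrameRef
    from dgl.utils import toindex

    f = FrameRef(Frame({'h': th.randn(4, 2)}))   # illustrative 4-row frame
    del f[toindex(th.tensor([2, 3]))]            # Index -> tonumpy() inside delete_rows
    assert f.num_rows == 2                       # deletion reflected on the ref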
diff --git a/python/dgl/runtime/degree_bucketing.py b/python/dgl/runtime/degree_bucketing.py
index 9d0534f8c3ca..cd002e0b8556 100644
--- a/python/dgl/runtime/degree_bucketing.py
+++ b/python/dgl/runtime/degree_bucketing.py
@@ -168,7 +168,7 @@ def _process_buckets(buckets):
     msg_ids = [utils.toindex(msg_id) for msg_id in msg_ids]
 
     # handle zero deg
-    degs = degs.tolist()
+    degs = degs.tonumpy()
     if degs[-1] == 0:
         degs = degs[:-1]
         zero_deg_nodes = dsts[-1]
diff --git a/python/dgl/traversal.py b/python/dgl/traversal.py
index 777093883f0d..41e9575d741f 100644
--- a/python/dgl/traversal.py
+++ b/python/dgl/traversal.py
@@ -44,7 +44,7 @@ def bfs_nodes_generator(graph, source, reversed=False):
     ret = _CAPI_DGLBFSNodes(ghandle, source, reversed)
     all_nodes = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     node_frontiers = F.split(all_nodes, sections, dim=0)
     return node_frontiers
 
@@ -84,7 +84,7 @@ def bfs_edges_generator(graph, source, reversed=False):
     ret = _CAPI_DGLBFSEdges(ghandle, source, reversed)
     all_edges = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     edge_frontiers = F.split(all_edges, sections, dim=0)
     return edge_frontiers
 
@@ -120,7 +120,7 @@ def topological_nodes_generator(graph, reversed=False):
     ret = _CAPI_DGLTopologicalNodes(ghandle, reversed)
     all_nodes = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     return F.split(all_nodes, sections, dim=0)
 
 def dfs_edges_generator(graph, source, reversed=False):
@@ -165,7 +165,7 @@ def dfs_edges_generator(graph, source, reversed=False):
     ret = _CAPI_DGLDFSEdges(ghandle, source, reversed)
     all_edges = utils.toindex(ret(0)).tousertensor()
     # TODO(minjie): how to support directly creating python list
-    sections = utils.toindex(ret(1)).tousertensor().tolist()
+    sections = utils.toindex(ret(1)).tonumpy().tolist()
     return F.split(all_edges, sections, dim=0)
 
 def dfs_labeled_edges_generator(
@@ -244,11 +244,11 @@ def dfs_labeled_edges_generator(
     # TODO(minjie): how to support directly creating python list
     if return_labels:
         all_labels = utils.toindex(ret(1)).tousertensor()
-        sections = utils.toindex(ret(2)).tousertensor().tolist()
+        sections = utils.toindex(ret(2)).tonumpy().tolist()
         return (F.split(all_edges, sections, dim=0),
                 F.split(all_labels, sections, dim=0))
     else:
-        sections = utils.toindex(ret(1)).tousertensor().tolist()
+        sections = utils.toindex(ret(1)).tonumpy().tolist()
         return F.split(all_edges, sections, dim=0)
 
 _init_api("dgl.traversal")
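The traversal changes are behavior-preserving: `sections` is now computed via `Index.tonumpy().tolist()` instead of a round-trip through a user tensor. A usage sketch, assuming the generators are re-exported at the dgl top level as the tutorials use them; the tiny graph is illustrative:

    import dgl

    g = dgl.DGLGraph()
    g.add_nodes(4)
    g.add_edges([0, 0, 2], [1, 2, 3])        # edges 0->1, 0->2, 2->3
    frontiers = dgl.bfs_nodes_generator(g, 0)
    print([f.tolist() for f in frontiers])   # expected: [[0], [1, 2], [3]]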
diff --git a/python/dgl/utils.py b/python/dgl/utils.py
index 10630655952e..00063ab6d763 100644
--- a/python/dgl/utils.py
+++ b/python/dgl/utils.py
@@ -15,24 +15,24 @@ def __init__(self, data):
         self._initialize_data(data)
 
     def _initialize_data(self, data):
-        self._list_data = None  # a numpy type data or a slice
+        self._pydata = None  # a numpy type data or a slice
         self._user_tensor_data = dict()  # dictionary of user tensors
         self._dgl_tensor_data = None  # a dgl ndarray
         self._dispatch(data)
 
     def __iter__(self):
-        for i in self.tolist():
+        for i in self.tonumpy():
             yield int(i)
 
     def __len__(self):
-        if self._list_data is not None and isinstance(self._list_data, slice):
-            slc = self._list_data
+        if self._pydata is not None and isinstance(self._pydata, slice):
+            slc = self._pydata
             if slc.step is None:
                 return slc.stop - slc.start
             else:
                 return (slc.stop - slc.start) // slc.step
-        elif self._list_data is not None:
-            return len(self._list_data)
+        elif self._pydata is not None:
+            return len(self._pydata)
         elif len(self._user_tensor_data) > 0:
             data = next(iter(self._user_tensor_data.values()))
             return len(data)
@@ -40,7 +40,7 @@ def __len__(self):
             return len(self._dgl_tensor_data)
 
     def __getitem__(self, i):
-        return int(self.tolist()[i])
+        return int(self.tonumpy()[i])
 
     def _dispatch(self, data):
         """Store data based on its type."""
@@ -59,35 +59,35 @@ def _dispatch(self, data):
                 raise DGLError('Index data must be 1D int64 vector, but got: %s' % str(data))
             self._dgl_tensor_data = data
         elif isinstance(data, slice):
-            # save it in the _list_data temporarily; materialize it if `tolist` is called
-            self._list_data = data
+            # save it in the _pydata temporarily; materialize it if `tonumpy` is called
+            self._pydata = data
         else:
             try:
-                self._list_data = np.array([int(data)]).astype(np.int64)
+                self._pydata = np.array([int(data)]).astype(np.int64)
             except:
                 try:
                     data = np.array(data).astype(np.int64)
                     if data.ndim != 1:
                         raise DGLError('Index data must be 1D int64 vector,'
                                        ' but got: %s' % str(data))
-                    self._list_data = data
+                    self._pydata = data
                 except:
                     raise DGLError('Error index data: %s' % str(data))
-            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._list_data)
+            self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._pydata)
 
-    def tolist(self):
-        """Convert to a python-list compatible object."""
-        if self._list_data is None:
+    def tonumpy(self):
+        """Convert to a numpy ndarray."""
+        if self._pydata is None:
             if self._dgl_tensor_data is not None:
-                self._list_data = self._dgl_tensor_data.asnumpy()
+                self._pydata = self._dgl_tensor_data.asnumpy()
             else:
                 data = self.tousertensor()
-                self._list_data = F.zerocopy_to_numpy(data)
-        elif isinstance(self._list_data, slice):
+                self._pydata = F.zerocopy_to_numpy(data)
+        elif isinstance(self._pydata, slice):
             # convert it to numpy array
-            slc = self._list_data
-            self._list_data = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
-        return self._list_data
+            slc = self._pydata
+            self._pydata = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
+        return self._pydata
 
     def tousertensor(self, ctx=None):
         """Convert to user tensor (defined in `backend`)."""
@@ -100,7 +100,7 @@ def tousertensor(self, ctx=None):
                 self._user_tensor_data[F.cpu()] = F.zerocopy_from_dlpack(dl)
             else:
                 # zero copy from numpy array
-                self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tolist())
+                self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tonumpy())
         if ctx not in self._user_tensor_data:
             # copy from cpu to another device
             data = next(iter(self._user_tensor_data.values()))
@@ -117,8 +117,8 @@ def todgltensor(self):
         return self._dgl_tensor_data
 
     def is_slice(self, start, stop, step=None):
-        return (isinstance(self._list_data, slice)
-                and self._list_data == slice(start, stop, step))
+        return (isinstance(self._pydata, slice)
+                and self._pydata == slice(start, stop, step))
 
     def __getstate__(self):
         return self.tousertensor()
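The rename from `tolist` to `tonumpy` makes the return type honest: the method always produced a numpy array (or materialized a slice into one), never a Python list. A sketch of the new contract; `toindex` is the public constructor used throughout the tests in this patch:

    import numpy as np
    from dgl.utils import toindex

    idx = toindex([0, 1, 2])
    arr = idx.tonumpy()              # always an int64 numpy ndarray
    assert isinstance(arr, np.ndarray) and arr.dtype == np.int64
    lst = arr.tolist()               # explicit extra step when a real list is needed
    assert lst == [0, 1, 2]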
diff --git a/tests/graph_index/test_basics.py b/tests/graph_index/test_basics.py
index 53bb08c2adc2..94e3e0ba95ab 100644
--- a/tests/graph_index/test_basics.py
+++ b/tests/graph_index/test_basics.py
@@ -11,14 +11,14 @@ def test_edge_id():
     gi.add_nodes(4)
 
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 1
     assert eid[0] == 0
 
     assert gi.is_multigraph()
 
     # multiedges
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 2
     assert eid[0] == 0
     assert eid[1] == 1
@@ -60,7 +60,7 @@ def test_edge_id():
     gi.add_nodes(4)
 
     gi.add_edge(0, 1)
-    eid = gi.edge_id(0, 1).tolist()
+    eid = gi.edge_id(0, 1).tonumpy()
     assert len(eid) == 1
     assert eid[0] == 0
diff --git a/tests/mxnet/test_graph_index.py b/tests/mxnet/test_graph_index.py
index fce9faa67775..959d992a2e92 100644
--- a/tests/mxnet/test_graph_index.py
+++ b/tests/mxnet/test_graph_index.py
@@ -62,13 +62,13 @@ def check_basics(g, ig):
 
     for u in randv.asnumpy():
         for v in randv.asnumpy():
-            if len(g.edge_id(u, v).tolist()) == 1:
-                assert g.edge_id(u, v).tolist() == ig.edge_id(u, v).tolist()
+            if len(g.edge_id(u, v)) == 1:
+                assert g.edge_id(u, v).tonumpy() == ig.edge_id(u, v).tonumpy()
             assert g.has_edge_between(u, v) == ig.has_edge_between(u, v)
     randv = utils.toindex(randv)
-    ids = g.edge_ids(randv, randv)[2].tolist()
-    assert sum(ig.edge_ids(randv, randv)[2].tolist() == ids) == len(ids)
-    assert sum(g.has_edges_between(randv, randv).tolist() == ig.has_edges_between(randv, randv).tolist()) == len(randv)
+    ids = g.edge_ids(randv, randv)[2].tonumpy()
+    assert sum(ig.edge_ids(randv, randv)[2].tonumpy() == ids) == len(ids)
+    assert sum(g.has_edges_between(randv, randv).tonumpy() == ig.has_edges_between(randv, randv).tonumpy()) == len(randv)
 
 def test_basics():
diff --git a/tests/pytorch/test_frame.py b/tests/pytorch/test_frame.py
index 718f824f7b99..e8f27ca6b347 100644
--- a/tests/pytorch/test_frame.py
+++ b/tests/pytorch/test_frame.py
@@ -208,7 +208,7 @@ def test_row3():
     assert f.is_contiguous()
     assert f.is_span_whole_column()
     assert f.num_rows == N
-    del f[th.tensor([2, 3])]
+    del f[toindex(th.tensor([2, 3]))]
     assert not f.is_contiguous()
     assert not f.is_span_whole_column()
     # delete is lazy: only reflect on the ref while the
diff --git a/tests/pytorch/test_index.py b/tests/pytorch/test_index.py
index 1290ded307fa..bf52f62e6209 100644
--- a/tests/pytorch/test_index.py
+++ b/tests/pytorch/test_index.py
@@ -49,7 +49,7 @@ def test_index():
     # from np data
     data = np.ones((10,), dtype=np.int64) * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
@@ -59,7 +59,7 @@ def test_index():
     # from list
     data = [10] * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
@@ -69,7 +69,7 @@ def test_index():
     # from torch
     data = th.ones((10,), dtype=th.int64) * 10
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
@@ -79,7 +79,7 @@ def test_index():
     # from dgl.NDArray
     data = dgl.ndarray.array(np.ones((10,), dtype=np.int64) * 10)
     idx = toindex(data)
-    y1 = idx.tolist()
+    y1 = idx.tonumpy()
     y2 = idx.tousertensor().numpy()
     y3 = idx.todgltensor().asnumpy()
     assert np.allclose(ans, y1)
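The mxnet test now calls `len()` directly on the `Index` returned by `edge_id`, relying on the updated `__len__` rather than materializing a list first. A minimal sketch against the low-level graph index; the `multigraph=True` flag is an assumption about `create_graph_index`'s signature in this version, inferred from the multigraph behavior these tests exercise:

    from dgl.graph_index import create_graph_index

    gi = create_graph_index(multigraph=True)   # assumed flag, per the tests above
    gi.add_nodes(2)
    gi.add_edge(0, 1)
    gi.add_edge(0, 1)
    eid = gi.edge_id(0, 1)       # a dgl.utils.Index
    assert len(eid) == 2         # no tolist()/tonumpy() round-trip needed
    assert eid.tonumpy()[1] == 1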
diff --git a/tutorials/models/2_small_graph/3_tree-lstm.py b/tutorials/models/2_small_graph/3_tree-lstm.py
index 0e502939e59f..3030b9c7de45 100644
--- a/tutorials/models/2_small_graph/3_tree-lstm.py
+++ b/tutorials/models/2_small_graph/3_tree-lstm.py
@@ -46,13 +46,13 @@
 #
 
 import dgl
-import dgl.data as data
+from dgl.data.tree import SST
 
 # Each sample in the dataset is a constituency tree. The leaf nodes
 # represent words. The word is an int value stored in the "x" field.
 # The non-leaf nodes have a special word PAD_WORD. The sentiment
 # label is stored in the "y" feature field.
-trainset = data.SST(mode='tiny')  # the "tiny" set has only 5 trees
+trainset = SST(mode='tiny')  # the "tiny" set has only 5 trees
 tiny_sst = trainset.trees
 num_vocabs = trainset.num_vocabs
 num_classes = trainset.num_classes
@@ -337,7 +337,7 @@ def forward(self, batch, h, c):
 
 train_loader = DataLoader(dataset=tiny_sst,
                           batch_size=5,
-                          collate_fn=data.SST.batcher(device),
+                          collate_fn=SST.batcher(device),
                           shuffle=False,
                           num_workers=0)
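With `SSTBatch` now exported from `dgl.data.tree` via the `__all__` added in tree.py, consuming the batches that `SST.batcher(device)` yields looks like the sketch below. The field names (`graph`, `mask`, `wordid`, `label`) follow the `SSTBatch` namedtuple in tree.py and are the part to double-check if the dataset layout changes:

    from dgl.data.tree import SSTBatch   # exported by the __all__ added in tree.py

    for step, batch in enumerate(train_loader):   # train_loader from the tutorial above
        assert isinstance(batch, SSTBatch)
        g = batch.graph                           # batched constituency trees
        print(step, g.number_of_nodes(), batch.wordid.shape, batch.label.shape)
        break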