
Commit 21255b6
[Bugfix] tolist and dependencies in dgl.data (dmlc#239)
* change Index.tolist -> Index.tonumpy; fix bug in traversal; remove dependencies in data

* fix import

* fix __all__ and some docstrings
jermainewang authored Dec 5, 2018
1 parent eafcb7e commit 21255b6
Showing 14 changed files with 62 additions and 106 deletions.
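
The core API change is the rename of Index.tolist to Index.tonumpy: the method returns a NumPy ndarray, so the new name matches what it actually does, and call sites that still need a genuine Python list now chain .tolist() on the returned array. A minimal, hedged sketch of the call-site pattern (toindex and tonumpy are taken from the diff below; everything else is illustrative):

```python
import numpy as np
from dgl import utils  # utils.toindex is the constructor used throughout this diff

idx = utils.toindex([0, 2, 5])   # wrap plain Python data in a dgl Index
arr = idx.tonumpy()              # np.int64 ndarray; replaces the old idx.tolist()
sections = arr.tolist()          # a real Python list, where one is still required
assert isinstance(arr, np.ndarray) and sections == [0, 2, 5]
```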
14 changes: 7 additions & 7 deletions examples/pytorch/tree_lstm/train.py
@@ -8,7 +8,7 @@
from torch.utils.data import DataLoader

import dgl
import dgl.data as data
from dgl.data.tree import SST

from tree_lstm import TreeLSTM

@@ -25,22 +25,22 @@ def main(args):
if cuda:
th.cuda.set_device(args.gpu)

trainset = data.SST()
trainset = SST()
train_loader = DataLoader(dataset=trainset,
batch_size=args.batch_size,
collate_fn=data.SST.batcher(device),
collate_fn=SST.batcher(device),
shuffle=True,
num_workers=0)
devset = data.SST(mode='dev')
devset = SST(mode='dev')
dev_loader = DataLoader(dataset=devset,
batch_size=100,
collate_fn=data.SST.batcher(device),
collate_fn=SST.batcher(device),
shuffle=False,
num_workers=0)

testset = data.SST(mode='test')
testset = SST(mode='test')
test_loader = DataLoader(dataset=testset,
batch_size=100, collate_fn=data.SST.batcher(device), shuffle=False, num_workers=0)
batch_size=100, collate_fn=SST.batcher(device), shuffle=False, num_workers=0)

model = TreeLSTM(trainset.num_vocabs,
args.x_size,
1 change: 1 addition & 0 deletions python/dgl/data/sbm.py
@@ -1,3 +1,4 @@
"""Dataset for stochastic block model."""
import math
import os
import pickle
5 changes: 3 additions & 2 deletions python/dgl/data/tree.py
@@ -6,8 +6,6 @@
from __future__ import absolute_import

from collections import namedtuple, OrderedDict
from nltk.tree import Tree
from nltk.corpus.reader import BracketParseCorpusReader
import networkx as nx

import numpy as np
@@ -16,6 +14,8 @@
import dgl.backend as F
from dgl.data.utils import download, extract_archive, get_download_dir, _get_dgl_url

__all__ = ['SSTBatch', 'SST']

_urls = {
'sst' : 'dataset/sst.zip',
}
@@ -63,6 +63,7 @@ def __init__(self, mode='train', vocab_file=None):
print('Dataset creation finished. #Trees:', len(self.trees))

def _load(self):
from nltk.corpus.reader import BracketParseCorpusReader
# load vocab file
self.vocab = OrderedDict()
with open(self.vocab_file, encoding='utf-8') as vf:
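
Moving the nltk import inside _load() removes the hard dependency: importing dgl.data.tree (or dgl.data) no longer requires nltk, which is only needed once an SST dataset is actually built. A self-contained sketch of that deferred-import pattern (generic code, not the actual _load body):

```python
class LazyCorpusLoader:
    """Illustrative only: defer an optional dependency until it is really used."""

    def load(self, root):
        # nltk is imported here rather than at module import time, so users who
        # never call load() do not need nltk installed at all.
        try:
            from nltk.corpus.reader import BracketParseCorpusReader
        except ImportError as err:
            raise ImportError("nltk is required to read SST trees: pip install nltk") from err
        return BracketParseCorpusReader(root, ['train.txt', 'dev.txt', 'test.txt'])
```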
2 changes: 2 additions & 0 deletions python/dgl/data/utils.py
@@ -13,6 +13,8 @@ class requests_failed_to_import(object):
pass
requests = requests_failed_to_import

__all__ = ['download', 'check_sha1', 'extract_archive', 'get_download_dir']

def _get_dgl_url(file_url):
"""Get DGL online url for download."""
dgl_repo_url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/'
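
Both data modules now declare __all__, which pins down what a star import exposes. A quick generic reminder of the effect (not DGL code):

```python
# some_module.py
__all__ = ['download']                  # `from some_module import *` exports only this name

def download(url):
    return url

def _get_internal_url(path):            # underscore-prefixed helper stays private either way
    return 'https://example.com/' + path
```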
50 changes: 1 addition & 49 deletions python/dgl/frame.py
@@ -765,7 +765,7 @@ def delete_rows(self, query):
if isinstance(query, slice):
query = range(query.start, query.stop)
else:
query = query.tolist()
query = query.tonumpy()

if isinstance(self._index_data, slice):
self._index_data = range(self._index_data.start, self._index_data.stop)
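
With the rename, delete_rows normalizes its query on both paths: a slice becomes a range, anything else goes through the Index's tonumpy(). A hedged, stripped-down sketch of that normalization (the real method then rebuilds self._index_data from it):

```python
import numpy as np

def _normalize_query(query):
    """Illustrative only: mirror the two branches shown in the hunk above."""
    if isinstance(query, slice):
        return range(query.start, query.stop)   # step-less slice, as in the diff
    return query.tonumpy()                      # dgl Index -> np.int64 ndarray
```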
@@ -861,51 +861,3 @@ def frame_like(other, num_rows):
# now supports non-exist columns.
newf._initializers = other._initializers
return newf

def merge_frames(frames, indices, max_index, reduce_func):
"""Merge a list of frames.
The result frame contains `max_index` number of rows. For each frame in
the given list, its row is merged as follows:
merged[indices[i][row]] += frames[i][row]
Parameters
----------
frames : iterator of dgl.frame.FrameRef
A list of frames to be merged.
indices : iterator of dgl.utils.Index
The indices of the frame rows.
reduce_func : str
The reduce function (only 'sum' is supported currently)
Returns
-------
merged : FrameRef
The merged frame.
"""
# TODO(minjie)
assert False, 'Buggy code, disabled for now.'
assert reduce_func == 'sum'
assert len(frames) > 0
schemes = frames[0].schemes
# create an adj to merge
# row index is equal to the concatenation of all the indices.
row = sum([idx.tolist() for idx in indices], [])
col = list(range(len(row)))
n = max_index
m = len(row)
row = F.unsqueeze(F.tensor(row, dtype=F.int64), 0)
col = F.unsqueeze(F.tensor(col, dtype=F.int64), 0)
idx = F.cat([row, col], dim=0)
dat = F.ones((m,))
adjmat = F.sparse_tensor(idx, dat, [n, m])
ctx_adjmat = utils.CtxCachedObject(lambda ctx: F.to_context(adjmat, ctx))
merged = {}
for key in schemes:
# the rhs of the spmv is the concatenation of all the frame columns
feats = F.pack([fr[key] for fr in frames])
merged_feats = F.spmm(ctx_adjmat.get(F.get_context(feats)), feats)
merged[key] = merged_feats
merged = FrameRef(Frame(merged))
return merged
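
The removed merge_frames was already disabled (assert False), but its docstring explains the intent: scatter-add rows from several frames into max_index output rows via one sparse matrix multiply. A hedged NumPy/SciPy sketch of that idea (not DGL code; the original used the backend's spmm and sparse_tensor):

```python
import numpy as np
import scipy.sparse as sp

# merged[indices[i][row]] += frames[i][row], expressed as a single sparse matmul.
rows  = np.array([0, 2, 0])                       # target row for each stacked source row
feats = np.array([[1., 1.], [2., 2.], [3., 3.]])  # one column concatenated across frames
m, n  = len(rows), 3                              # n plays the role of max_index
adj   = sp.coo_matrix((np.ones(m), (rows, np.arange(m))), shape=(n, m))
merged = adj @ feats
# merged[0] == [4., 4.] (rows 0 and 2 summed), merged[1] == [0., 0.], merged[2] == [2., 2.]
```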
2 changes: 1 addition & 1 deletion python/dgl/graph.py
@@ -8,7 +8,7 @@
import dgl
from .base import ALL, is_all, DGLError, dgl_warning
from . import backend as F
from .frame import FrameRef, Frame, merge_frames
from .frame import FrameRef, Frame
from .graph_index import GraphIndex, create_graph_index
from .runtime import ir, scheduler, Runtime
from . import utils
2 changes: 1 addition & 1 deletion python/dgl/runtime/degree_bucketing.py
@@ -168,7 +168,7 @@ def _process_buckets(buckets):
msg_ids = [utils.toindex(msg_id) for msg_id in msg_ids]

# handle zero deg
degs = degs.tolist()
degs = degs.tonumpy()
if degs[-1] == 0:
degs = degs[:-1]
zero_deg_nodes = dsts[-1]
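
_process_buckets groups destination nodes by in-degree; the snippet above checks whether the final bucket has degree zero and, if so, peels it off so those nodes skip the reduce phase. A hedged sketch with made-up bucket data:

```python
import numpy as np

degs = np.array([3, 2, 1, 0])          # per-bucket degrees; a zero-degree bucket, if any, sits last
dsts = [[0, 1], [2], [3], [4, 5]]      # destination nodes per bucket (illustrative values)

zero_deg_nodes = None
if degs[-1] == 0:
    zero_deg_nodes = dsts[-1]          # nodes that received no messages at all
    degs, dsts = degs[:-1], dsts[:-1]  # remaining buckets go through the normal reduce path
```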
12 changes: 6 additions & 6 deletions python/dgl/traversal.py
@@ -44,7 +44,7 @@ def bfs_nodes_generator(graph, source, reversed=False):
ret = _CAPI_DGLBFSNodes(ghandle, source, reversed)
all_nodes = utils.toindex(ret(0)).tousertensor()
# TODO(minjie): how to support directly creating python list
sections = utils.toindex(ret(1)).tousertensor().tolist()
sections = utils.toindex(ret(1)).tonumpy().tolist()
node_frontiers = F.split(all_nodes, sections, dim=0)
return node_frontiers

@@ -84,7 +84,7 @@ def bfs_edges_generator(graph, source, reversed=False):
ret = _CAPI_DGLBFSEdges(ghandle, source, reversed)
all_edges = utils.toindex(ret(0)).tousertensor()
# TODO(minjie): how to support directly creating python list
sections = utils.toindex(ret(1)).tousertensor().tolist()
sections = utils.toindex(ret(1)).tonumpy().tolist()
edge_frontiers = F.split(all_edges, sections, dim=0)
return edge_frontiers

@@ -120,7 +120,7 @@ def topological_nodes_generator(graph, reversed=False):
ret = _CAPI_DGLTopologicalNodes(ghandle, reversed)
all_nodes = utils.toindex(ret(0)).tousertensor()
# TODO(minjie): how to support directly creating python list
sections = utils.toindex(ret(1)).tousertensor().tolist()
sections = utils.toindex(ret(1)).tonumpy().tolist()
return F.split(all_nodes, sections, dim=0)

def dfs_edges_generator(graph, source, reversed=False):
@@ -165,7 +165,7 @@ def dfs_edges_generator(graph, source, reversed=False):
ret = _CAPI_DGLDFSEdges(ghandle, source, reversed)
all_edges = utils.toindex(ret(0)).tousertensor()
# TODO(minjie): how to support directly creating python list
sections = utils.toindex(ret(1)).tousertensor().tolist()
sections = utils.toindex(ret(1)).tonumpy().tolist()
return F.split(all_edges, sections, dim=0)

def dfs_labeled_edges_generator(
@@ -244,11 +244,11 @@ def dfs_labeled_edges_generator(
# TODO(minjie): how to support directly creating python list
if return_labels:
all_labels = utils.toindex(ret(1)).tousertensor()
sections = utils.toindex(ret(2)).tousertensor().tolist()
sections = utils.toindex(ret(2)).tonumpy().tolist()
return (F.split(all_edges, sections, dim=0),
F.split(all_labels, sections, dim=0))
else:
sections = utils.toindex(ret(1)).tousertensor().tolist()
sections = utils.toindex(ret(1)).tonumpy().tolist()
return F.split(all_edges, sections, dim=0)

_init_api("dgl.traversal")
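
Every traversal generator follows the same pattern: the C API returns one flat tensor of nodes or edges plus per-frontier section sizes, the sizes are converted to a plain Python list via .tonumpy().tolist(), and F.split carves the flat tensor into frontiers. A hedged PyTorch illustration of that split step (DGL's F.split dispatches to whichever backend is active):

```python
import torch

all_nodes = torch.tensor([0, 1, 2, 3, 4, 5])
sections  = [1, 2, 3]                          # frontier sizes, as a plain Python list
frontiers = torch.split(all_nodes, sections, dim=0)
# -> (tensor([0]), tensor([1, 2]), tensor([3, 4, 5])): one tensor per BFS/DFS frontier
```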
48 changes: 24 additions & 24 deletions python/dgl/utils.py
@@ -15,32 +15,32 @@ def __init__(self, data):
self._initialize_data(data)

def _initialize_data(self, data):
self._list_data = None # a numpy type data or a slice
self._pydata = None # a numpy type data or a slice
self._user_tensor_data = dict() # dictionary of user tensors
self._dgl_tensor_data = None # a dgl ndarray
self._dispatch(data)

def __iter__(self):
for i in self.tolist():
for i in self.tonumpy():
yield int(i)

def __len__(self):
if self._list_data is not None and isinstance(self._list_data, slice):
slc = self._list_data
if self._pydata is not None and isinstance(self._pydata, slice):
slc = self._pydata
if slc.step is None:
return slc.stop - slc.start
else:
return (slc.stop - slc.start) // slc.step
elif self._list_data is not None:
return len(self._list_data)
elif self._pydata is not None:
return len(self._pydata)
elif len(self._user_tensor_data) > 0:
data = next(iter(self._user_tensor_data.values()))
return len(data)
else:
return len(self._dgl_tensor_data)

def __getitem__(self, i):
return int(self.tolist()[i])
return int(self.tonumpy()[i])

def _dispatch(self, data):
"""Store data based on its type."""
@@ -59,35 +59,35 @@ def _dispatch(self, data):
raise DGLError('Index data must be 1D int64 vector, but got: %s' % str(data))
self._dgl_tensor_data = data
elif isinstance(data, slice):
# save it in the _list_data temporarily; materialize it if `tolist` is called
self._list_data = data
# save it in the _pydata temporarily; materialize it if `tonumpy` is called
self._pydata = data
else:
try:
self._list_data = np.array([int(data)]).astype(np.int64)
self._pydata = np.array([int(data)]).astype(np.int64)
except:
try:
data = np.array(data).astype(np.int64)
if data.ndim != 1:
raise DGLError('Index data must be 1D int64 vector,'
' but got: %s' % str(data))
self._list_data = data
self._pydata = data
except:
raise DGLError('Error index data: %s' % str(data))
self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._list_data)
self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self._pydata)

def tolist(self):
"""Convert to a python-list compatible object."""
if self._list_data is None:
def tonumpy(self):
"""Convert to a numpy ndarray."""
if self._pydata is None:
if self._dgl_tensor_data is not None:
self._list_data = self._dgl_tensor_data.asnumpy()
self._pydata = self._dgl_tensor_data.asnumpy()
else:
data = self.tousertensor()
self._list_data = F.zerocopy_to_numpy(data)
elif isinstance(self._list_data, slice):
self._pydata = F.zerocopy_to_numpy(data)
elif isinstance(self._pydata, slice):
# convert it to numpy array
slc = self._list_data
self._list_data = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
return self._list_data
slc = self._pydata
self._pydata = np.arange(slc.start, slc.stop, slc.step).astype(np.int64)
return self._pydata

def tousertensor(self, ctx=None):
"""Convert to user tensor (defined in `backend`)."""
@@ -100,7 +100,7 @@ def tousertensor(self, ctx=None):
self._user_tensor_data[F.cpu()] = F.zerocopy_from_dlpack(dl)
else:
# zero copy from numpy array
self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tolist())
self._user_tensor_data[F.cpu()] = F.zerocopy_from_numpy(self.tonumpy())
if ctx not in self._user_tensor_data:
# copy from cpu to another device
data = next(iter(self._user_tensor_data.values()))
@@ -117,8 +117,8 @@ def todgltensor(self):
return self._dgl_tensor_data

def is_slice(self, start, stop, step=None):
return (isinstance(self._list_data, slice)
and self._list_data == slice(start, stop, step))
return (isinstance(self._pydata, slice)
and self._pydata == slice(start, stop, step))

def __getstate__(self):
return self.tousertensor()
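
The renamed field _pydata holds either a slice (cheap to keep around) or an int64 ndarray; tonumpy() materializes the slice on first use and caches the result. A trimmed-down sketch of that lazy conversion (simplified; the real Index also caches backend tensors and DGL ndarrays):

```python
import numpy as np

class TinyIndex:
    """Illustrative only: lazy slice -> ndarray materialization, as in Index.tonumpy()."""

    def __init__(self, data):
        self._pydata = data                          # slice or np.int64 ndarray

    def tonumpy(self):
        if isinstance(self._pydata, slice):
            s = self._pydata
            step = 1 if s.step is None else s.step
            self._pydata = np.arange(s.start, s.stop, step).astype(np.int64)
        return self._pydata

idx = TinyIndex(slice(0, 6, None))
print(idx.tonumpy())                                  # [0 1 2 3 4 5], computed once and cached
```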
6 changes: 3 additions & 3 deletions tests/graph_index/test_basics.py
@@ -11,14 +11,14 @@ def test_edge_id():

gi.add_nodes(4)
gi.add_edge(0, 1)
eid = gi.edge_id(0, 1).tolist()
eid = gi.edge_id(0, 1).tonumpy()
assert len(eid) == 1
assert eid[0] == 0
assert gi.is_multigraph()

# multiedges
gi.add_edge(0, 1)
eid = gi.edge_id(0, 1).tolist()
eid = gi.edge_id(0, 1).tonumpy()
assert len(eid) == 2
assert eid[0] == 0
assert eid[1] == 1
@@ -60,7 +60,7 @@ def test_edge_id():

gi.add_nodes(4)
gi.add_edge(0, 1)
eid = gi.edge_id(0, 1).tolist()
eid = gi.edge_id(0, 1).tonumpy()
assert len(eid) == 1
assert eid[0] == 0

10 changes: 5 additions & 5 deletions tests/mxnet/test_graph_index.py
@@ -62,13 +62,13 @@ def check_basics(g, ig):

for u in randv.asnumpy():
for v in randv.asnumpy():
if len(g.edge_id(u, v).tolist()) == 1:
assert g.edge_id(u, v).tolist() == ig.edge_id(u, v).tolist()
if len(g.edge_id(u, v)) == 1:
assert g.edge_id(u, v).tonumpy() == ig.edge_id(u, v).tonumpy()
assert g.has_edge_between(u, v) == ig.has_edge_between(u, v)
randv = utils.toindex(randv)
ids = g.edge_ids(randv, randv)[2].tolist()
assert sum(ig.edge_ids(randv, randv)[2].tolist() == ids) == len(ids)
assert sum(g.has_edges_between(randv, randv).tolist() == ig.has_edges_between(randv, randv).tolist()) == len(randv)
ids = g.edge_ids(randv, randv)[2].tonumpy()
assert sum(ig.edge_ids(randv, randv)[2].tonumpy() == ids) == len(ids)
assert sum(g.has_edges_between(randv, randv).tonumpy() == ig.has_edges_between(randv, randv).tonumpy()) == len(randv)


def test_basics():
2 changes: 1 addition & 1 deletion tests/pytorch/test_frame.py
@@ -208,7 +208,7 @@ def test_row3():
assert f.is_contiguous()
assert f.is_span_whole_column()
assert f.num_rows == N
del f[th.tensor([2, 3])]
del f[toindex(th.tensor([2, 3]))]
assert not f.is_contiguous()
assert not f.is_span_whole_column()
# delete is lazy: only reflect on the ref while the